diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index 528ce8709..6894372fe 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -77,7 +77,7 @@ maybe an issue for your problem already exists, and the discussion
might inform you of workarounds readily available.
We want to fix all the issues as soon as possible, but before fixing a
-bug we need to reproduce and confirm it. In order to reproduce bugs we
+bug, we need to reproduce and confirm it. In order to reproduce bugs, we
will systematically ask you to provide a minimal reproduction scenario
using the custom issue template.
@@ -90,15 +90,14 @@ and accept your changes.
* Update the documentation so all of your changes are reflected there.
* Adhere to [PEP 8](https://peps.python.org/pep-0008/) standards.
-* Use a maximum of 91 characters per line. Try to keep docstrings below
+* Use a maximum of 99 characters per line. Try to keep docstrings below
74 characters.
* Update the project unit tests to test your code changes as thoroughly
as possible.
* Make sure that your code is properly commented with docstrings and
comments explaining your rationale behind non-obvious coding practices.
* Run [isort](https://pycqa.github.io/isort/): `isort atom tests`.
-* Run [flake8](https://github.com/pycqa/flake8): `flake8 --show-source --statistics atom tests`.
-* Run [pydocstyle](https://www.pydocstyle.org/en/stable/): `pydocstyle atom tests`.
+* Run [ruff](https://docs.astral.sh/ruff/): `ruff check --fix atom tests`.
* Run [mypy](https://www.mypy-lang.org/): `mypy atom tests`.
If your contribution requires a new library dependency:
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 16552c06f..97fc37311 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,27 +1,25 @@
-ci:
- autoupdate_schedule: monthly
-
repos:
- repo: https://github.com/pycqa/isort
- rev: 5.11.4
+ rev: 5.12.0
hooks:
- id: isort
files: ^atom/.*\.py$|tests/.*\.py$
- - repo: https://github.com/pycqa/flake8
- rev: 6.0.0
+ - repo: https://github.com/astral-sh/ruff-pre-commit
+ rev: v0.1.7
hooks:
- - id: flake8
- additional_dependencies: [flake8-pyproject]
- files: ^atom/.*\.py$|tests/.*\.py$
- args: ["--show-source", "--statistics"]
+ - id: ruff
+        types_or: [python, pyi, jupyter]
+ args: ["--fix"]
+ files: ^atom/.*\.py$|tests/.*\.py$
- - repo: https://github.com/pycqa/pydocstyle
- rev: 6.3.0
+ - repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v4.5.0
hooks:
- - id: pydocstyle
- additional_dependencies: [tomli]
- files: ^atom/.*\.py$|tests/.*\.py$
+ - id: check-yaml
+ - id: end-of-file-fixer
+ - id: mixed-line-ending
+ - id: check-merge-conflict
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.7.1
diff --git a/atom/__init__.py b/atom/__init__.py
index 0cb76f6ae..4d88f9f34 100644
--- a/atom/__init__.py
+++ b/atom/__init__.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
diff --git a/atom/api.py b/atom/api.py
index db565c40e..efd6b30dd 100644
--- a/atom/api.py
+++ b/atom/api.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -244,21 +242,22 @@ class ATOMClassifier(ATOM):
`#!python device="gpu"` to use the GPU. Read more in the
[user guide][gpu-acceleration].
- engine: dict, default={"data": "numpy", "estimator": "sklearn"}
+ engine: dict or None, default=None
Execution engine to use for [data][data-acceleration] and
[estimators][estimator-acceleration]. The value should be a
dictionary with keys `data` and/or `estimator`, with their
- corresponding choice as values. Choose from:
+ corresponding choice as values. If None, the default values
+    are used. Choose from:
- "data":
- - "numpy"
+ - "numpy" (default)
- "pyarrow"
- "modin"
- "estimator":
- - "sklearn"
+ - "sklearn" (default)
- "sklearnex"
- "cuml"
@@ -356,7 +355,7 @@ def __init__(
holdout_size: Scalar | None = None,
n_jobs: NJobs = 1,
device: str = "cpu",
- engine: Engine = {"data": "numpy", "estimator": "sklearn"},
+ engine: Engine | None = None,
backend: Backend = "loky",
memory: Bool | str | Path | Memory = False,
verbose: Verbose = 0,
@@ -480,21 +479,22 @@ class ATOMForecaster(ATOM):
`#!python device="gpu"` to use the GPU. Read more in the
[user guide][gpu-acceleration].
- engine: dict, default={"data": "numpy", "estimator": "sklearn"}
+ engine: dict or None, default=None
Execution engine to use for [data][data-acceleration] and
[estimators][estimator-acceleration]. The value should be a
dictionary with keys `data` and/or `estimator`, with their
- corresponding choice as values. Choose from:
+ corresponding choice as values. If None, the default values
+    are used. Choose from:
- "data":
- - "numpy"
+ - "numpy" (default)
- "pyarrow"
- "modin"
- "estimator":
- - "sklearn"
+ - "sklearn" (default)
- "sklearnex"
- "cuml"
@@ -585,7 +585,7 @@ def __init__(
holdout_size: Scalar | None = None,
n_jobs: NJobs = 1,
device: str = "cpu",
- engine: Engine = {"data": "numpy", "estimator": "sklearn"},
+ engine: Engine | None = None,
backend: Backend = "loky",
memory: Bool | str | Path | Memory = False,
verbose: Verbose = 0,
@@ -719,21 +719,22 @@ class ATOMRegressor(ATOM):
`#!python device="gpu"` to use the GPU. Read more in the
[user guide][gpu-acceleration].
- engine: dict, default={"data": "numpy", "estimator": "sklearn"}
+ engine: dict or None, default=None
Execution engine to use for [data][data-acceleration] and
[estimators][estimator-acceleration]. The value should be a
dictionary with keys `data` and/or `estimator`, with their
- corresponding choice as values. Choose from:
+ corresponding choice as values. If None, the default values
+    are used. Choose from:
- "data":
- - "numpy"
+ - "numpy" (default)
- "pyarrow"
- "modin"
- "estimator":
- - "sklearn"
+ - "sklearn" (default)
- "sklearnex"
- "cuml"
@@ -830,7 +831,7 @@ def __init__(
holdout_size: Scalar | None = None,
n_jobs: NJobs = 1,
device: str = "cpu",
- engine: Engine = {"data": "numpy", "estimator": "sklearn"},
+ engine: Engine | None = None,
backend: Backend = "loky",
memory: Bool | str | Path | Memory = False,
verbose: Verbose = 0,
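Note: the `engine` parameter above moves from a mutable dict default to `engine=None`, with the defaults filled in later (see the setter in atom/basetransformer.py). A minimal sketch of the shared-mutable-default pitfall this avoids; `Runner` is a hypothetical stand-in, not ATOM's API:

```python
# Hypothetical sketch: why a mutable dict default is risky.
class Runner:
    def __init__(self, engine={"data": "numpy"}):  # one dict shared by ALL calls
        self.engine = engine

a = Runner()
a.engine["data"] = "modin"   # mutates the shared default dict
print(Runner().engine)       # {'data': 'modin'}: state leaked across instances

class SafeRunner:
    def __init__(self, engine=None):
        # Fresh dict per instance, mirroring the new engine=None convention
        self.engine = engine or {"data": "numpy", "estimator": "sklearn"}

b = SafeRunner()
b.engine["data"] = "modin"
print(SafeRunner().engine)   # {'data': 'numpy', 'estimator': 'sklearn'}
```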
diff --git a/atom/atom.py b/atom/atom.py
index 2ffc5e474..9fe1c685e 100644
--- a/atom/atom.py
+++ b/atom/atom.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -88,7 +86,8 @@ class ATOM(BaseRunner, ATOMPlot, metaclass=ABCMeta):
@property
@abstractmethod
- def _goal(self) -> Goal: ...
+ def _goal(self) -> Goal:
+ ...
def __init__(
self,
@@ -103,7 +102,7 @@ def __init__(
holdout_size: Scalar | None = None,
n_jobs: NJobs = 1,
device: str = "cpu",
- engine: Engine = {"data": "numpy", "estimator": "sklearn"},
+ engine: Engine | None = None,
backend: Backend = "loky",
memory: Bool | str | Path | Memory = False,
verbose: Verbose = 0,
@@ -152,7 +151,9 @@ def __init__(
elif self.backend != "loky":
self._log(
"Leaving n_jobs=1 ignores all parallelization. Set n_jobs>1 to make use "
- f"of the {self.backend} parallelization backend.", 1, severity="warning"
+ f"of the {self.backend} parallelization backend.",
+ 1,
+ severity="warning",
)
if "cpu" not in self.device.lower():
self._log(f"Device: {self.device}", 1)
@@ -265,8 +266,8 @@ def branch(self):
self._branches.branches.remove(current)
self._branches.current = self._branches[0].name
self._log(
- f"Branch {current} successfully deleted. "
- f"Switched to branch {self.branch.name}.", 1
+ f"Branch {current} successfully deleted. Switched to branch {self.branch.name}.",
+ 1,
)
@property
@@ -357,7 +358,7 @@ def outliers(self) -> pd.Series:
"""
if not is_sparse(self.X):
data = self.branch.train.select_dtypes(include=["number"])
- z_scores = (np.abs(stats.zscore(data.to_numpy(float, na_value=np.nan))) > 3)
+ z_scores = np.abs(stats.zscore(data.to_numpy(float, na_value=np.nan))) > 3
z_scores = pd.Series(z_scores.sum(axis=0), index=data.columns)
return z_scores[z_scores > 0]
@@ -372,7 +373,7 @@ def n_outliers(self) -> Int:
"""
if not is_sparse(self.X):
data = self.branch.train.select_dtypes(include=["number"])
- z_scores = (np.abs(stats.zscore(data.to_numpy(float, na_value=np.nan))) > 3)
+ z_scores = np.abs(stats.zscore(data.to_numpy(float, na_value=np.nan))) > 3
return z_scores.any(axis=1).sum()
raise AttributeError("This property is unavailable for sparse datasets.")
@@ -495,8 +496,8 @@ def distribution(
stat = stats.kstest(X, dist, args=param)
# Add as column to the dataframe
- df.at[(dist, "score"), col] = round(stat[0], 4)
- df.at[(dist, "p_value"), col] = round(stat[1], 4)
+ df.loc[(dist, "score"), col] = round(stat[0], 4)
+ df.loc[(dist, "p_value"), col] = round(stat[1], 4)
return df
@@ -687,7 +688,8 @@ def load(cls, filename: str | Path, data: tuple[Any, ...] | None = None) -> ATOM
        # Reassign the transformer attributes (warnings, random_state, etc...)
BaseTransformer.__init__(
- atom, **{x: getattr(atom, x) for x in BaseTransformer.attrs},
+ atom,
+ **{x: getattr(atom, x) for x in BaseTransformer.attrs},
)
if data is not None:
@@ -727,8 +729,8 @@ def load(cls, filename: str | Path, data: tuple[Any, ...] | None = None) -> ATOM
if atom._config.index is False:
branch._container = DataContainer(
data=(dataset := branch._container.data.reset_index(drop=True)),
- train_idx=dataset.index[:len(branch._container.train_idx)],
- test_idx=dataset.index[-len(branch._container.test_idx):],
+ train_idx=dataset.index[: len(branch._container.train_idx)],
+ test_idx=dataset.index[-len(branch._container.test_idx) :],
n_cols=branch._container.n_cols,
)
@@ -741,7 +743,7 @@ def load(cls, filename: str | Path, data: tuple[Any, ...] | None = None) -> ATOM
return atom
@composed(crash, method_to_log)
- def reset(self, hard: Bool = False):
+ def reset(self, *, hard: Bool = False):
"""Reset the instance to it's initial state.
Deletes all branches and models. The dataset is also reset
@@ -970,7 +972,8 @@ def stats(self, _vb: Int = -2, /):
duplicates = None
self._log(
"Unable to calculate the number of duplicate "
- "rows because a column is unhashable.", 3
+ "rows because a column is unhashable.",
+ 3,
)
if not self.X.empty:
@@ -1082,6 +1085,7 @@ def _prepare_kwargs(
def _add_transformer(
self,
transformer: T_Transformer,
+ *,
columns: ColumnSelector | None = None,
train_only: Bool = False,
**fit_params,
@@ -1146,7 +1150,9 @@ def _add_transformer(
"Features and target columns passed to transformer "
f"{transformer_c.__class__.__name__}. Either select features or "
"the target column, not both at the same time. The transformation "
- "of the target column will be ignored.", 1, severity="warning"
+ "of the target column will be ignored.",
+ 1,
+ severity="warning",
)
transformer_c._cols = inc
@@ -1202,8 +1208,8 @@ def _add_transformer(
if self._config.index is False:
self.branch._container = DataContainer(
data=(data := self.dataset.reset_index(drop=True)),
- train_idx=data.index[:len(self.branch._data.train_idx)],
- test_idx=data.index[-len(self.branch._data.test_idx):],
+ train_idx=data.index[: len(self.branch._data.train_idx)],
+ test_idx=data.index[-len(self.branch._data.test_idx) :],
n_cols=self.branch._data.n_cols,
)
if self.branch._holdout is not None:
@@ -1307,13 +1313,22 @@ def add(
"""
if isinstance(transformer, SkPipeline):
# Recursively add all transformers to the pipeline
- for name, est in transformer.named_steps.items():
+ for est in transformer.named_steps.values():
self._log(f"Adding {est.__class__.__name__} to the pipeline...", 1)
- self._add_transformer(est, columns, train_only, **fit_params)
+ self._add_transformer(
+ transformer=est,
+ columns=columns,
+ train_only=train_only,
+ **fit_params,
+ )
else:
- self._log(
- f"Adding {transformer.__class__.__name__} to the pipeline...", 1)
- self._add_transformer(transformer, columns, train_only, **fit_params)
+ self._log(f"Adding {transformer.__class__.__name__} to the pipeline...", 1)
+ self._add_transformer(
+ transformer=transformer,
+ columns=columns,
+ train_only=train_only,
+ **fit_params,
+ )
@composed(crash, method_to_log)
def apply(
@@ -1640,6 +1655,7 @@ def prune(
def scale(
self,
strategy: ScalerStrats = "standard",
+ *,
include_binary: Bool = False,
**kwargs,
):
@@ -2036,7 +2052,8 @@ def _run(self, trainer: BaseRunner):
self._delete_models(model.name)
self._log(
f"Consecutive runs of model {model.name}. "
- "The former model has been overwritten.", 1
+ "The former model has been overwritten.",
+ 1,
)
self._models.extend(trainer._models)
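Note: several signatures in this file gain a bare `*` (e.g. `reset(self, *, hard=False)`), which makes the flags that follow keyword-only. A small sketch of the calling convention, with a hypothetical `demo_reset` standing in for the real methods:

```python
# Hypothetical sketch of the keyword-only convention added by the bare `*`.
def demo_reset(*, hard: bool = False) -> str:
    return "hard reset" if hard else "soft reset"

print(demo_reset(hard=True))  # OK: the flag must be named at the call site

try:
    demo_reset(True)  # positional call is now rejected
except TypeError as ex:
    print(ex)  # demo_reset() takes 0 positional arguments but 1 was given
```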
diff --git a/atom/basemodel.py b/atom/basemodel.py
index eb71acbed..980fcf314 100644
--- a/atom/basemodel.py
+++ b/atom/basemodel.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -116,21 +114,22 @@ class BaseModel(RunnerPlot):
`#!python device="gpu"` to use the GPU. Read more in the
[user guide][gpu-acceleration].
- engine: dict, default={"data": "numpy", "estimator": "sklearn"}
+ engine: dict or None, default=None
Execution engine to use for [data][data-acceleration] and
[estimators][estimator-acceleration]. The value should be a
dictionary with keys `data` and/or `estimator`, with their
- corresponding choice as values. Choose from:
+ corresponding choice as values. If None, the default values
+    are used. Choose from:
- "data":
- - "numpy"
+ - "numpy" (default)
- "pyarrow"
- "modin"
- "estimator":
- - "sklearn"
+ - "sklearn" (default)
- "sklearnex"
- "cuml"
@@ -192,9 +191,10 @@ def __init__(
config: DataConfig | None = None,
branches: BranchManager | None = None,
metric: ClassMap | None = None,
+ *,
n_jobs: NJobs = 1,
device: str = "cpu",
- engine: Engine = {"data": "numpy", "estimator": "sklearn"},
+ engine: Engine | None = None,
backend: Backend = "loky",
memory: Bool | str | Path | Memory = False,
verbose: Verbose = 0,
@@ -270,9 +270,7 @@ def __getattr__(self, item: str) -> Any:
elif item in DF_ATTRS:
return getattr(self.branch.dataset, item) # Get attr from dataset
- raise AttributeError(
- f"'{self.__class__.__name__}' object has no attribute '{item}'."
- )
+ raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{item}'.")
def __contains__(self, item: str) -> bool:
"""Whether the item is a column in the dataset."""
@@ -539,7 +537,7 @@ def _fit_estimator(
trial.params[self.has_validation] = f"{step}/{steps}"
trial.set_user_attr("estimator", estimator)
- raise TrialPruned()
+ raise TrialPruned
else:
# Add the forecasting horizon to sktime estimators
@@ -592,10 +590,7 @@ def _final_output(self) -> str:
try:
if self._bootstrap is None:
out = " ".join(
- [
- f"{met}: {rnd(self._best_score(met))}"
- for met in self._metric.keys()
- ]
+ [f"{met}: {rnd(self._best_score(met))}" for met in self._metric.keys()]
)
else:
out = " ".join(
@@ -762,7 +757,10 @@ def _score_from_pred(
# Forecasting models can have first prediction NaN
if self.task.is_forecast and all(x.isna()[0] for x in get_cols(y_pred)):
- y_true, y_pred, = y_true.iloc[1:], y_pred.iloc[1:]
+            y_true, y_pred = y_true.iloc[1:], y_pred.iloc[1:]
if self.task is Task.multiclass_multioutput_classification:
# Get the mean of the scores over the target columns
@@ -842,7 +840,7 @@ def _get_score(
return result
@composed(crash, method_to_log, beartype)
- def hyperparameter_tuning(self, n_trials: Int, reset: Bool = False):
+ def hyperparameter_tuning(self, n_trials: Int, *, reset: Bool = False):
"""Run the hyperparameter tuning algorithm.
Search for the best combination of hyperparameters. The function
@@ -962,7 +960,8 @@ def fit_model(
estimator = self._get_est(self._est_params | self._trial_to_est(params))
# Check if the same parameters have already been evaluated
- for t in trial.study.get_trials(False, states=(TrialState.COMPLETE,))[::-1]:
+ past_t = trial.study.get_trials(deepcopy=False, states=(TrialState.COMPLETE,))
+ for t in past_t[::-1]:
if trial.params == t.params:
# Get same estimator and score as previous evaluation
estimator = deepcopy(t.user_attrs["estimator"])
@@ -1049,20 +1048,15 @@ def fit_model(
)
elif exc:
# If distributions were excluded with `!`, select all but those
- self._ht["distributions"] = {
- k: v for k, v in dist.items() if k not in exc
- }
+ self._ht["distributions"] = {k: v for k, v in dist.items() if k not in exc}
elif inc:
- self._ht["distributions"] = {
- k: v for k, v in dist.items() if k in inc
- }
+ self._ht["distributions"] = {k: v for k, v in dist.items() if k in inc}
else:
self._ht["distributions"] = dist
# Drop hyperparameter if already defined in est_params
self._ht["distributions"] = {
- k: v for k, v in self._ht["distributions"].items()
- if k not in self._est_params
+ k: v for k, v in self._ht["distributions"].items() if k not in self._est_params
}
# If no hyperparameters to optimize, skip ht
@@ -1071,9 +1065,7 @@ def fit_model(
return
if not self._study or reset:
- kw: dict[str, Any] = {
- k: v for k, v in self._ht.items() if k in sign(create_study)
- }
+ kw: dict[str, Any] = {k: v for k, v in self._ht.items() if k in sign(create_study)}
if len(self._metric) == 1:
kw["direction"] = "maximize"
@@ -1097,7 +1089,7 @@ def fit_model(
else:
plot_callback = None
- callbacks = kw.pop("callbacks", []) + [TrialsCallback(self, n_jobs)]
+ callbacks = [*kw.pop("callbacks", []), TrialsCallback(self, n_jobs)]
callbacks += [plot_callback] if plot_callback else []
self._study.optimize(
@@ -1113,7 +1105,9 @@ def fit_model(
self._study = None
self._log(
"The study didn't complete any trial successfully. "
- "Skipping hyperparameter tuning.", 1, severity="warning"
+ "Skipping hyperparameter tuning.",
+ 1,
+ severity="warning",
)
return
@@ -1197,10 +1191,7 @@ def fit(self, X: DataFrame | None = None, y: Pandas | None = None):
# Mlflow only accepts params with char length <250
mlflow.log_params(
- {
- k: v for k, v in self.estimator.get_params().items()
- if len(str(v)) <= 250
- }
+ {k: v for k, v in self.estimator.get_params().items() if len(str(v)) <= 250}
)
# Save evals for models with in-training validation
@@ -1241,7 +1232,7 @@ def fit(self, X: DataFrame | None = None, y: Pandas | None = None):
)
@composed(crash, method_to_log, beartype)
- def bootstrapping(self, n_bootstrap: Int, reset: Bool = False):
+ def bootstrapping(self, n_bootstrap: Int, *, reset: Bool = False):
"""Apply a bootstrap algorithm.
Take bootstrapped samples from the training set and test them
@@ -1292,8 +1283,7 @@ def bootstrapping(self, n_bootstrap: Int, reset: Bool = False):
self._log(f"Bootstrap {'-' * 39}", 1)
out = [
- f"{m.name}: {rnd(self.bootstrap.mean()[i])}"
- f" \u00B1 {rnd(self.bootstrap.std()[i])}"
+ f"{m.name}: {rnd(self.bootstrap.mean()[i])} \u00B1 {rnd(self.bootstrap.std()[i])}"
for i, m in enumerate(self._metric)
]
self._log(f"Evaluation --> {' '.join(out)}", 1)
@@ -1321,7 +1311,7 @@ def name(self, value: str):
"""Change the model's name."""
# Drop the acronym if provided by the user
if re.match(f"{self.acronym}_", value, re.I):
- value = value[len(self.acronym) + 1:]
+ value = value[len(self.acronym) + 1 :]
# Add the acronym in front (with right capitalization)
self._name = f"{self.acronym}{f'_{value}' if value else ''}"
@@ -1529,7 +1519,7 @@ def results(self) -> pd.Series:
if self._study is not None:
for met in self._metric.keys():
data[f"{met}_ht"] = self.trials.loc[self.best_trial.number, met]
- data["time_ht"] = self.trials.iat[-1, -2]
+ data["time_ht"] = self.trials.iloc[-1, -2]
for met in self._metric:
for ds in ("train", "test"):
data[f"{met.name}_{ds}"] = self._get_score(met, ds)
@@ -1610,7 +1600,7 @@ def pipeline(self) -> Pipeline:
"""
if self.scaler:
return Pipeline(
- steps=self.branch.pipeline.steps + [("AutomatedScaler", self.scaler)],
+ steps=[*self.branch.pipeline.steps, ("AutomatedScaler", self.scaler)],
memory=self.memory,
)
else:
@@ -1637,8 +1627,8 @@ def holdout(self) -> DataFrame | None:
if (holdout := self.branch.holdout) is not None:
if self.scaler:
return merge(
- self.scaler.transform(holdout.iloc[:, :-self.branch._data.n_cols]),
- holdout.iloc[:, -self.branch._data.n_cols:],
+ self.scaler.transform(holdout.iloc[:, : -self.branch._data.n_cols]),
+ holdout.iloc[:, -self.branch._data.n_cols :],
)
else:
return holdout
@@ -1659,14 +1649,14 @@ def y(self) -> Pandas:
def X_train(self) -> DataFrame:
"""Features of the training set."""
if self.scaler:
- return self.scaler.transform(self.branch.X_train[-self._train_idx:])
+ return self.scaler.transform(self.branch.X_train[-self._train_idx :])
else:
- return self.branch.X_train[-self._train_idx:]
+ return self.branch.X_train[-self._train_idx :]
@property
def y_train(self) -> Pandas:
"""Target column of the training set."""
- return self.branch.y_train[-self._train_idx:]
+ return self.branch.y_train[-self._train_idx :]
@property
def X_test(self) -> DataFrame:
@@ -1680,7 +1670,7 @@ def X_test(self) -> DataFrame:
def X_holdout(self) -> DataFrame | None:
"""Features of the holdout set."""
if self.holdout is not None:
- return self.holdout.iloc[:, :-self.branch._data.n_cols]
+ return self.holdout.iloc[:, : -self.branch._data.n_cols]
else:
return None
@@ -1828,9 +1818,7 @@ def inference(*X) -> Scalar | str | list[Scalar | str]:
**{k: v for k, v in kwargs.items() if k in sign(Interface)},
)
- self.app.launch(
- **{k: v for k, v in kwargs.items() if k in sign(Interface.launch)}
- )
+ self.app.launch(**{k: v for k, v in kwargs.items() if k in sign(Interface.launch)})
@available_if(has_task("!multioutput"))
@composed(crash, method_to_log, beartype)
@@ -1898,7 +1886,7 @@ def create_dashboard(
# Explainer expects a list of np.array with shap values for each class
exp.values = list(np.moveaxis(exp.values, -1, 0))
- params = dict(permutation_metric=self._metric, n_jobs=self.n_jobs)
+ params = {"permutation_metric": self._metric, "n_jobs": self.n_jobs}
if self.task.is_classification:
explainer = ClassifierExplainer(self.estimator, X, y, **params)
else:
@@ -2232,6 +2220,7 @@ def register(
self,
name: str | None = None,
stage: Stages = "None",
+ *,
archive_existing_versions: Bool = False,
):
"""Register the model in [mlflow's model registry][registry].
@@ -2436,7 +2425,8 @@ def _prediction(
sample_weight: Sequence[Scalar] | None = ...,
verbose: Int | None = ...,
method: Literal["score"] = ...,
- ) -> Float: ...
+ ) -> Float:
+ ...
@overload
def _prediction(
@@ -2447,7 +2437,8 @@ def _prediction(
sample_weight: Sequence[Scalar] | None = ...,
verbose: Int | None = ...,
method: PredictionMethods = ...,
- ) -> Pandas: ...
+ ) -> Pandas:
+ ...
def _prediction(
self,
@@ -2556,7 +2547,7 @@ def assign_prediction_columns() -> list[str]:
if self.scaler:
Xt = self.scaler.transform(Xt)
- except Exception:
+ except Exception: # noqa: BLE001
Xt, yt = get_transform_X_y(X, y)
if method != "score":
@@ -2832,7 +2823,8 @@ def _prediction(
verbose: Int | None = None,
method: Literal["score"] = ...,
**kwargs,
- ) -> Float: ...
+ ) -> Float:
+ ...
@overload
def _prediction(
@@ -2843,7 +2835,8 @@ def _prediction(
verbose: Int | None = None,
method: PredictionMethodsTS = ...,
**kwargs,
- ) -> Pandas: ...
+ ) -> Pandas:
+ ...
def _prediction(
self,
@@ -3049,7 +3042,7 @@ def predict_quantiles(
fh: FHSelector,
X: XSelector | None = None,
*,
- alpha: Float | list[Float] = [0.05, 0.95],
+ alpha: Float | Sequence[Float] = (0.05, 0.95),
verbose: Int | None = None,
) -> DataFrame:
"""Get probabilistic forecasts on new data or existing rows.
@@ -3069,7 +3062,7 @@ def predict_quantiles(
X: hashable, segment, sequence, dataframe-like or None, default=None
Exogenous time series corresponding to `fh`.
- alpha: float or list of float, default=[0.05, 0.95]
+ alpha: float or sequence, default=(0.05, 0.95)
        A probability or sequence of probabilities at which quantile
        forecasts are computed.
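Note: the `@overload` stubs above (with `...` bodies) let mypy narrow `_prediction`'s return type on the `method` literal. A self-contained sketch of the pattern with hypothetical names:

```python
# Hypothetical sketch of Literal-based overloads: the checker picks the
# return type from the value of `method`; only the last def runs at runtime.
from typing import Literal, overload

@overload
def predict(method: Literal["score"]) -> float:
    ...

@overload
def predict(method: Literal["predict"]) -> list[float]:
    ...

def predict(method: str) -> float | list[float]:
    return 0.5 if method == "score" else [0.1, 0.9]

score: float = predict("score")          # mypy infers float
preds: list[float] = predict("predict")  # mypy infers list[float]
print(score, preds)
```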
diff --git a/atom/baserunner.py b/atom/baserunner.py
index f25deba16..b776749ac 100644
--- a/atom/baserunner.py
+++ b/atom/baserunner.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -67,7 +65,9 @@ def __setstate__(self, state: dict[str, Any]):
self._log(
f"The loaded instance used the {key} package with version "
f"{versions[key]} while the version in this environment is "
- f"{value}.", 1, severity="warning"
+ f"{value}.",
+ 1,
+ severity="warning",
)
def __getattr__(self, item: str) -> Any:
@@ -83,9 +83,7 @@ def __getattr__(self, item: str) -> Any:
elif item in DF_ATTRS:
return getattr(self.branch.dataset, item) # Get attr from dataset
else:
- raise AttributeError(
- f"'{self.__class__.__name__}' object has no attribute '{item}'."
- )
+ raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{item}'.")
def __setattr__(self, item: str, value: Any):
"""Set attr to branch when it's a property of Branch."""
@@ -192,9 +190,7 @@ def winners(self) -> list[Model] | None:
"""
if self._models: # Returns None if not fitted
- return sorted(
- self._models, key=lambda x: (x._best_score(), x._time_fit), reverse=True
- )
+ return sorted(self._models, key=lambda x: (x._best_score(), x._time_fit), reverse=True)
else:
return None
@@ -485,9 +481,9 @@ def _no_data_sets(
complete_set = self._set_index(bk.concat([train, test, holdout]), y)
container = DataContainer(
- data=(data := complete_set.iloc[:len(data)]),
- train_idx=data.index[:-len(test)],
- test_idx=data.index[-len(test):],
+ data=(data := complete_set.iloc[: len(data)]),
+ train_idx=data.index[: -len(test)],
+ test_idx=data.index[-len(test) :],
n_cols=len(get_cols(y)),
)
@@ -499,12 +495,12 @@ def _no_data_sets(
"columns, which results in a least populated class that has only "
"one member. Either select only one column to stratify over, or "
"set the parameter stratify=False."
- )
+ ) from ex
else:
raise ex
if holdout is not None:
- holdout = complete_set.iloc[len(data):]
+ holdout = complete_set.iloc[len(data) :]
return container, holdout
@@ -590,22 +586,22 @@ def _has_data_sets(
f"index ({len(self._config.index)}) doesn't match "
f"that of the data sets ({len_data})."
)
- train.index = self._config.index[:len(train)]
- test.index = self._config.index[len(train):len(train) + len(test)]
+ train.index = self._config.index[: len(train)]
+ test.index = self._config.index[len(train) : len(train) + len(test)]
if holdout is not None:
- holdout.index = self._config.index[-len(holdout):]
+ holdout.index = self._config.index[-len(holdout) :]
complete_set = self._set_index(bk.concat([train, test, holdout]), y_test)
container = DataContainer(
- data=(data := complete_set.iloc[:len(train) + len(test)]),
- train_idx=data.index[:len(train)],
- test_idx=data.index[-len(test):],
+ data=(data := complete_set.iloc[: len(train) + len(test)]),
+ train_idx=data.index[: len(train)],
+ test_idx=data.index[-len(test) :],
n_cols=len(get_cols(y_train)),
)
if holdout is not None:
- holdout = complete_set.iloc[len(train) + len(test):]
+ holdout = complete_set.iloc[len(train) + len(test) :]
return container, holdout
@@ -624,61 +620,59 @@ def _has_data_sets(
return self.branch._data, self.branch._holdout
elif len(arrays) == 1:
- # arrays=(X,) or arrays=(y,) for forecasting
+ # X or y for forecasting
sets = _no_data_sets(*self._check_input(arrays[0], y=y))
elif len(arrays) == 2:
if isinstance(arrays[0], tuple) and len(arrays[0]) == len(arrays[1]) == 2:
- # arrays=((X_train, y_train), (X_test, y_test))
+ # (X_train, y_train), (X_test, y_test)
X_train, y_train = self._check_input(arrays[0][0], arrays[0][1])
X_test, y_test = self._check_input(arrays[1][0], arrays[1][1])
sets = _has_data_sets(X_train, y_train, X_test, y_test)
elif isinstance(arrays[1], (*int_t, str)) or n_cols(arrays[1]) == 1:
if not self._goal.name == "forecast":
- # arrays=(X, y)
+ # X, y
sets = _no_data_sets(*self._check_input(arrays[0], arrays[1]))
else:
- # arrays=(train, test) for forecast
+ # train, test for forecast
X_train, y_train = self._check_input(y=arrays[0])
X_test, y_test = self._check_input(y=arrays[1])
sets = _has_data_sets(X_train, y_train, X_test, y_test)
else:
- # arrays=(train, test)
+ # train, test
X_train, y_train = self._check_input(arrays[0], y=y)
X_test, y_test = self._check_input(arrays[1], y=y)
sets = _has_data_sets(X_train, y_train, X_test, y_test)
elif len(arrays) == 3:
if len(arrays[0]) == len(arrays[1]) == len(arrays[2]) == 2:
- # arrays=((X_train, y_train), (X_test, y_test), (X_holdout, y_holdout))
+ # (X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)
X_train, y_train = self._check_input(arrays[0][0], arrays[0][1])
X_test, y_test = self._check_input(arrays[1][0], arrays[1][1])
X_hold, y_hold = self._check_input(arrays[2][0], arrays[2][1])
sets = _has_data_sets(X_train, y_train, X_test, y_test, X_hold, y_hold)
else:
- # arrays=(train, test, holdout)
+ # train, test, holdout
X_train, y_train = self._check_input(arrays[0], y=y)
X_test, y_test = self._check_input(arrays[1], y=y)
X_hold, y_hold = self._check_input(arrays[2], y=y)
sets = _has_data_sets(X_train, y_train, X_test, y_test, X_hold, y_hold)
elif len(arrays) == 4:
- # arrays=(X_train, X_test, y_train, y_test)
+ # X_train, X_test, y_train, y_test
X_train, y_train = self._check_input(arrays[0], arrays[2])
X_test, y_test = self._check_input(arrays[1], arrays[3])
sets = _has_data_sets(X_train, y_train, X_test, y_test)
elif len(arrays) == 6:
- # arrays=(X_train, X_test, X_holdout, y_train, y_test, y_holdout)
+ # X_train, X_test, X_holdout, y_train, y_test, y_holdout
X_train, y_train = self._check_input(arrays[0], arrays[3])
X_test, y_test = self._check_input(arrays[1], arrays[4])
X_hold, y_hold = self._check_input(arrays[2], arrays[5])
sets = _has_data_sets(X_train, y_train, X_test, y_test, X_hold, y_hold)
else:
- raise ValueError(
- "Invalid data arrays. See the documentation for the allowed formats."
- )
+ raise ValueError("Invalid data arrays. See the documentation for the allowed formats.")
if self._goal.name == "forecast":
# For forecasting, check if index complies with sktime's standard
@@ -704,6 +698,7 @@ def _has_data_sets(
def _get_models(
self,
models: ModelsSelector = None,
+ *,
ensembles: Bool = True,
branch: Branch | None = None,
) -> list[Model]:
@@ -748,7 +743,7 @@ def _get_models(
raise IndexError(
f"Invalid value for the models parameter. Value {model} is "
f"out of range. There are {len(self._models)} models."
- )
+ ) from None
elif isinstance(model, str):
for mdl in model.split("+"):
array = inc
@@ -856,7 +851,7 @@ def available_models(self) -> pd.DataFrame:
"native_multilabel": m.native_multilabel,
"native_multioutput": m.native_multioutput,
"has_validation": bool(m.has_validation),
- "supports_engines": ", ". join(m.supports_engines),
+ "supports_engines": ", ".join(m.supports_engines),
}
)
@@ -946,18 +941,12 @@ def evaluate(
"""
check_is_fitted(self, attributes="_models")
- evaluations = []
- for m in self._models:
- evaluations.append(
- m.evaluate(
- metric=metric,
- rows=rows,
- threshold=threshold,
- sample_weight=sample_weight,
- )
- )
-
- return pd.DataFrame(evaluations)
+ return pd.DataFrame(
+ [
+ m.evaluate(metric, rows, threshold=threshold, sample_weight=sample_weight)
+ for m in self._models
+ ]
+ )
@composed(crash, beartype)
def export_pipeline(self, model: str | Model | None = None) -> Pipeline:
@@ -1146,11 +1135,11 @@ def save(self, filename: str | Path = "auto", *, save_data: Bool = True):
if (og := self._branches.og).name not in self._branches:
self._branches._og._container = None
for branch in self._branches:
- data[branch.name] = dict(
- _data=deepcopy(branch._container),
- _holdout=deepcopy(branch._holdout),
- holdout=branch.__dict__.pop("holdout", None) # Clear cached holdout
- )
+ data[branch.name] = {
+ "_data": deepcopy(branch._container),
+ "_holdout": deepcopy(branch._holdout),
+ "holdout": branch.__dict__.pop("holdout", None), # Clear cached holdout
+ }
branch._container = None
branch._holdout = None
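Note: many re-raises in this file now chain explicitly: `raise ... from ex` preserves the original cause, while `raise ... from None` suppresses the noisy "During handling of the above exception..." context for user-facing errors. A minimal sketch:

```python
# Minimal sketch of explicit exception chaining.
def get_model(models: list[str], index: int) -> str:
    try:
        return models[index]
    except IndexError:
        # `from None` hides the internal IndexError so the user only
        # sees the descriptive message, not the implementation detail.
        raise IndexError(
            f"Value {index} is out of range. There are {len(models)} models."
        ) from None

try:
    get_model(["LR", "RF"], 5)
except IndexError as ex:
    print(ex)                       # the descriptive message
    print(ex.__suppress_context__)  # True: chained context is suppressed
```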
diff --git a/atom/basetracker.py b/atom/basetracker.py
index ec9786e77..f7b17e810 100644
--- a/atom/basetracker.py
+++ b/atom/basetracker.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
diff --git a/atom/basetrainer.py b/atom/basetrainer.py
index d506f069b..c027c7c2a 100644
--- a/atom/basetrainer.py
+++ b/atom/basetrainer.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -45,9 +43,25 @@ class BaseTrainer(BaseRunner, RunnerPlot, metaclass=ABCMeta):
"""
def __init__(
- self, models, metric, est_params, n_trials, ht_params, n_bootstrap,
- parallel, errors, n_jobs, device, engine, backend, memory, verbose,
- warnings, logger, experiment, random_state,
+ self,
+ models,
+ metric,
+ est_params,
+ n_trials,
+ ht_params,
+ n_bootstrap,
+ parallel,
+ errors,
+ n_jobs,
+ device,
+ engine,
+ backend,
+ memory,
+ verbose,
+ warnings,
+ logger,
+ experiment,
+ random_state,
):
super().__init__(
n_jobs=n_jobs,
@@ -111,7 +125,7 @@ def _check_param(self, param: str, value: Any) -> dict:
"should be equal to the number of models, got len"
f"(models)={len(self._models)} and len({param})={len(value)}."
)
- return {k: v for k, v in zip(lst(self.models), value)}
+ return dict(zip(lst(self.models), value, strict=True))
elif not isinstance(value, dict):
return {k: value for k in lst(self.models)}
@@ -185,11 +199,14 @@ def _prepare_parameters(self):
f"Invalid value for the models parameter, got {m}. "
"Note that tags must be separated by an underscore. "
f"Available model are:\n"
- "\n".join([
- f" --> {m.__name__} ({m.acronym})"
- for m in MODELS if self._goal.name in m._estimators
- ])
- )
+            + "\n".join(
+ [
+ f" --> {m.__name__} ({m.acronym})"
+ for m in MODELS
+ if self._goal.name in m._estimators
+ ]
+ )
+ ) from None
# Check if libraries for non-sklearn models are available
dependencies = {
@@ -204,10 +221,7 @@ def _prepare_parameters(self):
# Check if the model supports the task
if self._goal.name not in cls._estimators:
# Forecast task can use regression models
- if (
- self._goal.name == "forecast"
- and "regression" in cls._estimators
- ):
+ if self._goal.name == "forecast" and "regression" in cls._estimators:
kwargs["goal"] = Goal.Regression
else:
raise ValueError(
@@ -239,7 +253,8 @@ def _prepare_parameters(self):
self._models = ClassMap(*inc)
else:
self._models = ClassMap(
- model(**kwargs) for model in MODELS
+ model(**kwargs)
+ for model in MODELS
if self._goal.name in model._estimators and model.acronym not in exc
)
@@ -348,7 +363,7 @@ def execute_model(m: Model) -> Model | None:
return m
- except Exception as ex:
+ except Exception as ex: # noqa: BLE001
self._log(f"\nException encountered while running the {m.name} model.", 1)
self._log("".join(traceback.format_tb(ex.__traceback__))[:-1], 3)
self._log(f"{ex.__class__.__name__}: {ex}", 1)
diff --git a/atom/basetransformer.py b/atom/basetransformer.py
index 0e8d32c3c..bec4cf08e 100644
--- a/atom/basetransformer.py
+++ b/atom/basetransformer.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -69,7 +67,7 @@ class BaseTransformer:
"""
- attrs = [
+ attrs = (
"n_jobs",
"device",
"engine",
@@ -80,7 +78,7 @@ class BaseTransformer:
"logger",
"experiment",
"random_state",
- ]
+ )
def __init__(self, **kwargs):
"""Update the properties with the provided kwargs."""
@@ -122,7 +120,10 @@ def engine(self) -> Engine:
@engine.setter
@beartype
- def engine(self, value: Engine):
+ def engine(self, value: Engine | None):
+ if value is None:
+ value = {"data": "numpy", "estimator": "sklearn"}
+
if value.get("data") == "modin" and not ray.is_initialized():
ray.init(
runtime_env={"env_vars": {"__MODIN_AUTOIMPORT_Pandas__": "1"}},
@@ -141,6 +142,7 @@ def engine(self, value: Engine):
)
else:
import sklearnex
+
sklearnex.set_config(self.device.lower() if self._gpu else "auto")
elif value.get("estimator") == "cuml":
if not find_spec("cuml"):
@@ -150,10 +152,12 @@ def engine(self, value: Engine):
)
else:
from cuml.common.device_selection import set_global_device_type
+
set_global_device_type("gpu" if self._gpu else "cpu")
# See https://github.com/rapidsai/cuml/issues/5564
from cuml.internals.memory_utils import set_global_output_type
+
set_global_output_type("numpy")
self._engine = value
@@ -272,7 +276,7 @@ def logger(self, value: str | Path | Logger | None):
fh.setFormatter(Formatter("%(asctime)s - %(levelname)s: %(message)s"))
# Redirect loggers to file handler
- for name in [logger.name] + external_loggers:
+ for name in [logger.name, *external_loggers]:
getLogger(name).addHandler(fh)
self._logger = logger
@@ -298,6 +302,7 @@ def experiment(self, value: str | None):
username = requests.get(
url="https://dagshub.com/api/v1/user",
auth=HTTPBearerAuth(token),
+ timeout=5,
).json()["username"]
if f"{username}/{value}" not in os.getenv("MLFLOW_TRACKING_URI", ""):
@@ -322,7 +327,7 @@ def random_state(self, value: IntLargerEqualZero | None):
value = int(value)
random.seed(value)
- np.random.seed(value)
+ np.random.seed(value) # noqa: NPY002
self._random_state = value
@property
@@ -343,7 +348,7 @@ def _device_id(self) -> int:
f"Invalid value for the device parameter. GPU device {value[-1]} "
"isn't understood. Use a single integer to denote a specific "
"device. Note that ATOM doesn't support multi-GPU training."
- )
+ ) from None
# Methods ====================================================== >>
@@ -404,7 +409,8 @@ def _check_input(
y: Literal[None],
columns: Axes,
name: Literal[None],
- ) -> tuple[DataFrame, None]: ...
+ ) -> tuple[DataFrame, None]:
+ ...
@staticmethod
@overload
@@ -413,7 +419,8 @@ def _check_input(
y: YSelector,
columns: Literal[None],
name: str | Sequence[str],
- ) -> tuple[None, Pandas]: ...
+ ) -> tuple[None, Pandas]:
+ ...
@staticmethod
@overload
@@ -422,7 +429,8 @@ def _check_input(
y: YSelector,
columns: Axes | None = ...,
name: str | Sequence[str] | None = ...,
- ) -> tuple[DataFrame, Pandas]: ...
+ ) -> tuple[DataFrame, Pandas]:
+ ...
@staticmethod
def _check_input(
@@ -499,7 +507,7 @@ def _check_input(
raise ValueError(
f"The features are different than seen at fit time. "
f"Features {set(Xt.columns) - set(columns)} are missing in X."
- )
+ ) from None
# Prepare target column
if isinstance(y, (dict, *sequence_t, *dataframe_t)):
@@ -532,7 +540,7 @@ def _check_input(
raise ValueError(
"X and y don't have the same number of rows,"
f" got len(X)={len(Xt)} and len(y)={len(y)}."
- )
+ ) from None
else:
yt = y
@@ -586,9 +594,9 @@ def _log(self, msg: str, level: Int = 0, severity: Severity = "info"):
if severity in ("error", "critical"):
raise UserWarning(msg)
elif severity == "warning":
- warnings.warn(msg, category=UserWarning)
+ warnings.warn(msg, category=UserWarning, stacklevel=2)
elif severity == "info" and self.verbose >= level:
- print(msg)
+ print(msg) # noqa: T201
if self.logger:
for text in str(msg).split("\n"):
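Note: the `engine` setter above is where `None` is resolved to the defaults, so `engine=None` and the old dict default behave the same. A reduced, hypothetical sketch of just that normalization step; the real setter also configures modin/sklearnex/cuml:

```python
# Reduced, hypothetical sketch of the None-to-defaults normalization.
DEFAULT_ENGINE = {"data": "numpy", "estimator": "sklearn"}

class Transformer:
    @property
    def engine(self) -> dict:
        return self._engine

    @engine.setter
    def engine(self, value: dict | None):
        if value is None:
            value = DEFAULT_ENGINE.copy()  # fresh copy, never shared
        self._engine = value

t = Transformer()
t.engine = None
print(t.engine)  # {'data': 'numpy', 'estimator': 'sklearn'}
t.engine = {"data": "pyarrow"}
print(t.engine)  # {'data': 'pyarrow'}
```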
diff --git a/atom/branch/__init__.py b/atom/branch/__init__.py
index e6bf178fa..dd6f3adc1 100644
--- a/atom/branch/__init__.py
+++ b/atom/branch/__init__.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
diff --git a/atom/branch/branch.py b/atom/branch/branch.py
index 3179b8084..68cee48b9 100644
--- a/atom/branch/branch.py
+++ b/atom/branch/branch.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -330,7 +328,7 @@ def holdout(self) -> DataFrame | None:
if self._holdout is not None:
return merge(
*self.pipeline.transform(
- X=self._holdout.iloc[:, :-self._data.n_cols],
+ X=self._holdout.iloc[:, : -self._data.n_cols],
y=self._holdout[self.target],
)
)
@@ -415,7 +413,7 @@ def n_columns(self) -> Int:
@property
def features(self) -> Index:
"""Name of the features."""
- return self.columns[:-self._data.n_cols]
+ return self.columns[: -self._data.n_cols]
@property
def n_features(self) -> Int:
@@ -425,7 +423,7 @@ def n_features(self) -> Int:
@property
def target(self) -> str | list[str]:
"""Name of the target column(s)."""
- return flt(list(self.columns[-self._data.n_cols:]))
+ return flt(list(self.columns[-self._data.n_cols :]))
@property
def _all(self) -> DataFrame:
@@ -443,19 +441,24 @@ def _all(self) -> DataFrame:
def _get_rows(
self,
rows: RowSelector,
+ *,
return_X_y: Literal[False] = ...,
- ) -> DataFrame: ...
+ ) -> DataFrame:
+ ...
@overload
def _get_rows(
self,
rows: RowSelector,
+ *,
return_X_y: Literal[True],
- ) -> tuple[DataFrame, Pandas]: ...
+ ) -> tuple[DataFrame, Pandas]:
+ ...
def _get_rows(
self,
rows: RowSelector,
+ *,
return_X_y: Bool = False,
) -> DataFrame | tuple[DataFrame, Pandas]:
"""Get a subset of the rows.
@@ -541,13 +544,14 @@ def _get_rows(
inc = list(_all.index[~_all.index.isin(exc)])
if return_X_y:
- return _all.loc[inc, self.features], _all.loc[inc, self.target] # type: ignore
+ return _all.loc[inc, self.features], _all.loc[inc, self.target] # type: ignore[index]
else:
return self._all.loc[inc]
def _get_columns(
self,
columns: ColumnSelector | None = None,
+ *,
include_target: Bool = True,
only_numerical: Bool = False,
) -> list[str]:
@@ -641,19 +645,24 @@ def _get_columns(
def _get_target(
self,
target: TargetsSelector,
+ *,
only_columns: Literal[False] = ...,
- ) -> tuple[int, int]: ...
+ ) -> tuple[int, int]:
+ ...
@overload
def _get_target(
self,
target: TargetsSelector,
+ *,
only_columns: Literal[True],
- ) -> str: ...
+ ) -> str:
+ ...
def _get_target(
self,
target: TargetsSelector,
+ *,
only_columns: Bool = False,
) -> str | tuple[int, int]:
"""Get a target column and/or class in target column.
@@ -735,7 +744,7 @@ def get_class(
raise ValueError(
f"Invalid value for the target parameter. Value {target} "
"not found in the mapping of the target column."
- )
+ ) from None
else:
n_classes = get_cols(self.y)[column].nunique(dropna=False)
if not 0 <= target < n_classes:
@@ -767,7 +776,7 @@ def get_class(
else:
return 0, get_class(target)
- def load(self, assign: Bool = True) -> DataContainer | None:
+ def load(self, *, assign: Bool = True) -> DataContainer | None:
"""Load the branch's data from memory.
This method is used to restore the data of inactive branches.
@@ -788,7 +797,7 @@ def load(self, assign: Bool = True) -> DataContainer | None:
with open(self._location.joinpath(f"{self}.pkl"), "rb") as file:
data = pickle.load(file)
except FileNotFoundError:
- raise FileNotFoundError(f"Branch {self.name} has no data stored.")
+ raise FileNotFoundError(f"Branch {self.name} has no data stored.") from None
if assign:
self._container = data
@@ -797,7 +806,7 @@ def load(self, assign: Bool = True) -> DataContainer | None:
return self._container
- def store(self, assign: Bool = True):
+ def store(self, *, assign: Bool = True):
"""Store the branch's data as a pickle in memory.
After storage, the data is deleted, and the branch is no longer
@@ -819,7 +828,9 @@ def store(self, assign: Bool = True):
with open(self._location.joinpath(f"{self}.pkl"), "wb") as file:
pickle.dump(self._container, file)
except FileNotFoundError:
- raise FileNotFoundError(f"The {self._location} directory does not exist.")
+ raise FileNotFoundError(
+ f"The {self._location} directory does not exist."
+ ) from None
if assign:
self._container = None
diff --git a/atom/branch/branchmanager.py b/atom/branch/branchmanager.py
index 2baeb7778..0d2a36f7d 100644
--- a/atom/branch/branchmanager.py
+++ b/atom/branch/branchmanager.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -120,7 +118,7 @@ def __getitem__(self, item: Int | str) -> Branch:
except KeyError:
raise IndexError(
f"This {self.__class__.__name__} instance has no branch {item}."
- )
+ ) from None
@property
def og(self) -> Branch:
@@ -168,18 +166,18 @@ def _copy_from_parent(branch: Branch, parent: Branch):
# Transfer data from memory to avoid having
# the datasets in memory twice at one time
parent.store()
- setattr(branch, "_container", parent.load(assign=False))
+ branch._container = parent.load(assign=False)
else:
# Copy the dataset in-memory
- setattr(branch, "_container", deepcopy(parent._container))
+ branch._container = deepcopy(parent._container)
# Deepcopy the pipeline but use the same estimators
- setattr(branch, "_pipeline", deepcopy(getattr(parent, "_pipeline")))
+ branch._pipeline = deepcopy(parent._pipeline)
for i, step in enumerate(parent._pipeline.steps):
branch.pipeline.steps[i] = step
# Copy mapping and assign other vars
- setattr(branch, "_mapping", copy(getattr(parent, "_mapping")))
+ branch._mapping = copy(parent._mapping)
for attr in vars(parent):
if not hasattr(branch, attr): # If not already assigned...
setattr(branch, attr, getattr(parent, attr))
@@ -229,7 +227,7 @@ def fill(self, data: DataContainer, holdout: DataFrame | None = None):
self.current._container = data
self.current._holdout = holdout
- def reset(self, hard: Bool = False):
+ def reset(self, *, hard: Bool = False):
"""Reset this instance to its initial state.
The initial state of the BranchManager contains a single branch
diff --git a/atom/data_cleaning.py b/atom/data_cleaning.py
index 8108985e6..fb77d60d8 100644
--- a/atom/data_cleaning.py
+++ b/atom/data_cleaning.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -378,29 +376,29 @@ def fit(self, X: DataFrame, y: Pandas = -1) -> Self:
else:
self.target_names_in_ = np.array([y.name])
- strategies = dict(
- # clustercentroids=ClusterCentroids, # Has no sample_indices_
- condensednearestneighbour=CondensedNearestNeighbour,
- editednearestneighborus=EditedNearestNeighbours,
- repeatededitednearestneighbours=RepeatedEditedNearestNeighbours,
- allknn=AllKNN,
- instancehardnessthreshold=InstanceHardnessThreshold,
- nearmiss=NearMiss,
- neighbourhoodcleaningrule=NeighbourhoodCleaningRule,
- onesidedselection=OneSidedSelection,
- randomundersampler=RandomUnderSampler,
- tomeklinks=TomekLinks,
- randomoversampler=RandomOverSampler,
- smote=SMOTE,
- smotenc=SMOTENC,
- smoten=SMOTEN,
- adasyn=ADASYN,
- borderlinesmote=BorderlineSMOTE,
- kmeanssmote=KMeansSMOTE,
- svmsmote=SVMSMOTE,
- smoteenn=SMOTEENN,
- smotetomek=SMOTETomek,
- )
+ strategies = {
+            # "clustercentroids": ClusterCentroids,  # noqa: ERA001 (has no sample_indices_)
+            "condensednearestneighbour": CondensedNearestNeighbour,
+            "editednearestneighbours": EditedNearestNeighbours,
+ "repeatededitednearestneighbours": RepeatedEditedNearestNeighbours,
+ "allknn": AllKNN,
+ "instancehardnessthreshold": InstanceHardnessThreshold,
+ "nearmiss": NearMiss,
+ "neighbourhoodcleaningrule": NeighbourhoodCleaningRule,
+ "onesidedselection": OneSidedSelection,
+ "randomundersampler": RandomUnderSampler,
+ "tomeklinks": TomekLinks,
+ "randomoversampler": RandomOverSampler,
+ "smote": SMOTE,
+ "smotenc": SMOTENC,
+ "smoten": SMOTEN,
+ "adasyn": ADASYN,
+ "borderlinesmote": BorderlineSMOTE,
+ "kmeanssmote": KMeansSMOTE,
+ "svmsmote": SVMSMOTE,
+ "smoteenn": SMOTEENN,
+ "smotetomek": SMOTETomek,
+ }
if isinstance(self.strategy, str):
if self.strategy.lower() not in strategies:
@@ -529,9 +527,9 @@ def log_changes(y):
]
# Select the new samples and assign the new indices
- X_new = X_new.iloc[-len(X_new) + len(o_samples):]
+ X_new = X_new.iloc[-len(X_new) + len(o_samples) :]
X_new.index = n_idx
- y_new = y_new.iloc[-len(y_new) + len(o_samples):]
+ y_new = y_new.iloc[-len(y_new) + len(o_samples) :]
y_new.index = n_idx
# First, output the samples created
@@ -706,7 +704,7 @@ def __init__(
drop_missing_target: Bool = True,
encode_target: Bool = True,
device: str = "cpu",
- engine: Engine = {"data": "numpy", "estimator": "sklearn"},
+ engine: Engine | None = None,
verbose: Verbose = 0,
logger: str | Path | Logger | None = None,
):
@@ -781,9 +779,7 @@ def fit(self, X: DataFrame | None = None, y: Pandas | None = None) -> Self:
elif list(uq := np.unique(col)) != list(range(col.nunique())):
LabelEncoder = self._get_est_class("LabelEncoder", "preprocessing")
self._estimators[col.name] = LabelEncoder().fit(col)
- self.mapping_.update(
- {col.name: {str(it(v)): i for i, v in enumerate(uq)}}
- )
+ self.mapping_.update({col.name: {str(it(v)): i for i, v in enumerate(uq)}})
return self
@@ -834,8 +830,8 @@ def transform(
# Drop features with an invalid data type
if dtype in lst(self.drop_dtypes):
self._log(
- f" --> Dropping feature {name} for "
- f"having a prohibited type: {dtype}.", 2
+ f" --> Dropping feature {name} for having a prohibited type: {dtype}.",
+ 2,
)
X = X.drop(columns=name)
continue
@@ -1134,7 +1130,7 @@ def __init__(
bins: Bins = 5,
labels: Sequence[str] | dict[str, Sequence[str]] | None = None,
device: str = "cpu",
- engine: Engine = {"data": "numpy", "estimator": "sklearn"},
+ engine: Engine | None = None,
verbose: Verbose = 0,
logger: str | Path | Logger | None = None,
random_state: IntLargerEqualZero | None = None,
@@ -1235,7 +1231,7 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]:
"Invalid value for the bins parameter. The length of the "
"bins does not match the length of the columns, got len"
f"(bins)={len(bins_c)} and len(columns)={Xt.shape[1]}."
- )
+ ) from None
else:
bins_x = bins_c
@@ -1267,7 +1263,7 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]:
"a sequence of bin edges is accepted when strategy='custom'."
)
else:
- bins_c = [-np.inf] + list(bins_c) + [np.inf]
+ bins_c = [-np.inf, *bins_c, np.inf]
FunctionTransformer = self._get_est_class(
name="FunctionTransformer",
@@ -1512,20 +1508,20 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
self._to_value = {}
self._categories = {}
- strategies = dict(
- backwarddifference=BackwardDifferenceEncoder,
- basen=BaseNEncoder,
- binary=BinaryEncoder,
- catboost=CatBoostEncoder,
- helmert=HelmertEncoder,
- jamesstein=JamesSteinEncoder,
- mestimate=MEstimateEncoder,
- ordinal=OrdinalEncoder,
- polynomial=PolynomialEncoder,
- sum=SumEncoder,
- target=TargetEncoder,
- woe=WOEEncoder,
- )
+ strategies = {
+ "backwarddifference": BackwardDifferenceEncoder,
+ "basen": BaseNEncoder,
+ "binary": BinaryEncoder,
+ "catboost": CatBoostEncoder,
+ "helmert": HelmertEncoder,
+ "jamesstein": JamesSteinEncoder,
+ "mestimate": MEstimateEncoder,
+ "ordinal": OrdinalEncoder,
+ "polynomial": PolynomialEncoder,
+ "sum": SumEncoder,
+ "target": TargetEncoder,
+ "woe": WOEEncoder,
+ }
if isinstance(self.strategy, str):
if self.strategy.lower().endswith("encoder"):
@@ -1578,8 +1574,9 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
self._log(
f" --> The number of classes passed to feature {name} in the "
f"ordinal parameter ({len(ordinal_c)}) don't match the number "
- f"of classes in the data ({column.nunique(dropna=True)}).", 1,
- severity="warning"
+ f"of classes in the data ({column.nunique(dropna=True)}).",
+ 1,
+ severity="warning",
)
# Create custom mapping from 0 to N - 1
@@ -1660,7 +1657,8 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
self._log(
f" --> {estimator.__class__.__name__[:-7]}-encoding feature "
- f"{name}. Contains {X[name].nunique()} classes.", 2
+ f"{name}. Contains {X[name].nunique()} classes.",
+ 2,
)
# Count the propagated missing values
@@ -1801,7 +1799,7 @@ class Imputer(TransformerMixin):
# Add some random missing values to the data
for i, j in zip(randint(0, X.shape[0], 600), randint(0, 4, 600)):
- X.iat[i, j] = np.NaN
+ X.iloc[i, j] = np.NaN
atom = ATOMClassifier(X, y, random_state=1)
print(atom.nans)
@@ -1841,7 +1839,7 @@ def __init__(
max_nan_cols: FloatLargerZero | None = None,
n_jobs: NJobs = 1,
device: str = "cpu",
- engine: Engine = {"data": "numpy", "estimator": "sklearn"},
+ engine: Engine | None = None,
verbose: Verbose = 0,
logger: str | Path | Logger | None = None,
random_state: IntLargerEqualZero | None = None,
@@ -2013,7 +2011,8 @@ def transform(
if diff := length - len(X):
self._log(
f" --> Dropping {diff} samples for containing more "
- f"than {self._max_nan_rows} missing values.", 2
+ f"than {self._max_nan_rows} missing values.",
+ 2,
)
if self.strat_num == "drop":
@@ -2022,7 +2021,8 @@ def transform(
if diff := length - len(X):
self._log(
f" --> Dropping {diff} samples for containing "
- f"missing values in numerical columns.", 2
+ f"missing values in numerical columns.",
+ 2,
)
if self.strat_cat == "drop":
@@ -2031,7 +2031,8 @@ def transform(
if diff := length - len(X):
self._log(
f" --> Dropping {diff} samples for containing "
- f"missing values in categorical columns.", 2
+ f"missing values in categorical columns.",
+ 2,
)
# Print imputation information per feature
@@ -2041,7 +2042,8 @@ def transform(
if name not in self._estimator.feature_names_in_:
self._log(
f" --> Dropping feature {name}. Contains {nans} "
- f"({nans * 100 // len(X)}%) missing values.", 2
+ f"({nans * 100 // len(X)}%) missing values.",
+ 2,
)
X = X.drop(columns=name)
continue
@@ -2049,30 +2051,35 @@ def transform(
if self.strat_num != "drop" and name in num_imputer.feature_names_in_:
if not isinstance(self.strat_num, str):
self._log(
- f" --> Imputing {nans} missing values with number "
- f"'{str(self.strat_num)}' in feature {name}.", 2
+ f" --> Imputing {nans} missing values with "
+ f"number '{self.strat_num}' in feature {name}.",
+ 2,
)
elif self.strat_num in ("knn", "iterative"):
self._log(
f" --> Imputing {nans} missing values using "
- f"the {self.strat_num} imputer in feature {name}.", 2
+ f"the {self.strat_num} imputer in feature {name}.",
+ 2,
)
elif self.strat_num != "drop": # mean, median or most_frequent
self._log(
f" --> Imputing {nans} missing values with {self.strat_num} "
f"({np.round(get_stat(num_imputer, name), 2)}) in feature "
- f"{name}.", 2
+ f"{name}.",
+ 2,
)
elif self.strat_cat != "drop" and name in cat_imputer.feature_names_in_:
if self.strat_cat == "most_frequent":
self._log(
f" --> Imputing {nans} missing values with most_frequent "
- f"({get_stat(cat_imputer, name)}) in feature {name}.", 2
+ f"({get_stat(cat_imputer, name)}) in feature {name}.",
+ 2,
)
elif self.strat_cat != "drop":
self._log(
f" --> Imputing {nans} missing values with value "
- f"'{self.strat_cat}' in feature {name}.", 2
+ f"'{self.strat_cat}' in feature {name}.",
+ 2,
)
X = self._estimator.transform(X)
@@ -2219,7 +2226,7 @@ def __init__(
strategy: NormalizerStrats = "yeojohnson",
*,
device: str = "cpu",
- engine: Engine = {"data": "numpy", "estimator": "sklearn"},
+ engine: Engine | None = None,
verbose: Verbose = 0,
logger: str | Path | Logger | None = None,
random_state: IntLargerEqualZero | None = None,
@@ -2253,11 +2260,11 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
Estimator instance.
"""
- strategies = dict(
- yeojohnson="PowerTransformer",
- boxcox="PowerTransformer",
- quantile="QuantileTransformer",
- )
+ strategies = {
+ "yeojohnson": "PowerTransformer",
+ "boxcox": "PowerTransformer",
+ "quantile": "QuantileTransformer",
+ }
if self.strategy in ("yeojohnson", "boxcox"):
estimator = self._get_est_class(strategies[self.strategy], "preprocessing")
@@ -2395,21 +2402,22 @@ class Pruner(TransformerMixin):
`#!python device="gpu"` to use the GPU. Read more in the
[user guide][gpu-acceleration].
- engine: dict, default={"data": "numpy", "estimator": "sklearn"}
+ engine: dict or None, default=None
Execution engine to use for [data][data-acceleration] and
[estimators][estimator-acceleration]. The value should be a
dictionary with keys `data` and/or `estimator`, with their
- corresponding choice as values. Choose from:
+ corresponding choice as values. If None, the default values
+    are used. Choose from:
- "data":
- - "numpy"
+ - "numpy" (default)
- "pyarrow"
- "modin"
- "estimator":
- - "sklearn"
+ - "sklearn" (default)
- "sklearnex"
- "cuml"
@@ -2494,7 +2502,7 @@ def __init__(
max_sigma: FloatLargerZero = 3,
include_target: Bool = False,
device: str = "cpu",
- engine: Engine = {"data": "numpy", "estimator": "sklearn"},
+ engine: Engine | None = None,
verbose: Verbose = 0,
logger: str | Path | Logger | None = None,
**kwargs,
@@ -2541,18 +2549,18 @@ def transform(
"""
# Estimators with their modules
- strategies = dict(
- iforest=["IsolationForest", "ensemble"],
- ee=["EllipticEnvelope", "covariance"],
- lof=["LocalOutlierFactor", "neighbors"],
- svm=["OneClassSVM", "svm"],
- dbscan=["DBSCAN", "cluster"],
- hdbscan=["HDBSCAN", "cluster"],
- optics=["OPTICS", "cluster"],
- )
+ strategies = {
+ "iforest": ["IsolationForest", "ensemble"],
+ "ee": ["EllipticEnvelope", "covariance"],
+ "lof": ["LocalOutlierFactor", "neighbors"],
+ "svm": ["OneClassSVM", "svm"],
+ "dbscan": ["DBSCAN", "cluster"],
+ "hdbscan": ["HDBSCAN", "cluster"],
+ "optics": ["OPTICS", "cluster"],
+ }
for strat in lst(self.strategy):
- if strat not in ["zscore"] + list(strategies):
+ if strat not in ["zscore", *strategies]:
raise ValueError(
"Invalid value for the strategy parameter. "
f"Choose from: zscore, {', '.join(strategies)}."
@@ -2591,8 +2599,8 @@ def transform(
cond = np.abs(z_scores) > self.max_sigma
objective = objective.mask(cond, self.method)
self._log(
- f" --> Replacing {cond.sum()} outlier "
- f"values with {self.method}.", 2
+ f" --> Replacing {cond.sum()} outlier values with {self.method}.",
+ 2,
)
elif self.method.lower() == "minmax":
@@ -2614,7 +2622,8 @@ def transform(
self._log(
f" --> Replacing {counts} outlier values "
- "with the min or max of the column.", 2
+ "with the min or max of the column.",
+ 2,
)
elif self.method.lower() == "drop":
@@ -2623,7 +2632,8 @@ def transform(
if len(lst(self.strategy)) > 1:
self._log(
f" --> The zscore strategy detected "
- f"{len(mask) - sum(mask)} outliers.", 2
+ f"{len(mask) - sum(mask)} outliers.",
+ 2,
)
else:
@@ -2633,7 +2643,8 @@ def transform(
if len(lst(self.strategy)) > 1:
self._log(
f" --> The {estimator.__class__.__name__} "
- f"detected {len(mask) - sum(mask)} outliers.", 2
+ f"detected {len(mask) - sum(mask)} outliers.",
+ 2,
)
# Add the estimator as attribute to the instance
@@ -2641,7 +2652,7 @@ def transform(
if outliers:
# Select outliers from intersection of strategies
- mask = [any([i for i in strats]) for strats in zip(*outliers)]
+ mask = [any(strats) for strats in zip(*outliers, strict=True)]
self._log(f" --> Dropping {len(mask) - sum(mask)} outliers.", 2)
# Keep only the non-outliers from the data
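
The `strict=True` flag added to `zip` (Python 3.10+, ruff's B905) turns a silent truncation into a ValueError when the per-strategy masks differ in length, which is the safer failure mode when intersecting them. For example:

a = [True, False, True]
b = [False, False]
print(list(zip(a, b)))  # [(True, False), (False, False)], third item dropped
# list(zip(a, b, strict=True)) raises ValueError: lengths differ
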
@@ -2775,10 +2786,10 @@ class Scaler(TransformerMixin):
def __init__(
self,
strategy: ScalerStrats = "standard",
- include_binary: Bool = False,
*,
+ include_binary: Bool = False,
device: str = "cpu",
- engine: Engine = {"data": "numpy", "estimator": "sklearn"},
+ engine: Engine | None = None,
verbose: Verbose = 0,
logger: str | Path | Logger | None = None,
**kwargs,
@@ -2811,12 +2822,12 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
if not self.include_binary:
num_cols = [c for c in num_cols if ~np.isin(X[c].unique(), [0, 1]).all()]
- strategies = dict(
- standard="StandardScaler",
- minmax="MinMaxScaler",
- maxabs="MaxAbsScaler",
- robust="RobustScaler",
- )
+ strategies = {
+ "standard": "StandardScaler",
+ "minmax": "MinMaxScaler",
+ "maxabs": "MaxAbsScaler",
+ "robust": "RobustScaler",
+ }
estimator = self._get_est_class(strategies[self.strategy], "preprocessing")
self._estimator = estimator(**self.kwargs)
diff --git a/atom/ensembles.py b/atom/ensembles.py
index 0aa6c7ed3..564b50339 100644
--- a/atom/ensembles.py
+++ b/atom/ensembles.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -41,7 +39,7 @@ def _get_fitted_attrs(self):
# Uses 'drop' as placeholder for dropped estimators
est_iter = iter(self.estimators_)
for name, est in self.estimators:
- if est == "drop" or check_is_fitted(est, False):
+ if est == "drop" or check_is_fitted(est, exception=False):
self.named_estimators_[name] = est
else:
self.named_estimators_[name] = next(est_iter)
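
Spelling the flag as `exception=False` instead of a bare positional `False` makes the call self-documenting. Assuming ATOM's helper follows the contract the diff implies (raise by default, return a bool when `exception=False`), a toy stand-in looks like:

def check_is_fitted(estimator, *, exception: bool = True) -> bool:
    """Toy stand-in: look for sklearn-style trailing-underscore attributes."""
    fitted = any(attr.endswith("_") for attr in vars(estimator))
    if not fitted and exception:
        raise RuntimeError("This estimator is not fitted yet.")
    return fitted
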
@@ -96,14 +94,14 @@ def fit(
message=self._log_message(names[idx], idx + 1, len(all_estimators)),
)
for idx, clf in enumerate(all_estimators)
- if clf != "drop" and not check_is_fitted(clf, False)
+ if clf != "drop" and not check_is_fitted(clf, exception=False)
)
self.estimators_ = []
estimators = iter(estimators)
for est in self.estimators:
if est[1] != "drop":
- if check_is_fitted(est[1], False):
+ if check_is_fitted(est[1], exception=False):
self.estimators_.append(est[1])
else:
self.estimators_.append(next(estimators))
@@ -156,14 +154,14 @@ def fit(
estimators = Parallel(n_jobs=self.n_jobs)(
delayed(_fit_single_estimator)(clone(clf), X, y, sample_weight)
for idx, clf in enumerate(all_estimators)
- if clf != "drop" and not check_is_fitted(clf, False)
+ if clf != "drop" and not check_is_fitted(clf, exception=False)
)
self.estimators_ = []
estimators = iter(estimators)
for est in self.estimators:
if est[1] != "drop":
- if check_is_fitted(est[1], False):
+ if check_is_fitted(est[1], exception=False):
self.estimators_.append(est[1])
else:
self.estimators_.append(next(estimators))
@@ -182,11 +180,9 @@ def fit(
self.stack_method_ = [
self._method_name(name, est, meth)
- for name, est, meth in zip(names, all_estimators, stack_method)
+ for name, est, meth in zip(names, all_estimators, stack_method, strict=True)
]
- fit_params = (
- {"sample_weight": sample_weight} if sample_weight is not None else None
- )
+ fit_params = {"sample_weight": sample_weight} if sample_weight is not None else None
predictions = Parallel(n_jobs=self.n_jobs)(
delayed(cross_val_predict)(
@@ -199,21 +195,20 @@ def fit(
fit_params=fit_params,
verbose=self.verbose,
)
- for est, meth in zip(all_estimators, self.stack_method_)
+ for est, meth in zip(all_estimators, self.stack_method_, strict=True)
if est != "drop"
)
# Only estimators that are neither None nor 'drop' are used in transform.
# Remove the corresponding None entries from the stack methods as well.
self.stack_method_ = [
- meth for (meth, est) in zip(self.stack_method_, all_estimators)
+ meth
+ for (meth, est) in zip(self.stack_method_, all_estimators, strict=True)
if est != "drop"
]
X_meta = self._concatenate_predictions(X, predictions)
- _fit_single_estimator(
- self.final_estimator_, X_meta, y, sample_weight=sample_weight
- )
+ _fit_single_estimator(self.final_estimator_, X_meta, y, sample_weight=sample_weight)
return self
@@ -253,8 +248,10 @@ def __init__(
)
# If all estimators are prefit, create fitted attrs
- if all(e[1] == "drop" or check_is_fitted(e[1], False) for e in self.estimators):
- self.estimators_ = [e[1] for e in self.estimators if e[1] != "drop"]
+ if all(
+ est[1] == "drop" or check_is_fitted(est[1], exception=False) for est in self.estimators
+ ):
+ self.estimators_ = [est[1] for est in self.estimators if est[1] != "drop"]
self._get_fitted_attrs()
def fit(
@@ -291,9 +288,7 @@ def fit(
)
if self.voting not in ("soft", "hard"):
- raise ValueError(
- f"Voting must be 'soft' or 'hard', got (voting={self.voting})."
- )
+ raise ValueError(f"Voting must be 'soft' or 'hard', got (voting={self.voting}).")
if self.weights is not None and len(self.weights) != len(self.estimators):
raise ValueError(
@@ -357,7 +352,9 @@ def __init__(
)
# If all estimators are prefit, create fitted attrs
- if all(e[1] == "drop" or check_is_fitted(e[1], False) for e in self.estimators):
+ if all(
+ est[1] == "drop" or check_is_fitted(est[1], exception=False) for est in self.estimators
+ ):
self.estimators_ = [est[1] for est in self.estimators if est[1] != "drop"]
self._get_fitted_attrs()
diff --git a/atom/feature_engineering.py b/atom/feature_engineering.py
index 6df2da2f7..6d2aba266 100644
--- a/atom/feature_engineering.py
+++ b/atom/feature_engineering.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -212,13 +210,11 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
)
# If >30% values are NaT, the conversion was unsuccessful
- if 100. * col_dt.isna().sum() / len(X) >= 30:
+ if 100.0 * col_dt.isna().sum() / len(X) >= 30:
continue # Skip this column
else:
i += 1
- self._log(
- f" --> Extracting features from categorical column {name}.", 1
- )
+ self._log(f" --> Extracting features from categorical column {name}.", 1)
# Extract features from the datetime column
for fx in map(str.lower, lst(self.features)):
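
The 30% guard works because the conversion above coerces unparseable values to NaT, so the NaT share measures how well the column matched a datetime format. A standalone sketch of the same check:

import pandas as pd

s = pd.Series(["2021-01-01", "not a date", "2021-03-05"])
col_dt = pd.to_datetime(s, errors="coerce")     # failures become NaT
nat_pct = 100.0 * col_dt.isna().sum() / len(s)
print(nat_pct >= 30)  # True: 1 of 3 values failed to parse (33.3%)
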
@@ -233,8 +229,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
# Skip if the information is not present in the format
if not isinstance(values, series_t):
self._log(
- f" --> Extracting feature {fx} failed. "
- "Result is not a Series.dt.", 2
+ f" --> Extracting feature {fx} failed. Result is not a Series.dt.", 2
)
continue
@@ -459,18 +454,18 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
Estimator instance.
"""
- all_operators = dict(
- add="add_numeric",
- sub="subtract_numeric",
- mul="multiply_numeric",
- div="divide_numeric",
- abs="absolute",
- sqrt="square_root",
- log="natural_logarithm",
- sin="sine",
- cos="cosine",
- tan="tangent",
- )
+ all_operators = {
+ "add": "add_numeric",
+ "sub": "subtract_numeric",
+ "mul": "multiply_numeric",
+ "div": "divide_numeric",
+ "abs": "absolute",
+ "sqrt": "square_root",
+ "log": "natural_logarithm",
+ "sin": "sine",
+ "cos": "cosine",
+ "tan": "tangent",
+ }
if not self.operators: # None or empty list
operators = list(all_operators)
@@ -492,7 +487,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
)
# Select the new features (dfs also returns originals)
- self._dfs = self._dfs[X.shape[1] - 1:]
+ self._dfs = self._dfs[X.shape[1] - 1 :]
# Get a random selection of features
if self.n_features and self.n_features < len(self._dfs):
@@ -558,16 +553,15 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
df = pd.DataFrame(
data=[
["", str(fx), fx.fitness_]
- for i, fx in enumerate(self.gfg_) if str(fx) not in X.columns
+ for i, fx in enumerate(self.gfg_)
+ if str(fx) not in X.columns
],
columns=["name", "description", "fitness"],
)
# Check if any new features remain
if len(df) == 0:
- self._log(
- " --> The genetic algorithm didn't find any improving features.", 2
- )
+ self._log(" --> The genetic algorithm didn't find any improving features.", 2)
return X
# Select the n_features with the highest fitness
@@ -578,7 +572,9 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
if len(df) != self.n_features:
self._log(
f" --> Dropping {(self.n_features or len(self.gfg_)) - len(df)} "
- "features due to repetition.", 2)
+ "features due to repetition.",
+ 2,
+ )
for i, array in enumerate(self.gfg_.transform(X)[:, df.index].T):
# If the column is new, use a default name
@@ -587,7 +583,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
name = f"x{X.shape[1] + counter}"
if name not in X:
X[name] = array # Add new feature to X
- df.iat[i, 0] = name
+ df.iloc[i, 0] = name
break
else:
counter += 1
@@ -734,7 +730,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
raise ValueError(
"Invalid value for the operators parameter. Value "
f"{operator} is not an attribute of numpy nor scipy.stats."
- )
+ ) from None
try:
X[f"{operator}({name})"] = result
@@ -742,7 +738,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
raise ValueError(
"Invalid value for the operators parameter. Value "
f"{operator} doesn't return a one-dimensional array."
- )
+ ) from None
to_drop.extend(group)
self._log(f" --> Group {name} successfully created.", 2)
@@ -917,21 +913,22 @@ class FeatureSelector(TransformerMixin):
`#!python device="gpu"` to use the GPU. Read more in the
[user guide][gpu-acceleration].
- engine: dict, default={"data": "numpy", "estimator": "sklearn"}
+ engine: dict or None, default=None
Execution engine to use for [data][data-acceleration] and
[estimators][estimator-acceleration]. The value should be a
dictionary with keys `data` and/or `estimator`, with their
- corresponding choice as values. Choose from:
+ corresponding choice as values. If None, the default values
+ are used. Choose from:
- "data":
- - "numpy"
+ - "numpy" (default)
- "pyarrow"
- "modin"
- "estimator":
- - "sklearn"
+ - "sklearn" (default)
- "sklearnex"
- "cuml"
@@ -1033,7 +1030,7 @@ def __init__(
max_correlation: FloatZeroToOneInc | None = 1.0,
n_jobs: NJobs = 1,
device: str = "cpu",
- engine: Engine = {"data": "numpy", "estimator": "sklearn"},
+ engine: Engine | None = None,
backend: Backend = "loky",
verbose: Verbose = 0,
logger: str | Path | Logger | None = None,
@@ -1115,19 +1112,19 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring):
self._low_variance: dict[Hashable, tuple[Hashable, float]] = {}
self._n_features = None
- strategies = dict(
- univariate="SelectKBest",
- pca="PCA",
- sfm="SelectFromModel",
- sfs="SequentialFeatureSelector",
- rfe="RFE",
- rfecv="RFECV",
- pso=ParticleSwarmOptimization,
- hho=HarrisHawkOptimization,
- gwo=GreyWolfOptimization,
- dfo=DragonFlyOptimization,
- go=GeneticOptimization,
- )
+ strategies = {
+ "univariate": "SelectKBest",
+ "pca": "PCA",
+ "sfm": "SelectFromModel",
+ "sfs": "SequentialFeatureSelector",
+ "rfe": "RFE",
+ "rfecv": "RFECV",
+ "pso": ParticleSwarmOptimization,
+ "hho": HarrisHawkOptimization,
+ "gwo": GreyWolfOptimization,
+ "dfo": DragonFlyOptimization,
+ "go": GeneticOptimization,
+ }
if isinstance(self.strategy, str):
if self.strategy not in ("univariate", "pca"):
@@ -1155,7 +1152,8 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring):
goal=goal,
**{
x: getattr(self, x)
- for x in BaseTransformer.attrs if hasattr(self, x)
+ for x in BaseTransformer.attrs
+ if hasattr(self, x)
},
)
model.task = goal.infer_task(y)
@@ -1169,7 +1167,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring):
solver = self.solver
elif self.kwargs:
- kw = ", ".join([f"{str(k)}={str(v)}" for k, v in self.kwargs.items()])
+ kw = ", ".join([f"{k}={v}" for k, v in self.kwargs.items()])
raise ValueError(
f"Keyword arguments ({kw}) are specified for "
"the strategy estimator but no strategy is selected."
@@ -1220,7 +1218,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring):
for name, column in X.select_dtypes(exclude="number").items():
for category, count in column.value_counts().items():
if count >= max_repeated:
- self._low_variance[name] = (category, 100. * count / len(X))
+ self._low_variance[name] = (category, 100.0 * count / len(X))
X = X.drop(columns=name)
break
@@ -1262,7 +1260,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring):
"corr_feature": [", ".join(corr_feature)],
"corr_value": [", ".join(corr_value)],
}
- )
+ ),
],
ignore_index=True,
)
@@ -1273,13 +1271,13 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring):
return self # Exit feature_engineering
elif self.strategy == "univariate":
- solvers_dct = dict(
- f_classif=f_classif,
- f_regression=f_regression,
- mutual_info_classif=mutual_info_classif,
- mutual_info_regression=mutual_info_regression,
- chi2=chi2,
- )
+ solvers_dct = {
+ "f_classif": f_classif,
+ "f_regression": f_regression,
+ "mutual_info_classif": mutual_info_classif,
+ "mutual_info_regression": mutual_info_regression,
+ "chi2": chi2,
+ }
if not self.solver:
raise ValueError(
@@ -1327,9 +1325,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring):
**self.kwargs,
).fit(X)
- self._estimator._comps = min(
- self._estimator.components_.shape[0], self._n_features
- )
+ self._estimator._comps = min(self._estimator.components_.shape[0], self._n_features)
elif self.strategy == "sfm":
# If any of these attr exists, the model is already fitted
@@ -1480,7 +1476,8 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
self._log(
f" --> Feature {fx} was removed due to high variance. "
f"Value {h_variance[0]} was the most repeated value with "
- f"{h_variance[1]} ({h_variance[1] / len(X):.1f}%) occurrences.", 2
+ f"{h_variance[1]} ({h_variance[1] / len(X):.1f}%) occurrences.",
+ 2,
)
X = X.drop(columns=fx)
@@ -1488,7 +1485,8 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
for fx, l_variance in self._low_variance.items():
self._log(
f" --> Feature {fx} was removed due to low variance. Value "
- f"{l_variance[0]} repeated in {l_variance[1]:.1f}% of the rows.", 2
+ f"{l_variance[0]} repeated in {l_variance[1]:.1f}% of the rows.",
+ 2,
)
X = X.drop(columns=fx)
@@ -1507,14 +1505,16 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
elif self.strategy == "univariate":
self._log(
f" --> The univariate test selected "
- f"{self._n_features} features from the dataset.", 2
+ f"{self._n_features} features from the dataset.",
+ 2,
)
for n, column in enumerate(X):
if not self.univariate_.get_support()[n]:
self._log(
f" --> Dropping feature {column} "
f"(score: {self.univariate_.scores_[n]:.2f} "
- f"p-value: {self.univariate_.pvalues_[n]:.2f}).", 2
+ f"p-value: {self.univariate_.pvalues_[n]:.2f}).",
+ 2,
)
X = X.drop(columns=column)
@@ -1525,17 +1525,17 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
self._log(" --> Scaling features...", 2)
X = self.scaler_.transform(X)
- X = self.pca_.transform(X).iloc[:, :self.pca_._comps]
+ X = self.pca_.transform(X).iloc[:, : self.pca_._comps]
- var = np.array(self.pca_.explained_variance_ratio_[:self._n_features])
+ var = np.array(self.pca_.explained_variance_ratio_[: self._n_features])
self._log(f" --> Keeping {self.pca_._comps} components.", 2)
self._log(f" --> Explained variance ratio: {round(var.sum(), 3)}", 2)
elif self.strategy in ("sfm", "sfs", "rfe", "rfecv"):
mask = self._estimator.get_support()
self._log(
- f" --> {self.strategy} selected "
- f"{sum(mask)} features from the dataset.", 2
+ f" --> {self.strategy} selected {sum(mask)} features from the dataset.",
+ 2,
)
for n, column in enumerate(X):
@@ -1543,7 +1543,8 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
if hasattr(self._estimator, "ranking_"):
self._log(
f" --> Dropping feature {column} "
- f"(rank {self._estimator.ranking_[n]}).", 2
+ f"(rank {self._estimator.ranking_[n]}).",
+ 2,
)
else:
self._log(f" --> Dropping feature {column}.", 2)
@@ -1552,7 +1553,8 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
else: # Advanced strategies
self._log(
f" --> {self.strategy} selected {len(self._estimator.best_feature_list)} "
- "features from the dataset.", 2
+ "features from the dataset.",
+ 2,
)
for column in X:
diff --git a/atom/models/__init__.py b/atom/models/__init__.py
index 12feebdf2..7bdbc8ab7 100644
--- a/atom/models/__init__.py
+++ b/atom/models/__init__.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
diff --git a/atom/models/classreg.py b/atom/models/classreg.py
index 938621ebc..cab1144db 100644
--- a/atom/models/classreg.py
+++ b/atom/models/classreg.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -9,7 +7,7 @@
from __future__ import annotations
-from typing import Any, cast
+from typing import Any, ClassVar, cast
import numpy as np
import pandas as pd
@@ -70,10 +68,10 @@ class AdaBoost(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = None
- supports_engines = ["sklearn"]
+ supports_engines = ("sklearn",)
_module = "ensemble"
- _estimators = {
+ _estimators: ClassVar[dict[str, str]] = {
"classification": "AdaBoostClassifier",
"regression": "AdaBoostRegressor",
}
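
The `ClassVar` annotations added to every model (ruff's RUF012) mark `_estimators` as a class-level constant shared by all instances, so type checkers catch accidental per-instance shadowing; the `supports_engines` lists become tuples for the same immutability reason. The recurring pattern, reduced to a sketch:

from typing import ClassVar

class Model:
    # One shared mapping per class; mutating it would affect every instance.
    _estimators: ClassVar[dict[str, str]] = {"classification": "AdaBoostClassifier"}
    # An immutable tuple signals that the engines are fixed at class level.
    supports_engines: ClassVar[tuple[str, ...]] = ("sklearn",)
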
@@ -87,10 +85,10 @@ def _get_distributions(self) -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- dist = dict(
- n_estimators=Int(50, 500, step=10),
- learning_rate=Float(0.01, 10, log=True),
- )
+ dist = {
+ "n_estimators": Int(50, 500, step=10),
+ "learning_rate": Float(0.01, 10, log=True),
+ }
if self._goal is Goal.classification:
dist["algorithm"] = Cat(["SAMME.R", "SAMME"])
@@ -140,10 +138,10 @@ class AutomaticRelevanceDetermination(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = None
- supports_engines = ["sklearn"]
+ supports_engines = ("sklearn",)
_module = "linear_model"
- _estimators = {"regression": "ARDRegression"}
+ _estimators: ClassVar[dict[str, str]] = {"regression": "ARDRegression"}
@staticmethod
def _get_distributions() -> dict[str, BaseDistribution]:
@@ -155,13 +153,13 @@ def _get_distributions() -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- return dict(
- n_iter=Int(100, 1000, step=10),
- alpha_1=Float(1e-4, 1, log=True),
- alpha_2=Float(1e-4, 1, log=True),
- lambda_1=Float(1e-4, 1, log=True),
- lambda_2=Float(1e-4, 1, log=True),
- )
+ return {
+ "n_iter": Int(100, 1000, step=10),
+ "alpha_1": Float(1e-4, 1, log=True),
+ "alpha_2": Float(1e-4, 1, log=True),
+ "lambda_1": Float(1e-4, 1, log=True),
+ "lambda_2": Float(1e-4, 1, log=True),
+ }
class Bagging(ClassRegModel):
@@ -208,10 +206,10 @@ class Bagging(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = None
- supports_engines = ["sklearn"]
+ supports_engines = ("sklearn",)
_module = "ensemble"
- _estimators = {
+ _estimators: ClassVar[dict[str, str]] = {
"classification": "BaggingClassifier",
"regression": "BaggingRegressor",
}
@@ -226,13 +224,13 @@ def _get_distributions() -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- return dict(
- n_estimators=Int(10, 500, step=10),
- max_samples=Float(0.5, 1.0, step=0.1),
- max_features=Float(0.5, 1.0, step=0.1),
- bootstrap=Cat([True, False]),
- bootstrap_features=Cat([True, False]),
- )
+ return {
+ "n_estimators": Int(10, 500, step=10),
+ "max_samples": Float(0.5, 1.0, step=0.1),
+ "max_features": Float(0.5, 1.0, step=0.1),
+ "bootstrap": Cat([True, False]),
+ "bootstrap_features": Cat([True, False]),
+ }
class BayesianRidge(ClassRegModel):
@@ -274,10 +272,10 @@ class BayesianRidge(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = None
- supports_engines = ["sklearn"]
+ supports_engines = ("sklearn",)
_module = "linear_model"
- _estimators = {"regression": "BayesianRidge"}
+ _estimators: ClassVar[dict[str, str]] = {"regression": "BayesianRidge"}
@staticmethod
def _get_distributions() -> dict[str, BaseDistribution]:
@@ -289,13 +287,13 @@ def _get_distributions() -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- return dict(
- n_iter=Int(100, 1000, step=10),
- alpha_1=Float(1e-4, 1, log=True),
- alpha_2=Float(1e-4, 1, log=True),
- lambda_1=Float(1e-4, 1, log=True),
- lambda_2=Float(1e-4, 1, log=True),
- )
+ return {
+ "n_iter": Int(100, 1000, step=10),
+ "alpha_1": Float(1e-4, 1, log=True),
+ "alpha_2": Float(1e-4, 1, log=True),
+ "lambda_1": Float(1e-4, 1, log=True),
+ "lambda_2": Float(1e-4, 1, log=True),
+ }
class BernoulliNB(ClassRegModel):
@@ -338,10 +336,10 @@ class BernoulliNB(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = None
- supports_engines = ["sklearn", "cuml"]
+ supports_engines = ("sklearn", "cuml")
_module = "naive_bayes"
- _estimators = {"classification": "BernoulliNB"}
+ _estimators: ClassVar[dict[str, str]] = {"classification": "BernoulliNB"}
@staticmethod
def _get_distributions() -> dict[str, BaseDistribution]:
@@ -353,10 +351,10 @@ def _get_distributions() -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- return dict(
- alpha=Float(0.01, 10, log=True),
- fit_prior=Cat([True, False]),
- )
+ return {
+ "alpha": Float(0.01, 10, log=True),
+ "fit_prior": Cat([True, False]),
+ }
class CatBoost(ClassRegModel):
@@ -417,10 +415,10 @@ class CatBoost(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = "n_estimators"
- supports_engines = ["catboost"]
+ supports_engines = ("catboost",)
_module = "catboost"
- _estimators = {
+ _estimators: ClassVar[dict[str, str]] = {
"classification": "CatBoostClassifier",
"regression": "CatBoostRegressor",
}
@@ -533,7 +531,7 @@ def _fit_estimator(
if trial and len(self._metric) == 1 and cb._pruned:
# Add the pruned step to the output
- step = len(self.evals[f'{m}_train'])
+ step = len(self.evals[f"{m}_train"])
steps = estimator.get_params()[self.has_validation]
trial.params[self.has_validation] = f"{step}/{steps}"
@@ -552,16 +550,16 @@ def _get_distributions() -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- return dict(
- n_estimators=Int(20, 500, step=10),
- learning_rate=Float(0.01, 1.0, log=True),
- max_depth=Cat([None, *range(1, 17)]),
- min_child_samples=Int(1, 30),
- bootstrap_type=Cat(["Bayesian", "Bernoulli"]),
- bagging_temperature=Float(0, 10),
- subsample=Float(0.5, 1.0, step=0.1),
- reg_lambda=Float(0.001, 100, log=True),
- )
+ return {
+ "n_estimators": Int(20, 500, step=10),
+ "learning_rate": Float(0.01, 1.0, log=True),
+ "max_depth": Cat([None, *range(1, 17)]),
+ "min_child_samples": Int(1, 30),
+ "bootstrap_type": Cat(["Bayesian", "Bernoulli"]),
+ "bagging_temperature": Float(0, 10),
+ "subsample": Float(0.5, 1.0, step=0.1),
+ "reg_lambda": Float(0.001, 100, log=True),
+ }
class CategoricalNB(ClassRegModel):
@@ -588,8 +586,9 @@ class CategoricalNB(ClassRegModel):
from atom import ATOMClassifier
import numpy as np
- X = np.random.randint(5, size=(100, 100))
- y = np.random.randint(2, size=100)
+ rng = np.random.default_rng()
+ X = rng.integers(5, size=(100, 100))
+ y = rng.integers(2, size=100)
atom = ATOMClassifier(X, y, random_state=1)
atom.run(models="CatNB", metric="f1", verbose=2)
@@ -603,10 +602,10 @@ class CategoricalNB(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = None
- supports_engines = ["sklearn", "cuml"]
+ supports_engines = ("sklearn", "cuml")
_module = "naive_bayes"
- _estimators = {"classification": "CategoricalNB"}
+ _estimators: ClassVar[dict[str, str]] = {"classification": "CategoricalNB"}
@staticmethod
def _get_distributions() -> dict[str, BaseDistribution]:
@@ -618,10 +617,10 @@ def _get_distributions() -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- return dict(
- alpha=Float(0.01, 10, log=True),
- fit_prior=Cat([True, False]),
- )
+ return {
+ "alpha": Float(0.01, 10, log=True),
+ "fit_prior": Cat([True, False]),
+ }
class ComplementNB(ClassRegModel):
@@ -663,10 +662,10 @@ class ComplementNB(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = None
- supports_engines = ["sklearn", "cuml"]
+ supports_engines = ("sklearn", "cuml")
_module = "naive_bayes"
- _estimators = {"classification": "ComplementNB"}
+ _estimators: ClassVar[dict[str, str]] = {"classification": "ComplementNB"}
@staticmethod
def _get_distributions() -> dict[str, BaseDistribution]:
@@ -678,11 +677,11 @@ def _get_distributions() -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- return dict(
- alpha=Float(0.01, 10, log=True),
- fit_prior=Cat([True, False]),
- norm=Cat([True, False]),
- )
+ return {
+ "alpha": Float(0.01, 10, log=True),
+ "fit_prior": Cat([True, False]),
+ "norm": Cat([True, False]),
+ }
class DecisionTree(ClassRegModel):
@@ -723,10 +722,10 @@ class DecisionTree(ClassRegModel):
native_multilabel = True
native_multioutput = True
has_validation = None
- supports_engines = ["sklearn"]
+ supports_engines = ("sklearn",)
_module = "tree"
- _estimators = {
+ _estimators: ClassVar[dict[str, str]] = {
"classification": "DecisionTreeClassifier",
"regression": "DecisionTreeRegressor",
}
@@ -745,15 +744,15 @@ def _get_distributions(self) -> dict[str, BaseDistribution]:
else:
criterion = ["squared_error", "absolute_error", "friedman_mse", "poisson"]
- return dict(
- criterion=Cat(criterion),
- splitter=Cat(["best", "random"]),
- max_depth=Cat([None, *range(1, 17)]),
- min_samples_split=Int(2, 20),
- min_samples_leaf=Int(1, 20),
- max_features=Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]),
- ccp_alpha=Float(0, 0.035, step=0.005),
- )
+ return {
+ "criterion": Cat(criterion),
+ "splitter": Cat(["best", "random"]),
+ "max_depth": Cat([None, *range(1, 17)]),
+ "min_samples_split": Int(2, 20),
+ "min_samples_leaf": Int(1, 20),
+ "max_features": Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]),
+ "ccp_alpha": Float(0, 0.035, step=0.005),
+ }
class Dummy(ClassRegModel):
@@ -798,10 +797,13 @@ class Dummy(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = None
- supports_engines = ["sklearn"]
+ supports_engines = ("sklearn",)
_module = "dummy"
- _estimators = {"classification": "DummyClassifier", "regression": "DummyRegressor"}
+ _estimators: ClassVar[dict[str, str]] = {
+ "classification": "DummyClassifier",
+ "regression": "DummyRegressor",
+ }
def _get_distributions(self) -> dict[str, BaseDistribution]:
"""Get the predefined hyperparameter distributions.
@@ -812,10 +814,10 @@ def _get_distributions(self) -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- dist = dict(
- strategy=Cat(["most_frequent", "prior", "stratified", "uniform"]),
- quantile=Float(0, 1.0, step=0.1),
- )
+ dist = {
+ "strategy": Cat(["most_frequent", "prior", "stratified", "uniform"]),
+ "quantile": Float(0, 1.0, step=0.1),
+ }
if self._goal is Goal.classification:
dist.pop("quantile")
@@ -862,10 +864,10 @@ class ElasticNet(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = None
- supports_engines = ["sklearn", "sklearnex", "cuml"]
+ supports_engines = ("sklearn", "sklearnex", "cuml")
_module = "linear_model"
- _estimators = {"regression": "ElasticNet"}
+ _estimators: ClassVar[dict[str, str]] = {"regression": "ElasticNet"}
@staticmethod
def _get_distributions() -> dict[str, BaseDistribution]:
@@ -877,11 +879,11 @@ def _get_distributions() -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- return dict(
- alpha=Float(1e-3, 10, log=True),
- l1_ratio=Float(0.1, 0.9, step=0.1),
- selection=Cat(["cyclic", "random"]),
- )
+ return {
+ "alpha": Float(1e-3, 10, log=True),
+ "l1_ratio": Float(0.1, 0.9, step=0.1),
+ "selection": Cat(["cyclic", "random"]),
+ }
class ExtraTree(ClassRegModel):
@@ -927,10 +929,10 @@ class ExtraTree(ClassRegModel):
native_multilabel = True
native_multioutput = True
has_validation = None
- supports_engines = ["sklearn"]
+ supports_engines = ("sklearn",)
_module = "tree"
- _estimators = {
+ _estimators: ClassVar[dict[str, str]] = {
"classification": "ExtraTreeClassifier",
"regression": "ExtraTreeRegressor",
}
@@ -949,15 +951,15 @@ def _get_distributions(self) -> dict[str, BaseDistribution]:
else:
criterion = ["squared_error", "absolute_error"]
- return dict(
- criterion=Cat(criterion),
- splitter=Cat(["random", "best"]),
- max_depth=Cat([None, *range(1, 17)]),
- min_samples_split=Int(2, 20),
- min_samples_leaf=Int(1, 20),
- max_features=Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]),
- ccp_alpha=Float(0, 0.035, step=0.005),
- )
+ return {
+ "criterion": Cat(criterion),
+ "splitter": Cat(["random", "best"]),
+ "max_depth": Cat([None, *range(1, 17)]),
+ "min_samples_split": Int(2, 20),
+ "min_samples_leaf": Int(1, 20),
+ "max_features": Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]),
+ "ccp_alpha": Float(0, 0.035, step=0.005),
+ }
class ExtraTrees(ClassRegModel):
@@ -1001,10 +1003,10 @@ class ExtraTrees(ClassRegModel):
native_multilabel = True
native_multioutput = True
has_validation = None
- supports_engines = ["sklearn"]
+ supports_engines = ("sklearn",)
_module = "ensemble"
- _estimators = {
+ _estimators: ClassVar[dict[str, str]] = {
"classification": "ExtraTreesClassifier",
"regression": "ExtraTreesRegressor",
}
@@ -1045,17 +1047,17 @@ def _get_distributions(self) -> dict[str, BaseDistribution]:
else:
criterion = ["squared_error", "absolute_error"]
- return dict(
- n_estimators=Int(10, 500, step=10),
- criterion=Cat(criterion),
- max_depth=Cat([None, *range(1, 17)]),
- min_samples_split=Int(2, 20),
- min_samples_leaf=Int(1, 20),
- max_features=Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]),
- bootstrap=Cat([True, False]),
- max_samples=Cat([None, 0.5, 0.6, 0.7, 0.8, 0.9]),
- ccp_alpha=Float(0, 0.035, step=0.005),
- )
+ return {
+ "n_estimators": Int(10, 500, step=10),
+ "criterion": Cat(criterion),
+ "max_depth": Cat([None, *range(1, 17)]),
+ "min_samples_split": Int(2, 20),
+ "min_samples_leaf": Int(1, 20),
+ "max_features": Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]),
+ "bootstrap": Cat([True, False]),
+ "max_samples": Cat([None, 0.5, 0.6, 0.7, 0.8, 0.9]),
+ "ccp_alpha": Float(0, 0.035, step=0.005),
+ }
class GaussianNB(ClassRegModel):
@@ -1097,10 +1099,10 @@ class GaussianNB(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = None
- supports_engines = ["sklearn", "cuml"]
+ supports_engines = ("sklearn", "cuml")
_module = "naive_bayes"
- _estimators = {"classification": "GaussianNB"}
+ _estimators: ClassVar[dict[str, str]] = {"classification": "GaussianNB"}
class GaussianProcess(ClassRegModel):
@@ -1156,10 +1158,10 @@ class GaussianProcess(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = None
- supports_engines = ["sklearn"]
+ supports_engines = ("sklearn",)
_module = "gaussian_process"
- _estimators = {
+ _estimators: ClassVar[dict[str, str]] = {
"classification": "GaussianProcessClassifier",
"regression": "GaussianProcessRegressor",
}
@@ -1212,10 +1214,10 @@ class GradientBoostingMachine(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = None
- supports_engines = ["sklearn"]
+ supports_engines = ("sklearn",)
_module = "ensemble"
- _estimators = {
+ _estimators: ClassVar[dict[str, str]] = {
"classification": "GradientBoostingClassifier",
"regression": "GradientBoostingRegressor",
}
@@ -1229,18 +1231,18 @@ def _get_distributions(self) -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- dist = dict(
- loss=Cat(["log_loss", "exponential"]),
- learning_rate=Float(0.01, 1.0, log=True),
- n_estimators=Int(10, 500, step=10),
- subsample=Float(0.5, 1.0, step=0.1),
- criterion=Cat(["friedman_mse", "squared_error"]),
- min_samples_split=Int(2, 20),
- min_samples_leaf=Int(1, 20),
- max_depth=Int(1, 21),
- max_features=Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]),
- ccp_alpha=Float(0, 0.035, step=0.005),
- )
+ dist = {
+ "loss": Cat(["log_loss", "exponential"]),
+ "learning_rate": Float(0.01, 1.0, log=True),
+ "n_estimators": Int(10, 500, step=10),
+ "subsample": Float(0.5, 1.0, step=0.1),
+ "criterion": Cat(["friedman_mse", "squared_error"]),
+ "min_samples_split": Int(2, 20),
+ "min_samples_leaf": Int(1, 20),
+ "max_depth": Int(1, 21),
+ "max_features": Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]),
+ "ccp_alpha": Float(0, 0.035, step=0.005),
+ }
# Avoid 'task' when class initialized without branches
if "_branch" in self.__dict__ and self.task.is_multiclass:
@@ -1291,10 +1293,10 @@ class HuberRegression(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = None
- supports_engines = ["sklearn"]
+ supports_engines = ("sklearn",)
_module = "linear_model"
- _estimators = {"regression": "HuberRegressor"}
+ _estimators: ClassVar[dict[str, str]] = {"regression": "HuberRegressor"}
@staticmethod
def _get_distributions() -> dict[str, BaseDistribution]:
@@ -1306,11 +1308,11 @@ def _get_distributions() -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- return dict(
- epsilon=Float(1, 10, log=True),
- max_iter=Int(50, 500, step=10),
- alpha=Float(1e-4, 1, log=True),
- )
+ return {
+ "epsilon": Float(1, 10, log=True),
+ "max_iter": Int(50, 500, step=10),
+ "alpha": Float(1e-4, 1, log=True),
+ }
class HistGradientBoosting(ClassRegModel):
@@ -1357,10 +1359,10 @@ class HistGradientBoosting(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = None
- supports_engines = ["sklearn"]
+ supports_engines = ("sklearn",)
_module = "ensemble"
- _estimators = {
+ _estimators: ClassVar[dict[str, str]] = {
"classification": "HistGradientBoostingClassifier",
"regression": "HistGradientBoostingRegressor",
}
@@ -1374,16 +1376,16 @@ def _get_distributions(self) -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- dist = dict(
- loss=Cat(["squared_error", "absolute_error", "poisson", "quantile", "gamma"]),
- quantile=Float(0, 1, step=0.1),
- learning_rate=Float(0.01, 1.0, log=True),
- max_iter=Int(10, 500, step=10),
- max_leaf_nodes=Int(10, 50),
- max_depth=Cat([None, *range(1, 17)]),
- min_samples_leaf=Int(10, 30),
- l2_regularization=Float(0, 1.0, step=0.1),
- )
+ dist = {
+ "loss": Cat(["squared_error", "absolute_error", "poisson", "quantile", "gamma"]),
+ "quantile": Float(0, 1, step=0.1),
+ "learning_rate": Float(0.01, 1.0, log=True),
+ "max_iter": Int(10, 500, step=10),
+ "max_leaf_nodes": Int(10, 50),
+ "max_depth": Cat([None, *range(1, 17)]),
+ "min_samples_leaf": Int(10, 30),
+ "l2_regularization": Float(0, 1.0, step=0.1),
+ }
if self._goal is Goal.classification:
dist.pop("loss")
@@ -1433,10 +1435,10 @@ class KNearestNeighbors(ClassRegModel):
native_multilabel = True
native_multioutput = True
has_validation = None
- supports_engines = ["sklearn", "sklearnex", "cuml"]
+ supports_engines = ("sklearn", "sklearnex", "cuml")
_module = "neighbors"
- _estimators = {
+ _estimators: ClassVar[dict[str, str]] = {
"classification": "KNeighborsClassifier",
"regression": "KNeighborsRegressor",
}
@@ -1450,13 +1452,13 @@ def _get_distributions(self) -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- dist = dict(
- n_neighbors=Int(1, 100),
- weights=Cat(["uniform", "distance"]),
- algorithm=Cat(["auto", "ball_tree", "kd_tree", "brute"]),
- leaf_size=Int(20, 40),
- p=Int(1, 2),
- )
+ dist = {
+ "n_neighbors": Int(1, 100),
+ "weights": Cat(["uniform", "distance"]),
+ "algorithm": Cat(["auto", "ball_tree", "kd_tree", "brute"]),
+ "leaf_size": Int(20, 40),
+ "p": Int(1, 2),
+ }
if self._gpu:
dist.pop("algorithm") # Only 'brute' is supported
@@ -1505,10 +1507,10 @@ class Lasso(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = None
- supports_engines = ["sklearn", "sklearnex", "cuml"]
+ supports_engines = ("sklearn", "sklearnex", "cuml")
_module = "linear_model"
- _estimators = {"regression": "Lasso"}
+ _estimators: ClassVar[dict[str, str]] = {"regression": "Lasso"}
@staticmethod
def _get_distributions() -> dict[str, BaseDistribution]:
@@ -1520,10 +1522,10 @@ def _get_distributions() -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- return dict(
- alpha=Float(1e-3, 10, log=True),
- selection=Cat(["cyclic", "random"]),
- )
+ return {
+ "alpha": Float(1e-3, 10, log=True),
+ "selection": Cat(["cyclic", "random"]),
+ }
class LeastAngleRegression(ClassRegModel):
@@ -1568,10 +1570,10 @@ class LeastAngleRegression(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = None
- supports_engines = ["sklearn"]
+ supports_engines = ("sklearn",)
_module = "linear_model"
- _estimators = {"regression": "Lars"}
+ _estimators: ClassVar[dict[str, str]] = {"regression": "Lars"}
class LightGBM(ClassRegModel):
@@ -1623,10 +1625,13 @@ class LightGBM(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = "n_estimators"
- supports_engines = ["lightgbm"]
+ supports_engines = ("lightgbm",)
_module = "lightgbm.sklearn"
- _estimators = {"classification": "LGBMClassifier", "regression": "LGBMRegressor"}
+ _estimators: ClassVar[dict[str, str]] = {
+ "classification": "LGBMClassifier",
+ "regression": "LGBMRegressor",
+ }
def _get_est(self, params: dict[str, Any]) -> Predictor:
"""Get the model's estimator with unpacked parameters.
@@ -1644,7 +1649,7 @@ def _get_est(self, params: dict[str, Any]) -> Predictor:
"""
# Custom lightgbm mapping for warnings
# PYTHONWARNINGS doesn't work since the warnings go from C/C++ code to stdout
- warns = dict(always=2, default=1, once=0, error=0, ignore=-1)
+ warns = {"always": 2, "default": 1, "once": 0, "error": 0, "ignore": -1}
return self._est_class(
verbose=params.pop("verbose", warns.get(self.warnings, -1)),
@@ -1694,7 +1699,7 @@ def _fit_estimator(
m = self._metric[0].name
params = est_params_fit.copy()
- callbacks = params.pop("callbacks", []) + [log_evaluation(-1)]
+ callbacks = [*params.pop("callbacks", []), log_evaluation(-1)]
if trial and len(self._metric) == 1:
callbacks.append(LightGBMPruningCallback(trial, m, "valid_1"))
@@ -1738,18 +1743,18 @@ def _get_distributions() -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- return dict(
- n_estimators=Int(20, 500, step=10),
- learning_rate=Float(0.01, 1.0, log=True),
- max_depth=Int(-1, 17, step=2),
- num_leaves=Int(20, 40),
- min_child_weight=Float(1e-4, 100, log=True),
- min_child_samples=Int(1, 30),
- subsample=Float(0.5, 1.0, step=0.1),
- colsample_bytree=Float(0.4, 1.0, step=0.1),
- reg_alpha=Float(1e-4, 100, log=True),
- reg_lambda=Float(1e-4, 100, log=True),
- )
+ return {
+ "n_estimators": Int(20, 500, step=10),
+ "learning_rate": Float(0.01, 1.0, log=True),
+ "max_depth": Int(-1, 17, step=2),
+ "num_leaves": Int(20, 40),
+ "min_child_weight": Float(1e-4, 100, log=True),
+ "min_child_samples": Int(1, 30),
+ "subsample": Float(0.5, 1.0, step=0.1),
+ "colsample_bytree": Float(0.4, 1.0, step=0.1),
+ "reg_alpha": Float(1e-4, 100, log=True),
+ "reg_lambda": Float(1e-4, 100, log=True),
+ }
class LinearDiscriminantAnalysis(ClassRegModel):
@@ -1757,7 +1762,7 @@ class LinearDiscriminantAnalysis(ClassRegModel):
Linear Discriminant Analysis is a classifier with a linear
decision boundary, generated by fitting class conditional densities
- to the data and using Bayes’ rule. The model fits a Gaussian
+ to the data and using Bayes' rule. The model fits a Gaussian
density to each class, assuming that all classes share the same
covariance matrix.
@@ -1793,10 +1798,10 @@ class LinearDiscriminantAnalysis(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = None
- supports_engines = ["sklearn"]
+ supports_engines = ("sklearn",)
_module = "discriminant_analysis"
- _estimators = {"classification": "LinearDiscriminantAnalysis"}
+ _estimators: ClassVar[dict[str, str]] = {"classification": "LinearDiscriminantAnalysis"}
def _get_parameters(self, trial: Trial) -> dict:
"""Get the trial's hyperparameters.
@@ -1830,10 +1835,10 @@ def _get_distributions() -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- return dict(
- solver=Cat(["svd", "lsqr", "eigen"]),
- shrinkage=Cat([None, "auto", 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
- )
+ return {
+ "solver": Cat(["svd", "lsqr", "eigen"]),
+ "shrinkage": Cat([None, "auto", 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
+ }
class LinearSVM(ClassRegModel):
@@ -1877,10 +1882,13 @@ class LinearSVM(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = None
- supports_engines = ["sklearn", "cuml"]
+ supports_engines = ("sklearn", "cuml")
_module = "svm"
- _estimators = {"classification": "LinearSVC", "regression": "LinearSVR"}
+ _estimators: ClassVar[dict[str, str]] = {
+ "classification": "LinearSVC",
+ "regression": "LinearSVR",
+ }
def _get_parameters(self, trial: Trial) -> dict:
"""Get the trial's hyperparameters.
@@ -2003,10 +2011,10 @@ class LogisticRegression(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = None
- supports_engines = ["sklearn", "sklearnex", "cuml"]
+ supports_engines = ("sklearn", "sklearnex", "cuml")
_module = "linear_model"
- _estimators = {"classification": "LogisticRegression"}
+ _estimators: ClassVar[dict[str, str]] = {"classification": "LogisticRegression"}
def _get_parameters(self, trial: Trial) -> dict:
"""Get the trial's hyperparameters.
@@ -2046,13 +2054,13 @@ def _get_distributions(self) -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- dist = dict(
- penalty=Cat([None, "l1", "l2", "elasticnet"]),
- C=Float(1e-3, 100, log=True),
- solver=Cat(["lbfgs", "newton-cg", "liblinear", "sag", "saga"]),
- max_iter=Int(100, 1000, step=10),
- l1_ratio=Float(0, 1.0, step=0.1),
- )
+ dist = {
+ "penalty": Cat([None, "l1", "l2", "elasticnet"]),
+ "C": Float(1e-3, 100, log=True),
+ "solver": Cat(["lbfgs", "newton-cg", "liblinear", "sag", "saga"]),
+ "max_iter": Int(100, 1000, step=10),
+ "l1_ratio": Float(0, 1.0, step=0.1),
+ }
if self._gpu:
if self.engine.get("estimator") == "cuml":
@@ -2110,10 +2118,13 @@ class MultiLayerPerceptron(ClassRegModel):
native_multilabel = True
native_multioutput = False
has_validation = "max_iter"
- supports_engines = ["sklearn"]
+ supports_engines = ("sklearn",)
_module = "neural_network"
- _estimators = {"classification": "MLPClassifier", "regression": "MLPRegressor"}
+ _estimators: ClassVar[dict[str, str]] = {
+ "classification": "MLPClassifier",
+ "regression": "MLPRegressor",
+ }
def _trial_to_est(self, params: dict[str, Any]) -> dict[str, Any]:
"""Convert trial's hyperparameters to parameters for the estimator.
@@ -2131,10 +2142,11 @@ def _trial_to_est(self, params: dict[str, Any]) -> dict[str, Any]:
"""
params = super()._trial_to_est(params)
- hidden_layer_sizes = []
- for param in [p for p in sorted(params) if p.startswith("hidden_layer")]:
- if value := params.pop(param): # Neurons should be more than zero
- hidden_layer_sizes.append(value)
+ hidden_layer_sizes = [
+ value
+ for param in [p for p in sorted(params) if p.startswith("hidden_layer")]
+ if (value := params.pop(param)) # Neurons should be more than zero
+ ]
if hidden_layer_sizes:
params["hidden_layer_sizes"] = tuple(hidden_layer_sizes)
@@ -2150,19 +2162,19 @@ def _get_distributions(self) -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- dist = dict(
- hidden_layer_1=Int(10, 100),
- hidden_layer_2=Int(0, 100),
- hidden_layer_3=Int(0, 10),
- activation=Cat(["identity", "logistic", "tanh", "relu"]),
- solver=Cat(["lbfgs", "sgd", "adam"]),
- alpha=Float(1e-4, 0.1, log=True),
- batch_size=Cat(["auto", 8, 16, 32, 64, 128, 256]),
- learning_rate=Cat(["constant", "invscaling", "adaptive"]),
- learning_rate_init=Float(1e-3, 0.1, log=True),
- power_t=Float(0.1, 0.9, step=0.1),
- max_iter=Int(50, 500, step=10),
- )
+ dist = {
+ "hidden_layer_1": Int(10, 100),
+ "hidden_layer_2": Int(0, 100),
+ "hidden_layer_3": Int(0, 10),
+ "activation": Cat(["identity", "logistic", "tanh", "relu"]),
+ "solver": Cat(["lbfgs", "sgd", "adam"]),
+ "alpha": Float(1e-4, 0.1, log=True),
+ "batch_size": Cat(["auto", 8, 16, 32, 64, 128, 256]),
+ "learning_rate": Cat(["constant", "invscaling", "adaptive"]),
+ "learning_rate_init": Float(1e-3, 0.1, log=True),
+ "power_t": Float(0.1, 0.9, step=0.1),
+ "max_iter": Int(50, 500, step=10),
+ }
# Drop layers if user specifies sizes
if "hidden_layer_sizes" in self._est_params:
@@ -2212,10 +2224,10 @@ class MultinomialNB(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = None
- supports_engines = ["sklearn", "cuml"]
+ supports_engines = ("sklearn", "cuml")
_module = "naive_bayes"
- _estimators = {"classification": "MultinomialNB"}
+ _estimators: ClassVar[dict[str, str]] = {"classification": "MultinomialNB"}
@staticmethod
def _get_distributions() -> dict[str, BaseDistribution]:
@@ -2227,10 +2239,10 @@ def _get_distributions() -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- return dict(
- alpha=Float(0.01, 10, log=True),
- fit_prior=Cat([True, False]),
- )
+ return {
+ "alpha": Float(0.01, 10, log=True),
+ "fit_prior": Cat([True, False]),
+ }
class OrdinaryLeastSquares(ClassRegModel):
@@ -2274,10 +2286,10 @@ class OrdinaryLeastSquares(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = None
- supports_engines = ["sklearn", "sklearnex", "cuml"]
+ supports_engines = ("sklearn", "sklearnex", "cuml")
_module = "linear_model"
- _estimators = {"regression": "LinearRegression"}
+ _estimators: ClassVar[dict[str, str]] = {"regression": "LinearRegression"}
class OrthogonalMatchingPursuit(ClassRegModel):
@@ -2319,10 +2331,10 @@ class OrthogonalMatchingPursuit(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = None
- supports_engines = ["sklearn"]
+ supports_engines = ("sklearn",)
_module = "linear_model"
- _estimators = {"regression": "OrthogonalMatchingPursuit"}
+ _estimators: ClassVar[dict[str, str]] = {"regression": "OrthogonalMatchingPursuit"}
class PassiveAggressive(ClassRegModel):
@@ -2366,10 +2378,10 @@ class PassiveAggressive(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = "max_iter"
- supports_engines = ["sklearn"]
+ supports_engines = ("sklearn",)
_module = "linear_model"
- _estimators = {
+ _estimators: ClassVar[dict[str, str]] = {
"classification": "PassiveAggressiveClassifier",
"regression": "PassiveAggressiveRegressor",
}
@@ -2388,12 +2400,12 @@ def _get_distributions(self) -> dict[str, BaseDistribution]:
else:
loss = ["epsilon_insensitive", "squared_epsilon_insensitive"]
- return dict(
- C=Float(1e-3, 100, log=True),
- max_iter=Int(500, 1500, step=50),
- loss=Cat(loss),
- average=Cat([True, False]),
- )
+ return {
+ "C": Float(1e-3, 100, log=True),
+ "max_iter": Int(500, 1500, step=50),
+ "loss": Cat(loss),
+ "average": Cat([True, False]),
+ }
class Perceptron(ClassRegModel):
@@ -2442,10 +2454,10 @@ class Perceptron(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = "max_iter"
- supports_engines = ["sklearn"]
+ supports_engines = ("sklearn",)
_module = "linear_model"
- _estimators = {"classification": "Perceptron"}
+ _estimators: ClassVar[dict[str, str]] = {"classification": "Perceptron"}
@staticmethod
def _get_distributions() -> dict[str, BaseDistribution]:
@@ -2457,13 +2469,13 @@ def _get_distributions() -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- return dict(
- penalty=Cat([None, "l2", "l1", "elasticnet"]),
- alpha=Float(1e-4, 10, log=True),
- l1_ratio=Float(0.1, 0.9, step=0.1),
- max_iter=Int(500, 1500, step=50),
- eta0=Float(1e-2, 10, log=True),
- )
+ return {
+ "penalty": Cat([None, "l2", "l1", "elasticnet"]),
+ "alpha": Float(1e-4, 10, log=True),
+ "l1_ratio": Float(0.1, 0.9, step=0.1),
+ "max_iter": Int(500, 1500, step=50),
+ "eta0": Float(1e-2, 10, log=True),
+ }
class QuadraticDiscriminantAnalysis(ClassRegModel):
@@ -2471,7 +2483,7 @@ class QuadraticDiscriminantAnalysis(ClassRegModel):
Quadratic Discriminant Analysis is a classifier with a quadratic
decision boundary, generated by fitting class conditional densities
- to the data and using Bayes’ rule. The model fits a Gaussian
+ to the data and using Bayes' rule. The model fits a Gaussian
density to each class, estimating a separate covariance matrix for
each class.
@@ -2507,10 +2519,10 @@ class QuadraticDiscriminantAnalysis(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = None
- supports_engines = ["sklearn"]
+ supports_engines = ("sklearn",)
_module = "discriminant_analysis"
- _estimators = {"classification": "QuadraticDiscriminantAnalysis"}
+ _estimators: ClassVar[dict[str, str]] = {"classification": "QuadraticDiscriminantAnalysis"}
@staticmethod
def _get_distributions() -> dict[str, BaseDistribution]:
@@ -2522,7 +2534,7 @@ def _get_distributions() -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- return dict(reg_param=Float(0, 1.0, step=0.1))
+ return {"reg_param": Float(0, 1.0, step=0.1)}
class RadiusNearestNeighbors(ClassRegModel):
@@ -2577,10 +2589,10 @@ class RadiusNearestNeighbors(ClassRegModel):
native_multilabel = True
native_multioutput = True
has_validation = None
- supports_engines = ["sklearn"]
+ supports_engines = ("sklearn",)
_module = "neighbors"
- _estimators = {
+ _estimators: ClassVar[dict[str, str]] = {
"classification": "RadiusNeighborsClassifier",
"regression": "RadiusNeighborsRegressor",
}
@@ -2595,13 +2607,13 @@ def _get_distributions() -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- return dict(
- radius=Float(1e-2, 100),
- weights=Cat(["uniform", "distance"]),
- algorithm=Cat(["auto", "ball_tree", "kd_tree", "brute"]),
- leaf_size=Int(20, 40),
- p=Int(1, 2),
- )
+ return {
+ "radius": Float(1e-2, 100),
+ "weights": Cat(["uniform", "distance"]),
+ "algorithm": Cat(["auto", "ball_tree", "kd_tree", "brute"]),
+ "leaf_size": Int(20, 40),
+ "p": Int(1, 2),
+ }
class RandomForest(ClassRegModel):
@@ -2653,10 +2665,10 @@ class RandomForest(ClassRegModel):
native_multilabel = True
native_multioutput = True
has_validation = None
- supports_engines = ["sklearn", "sklearnex", "cuml"]
+ supports_engines = ("sklearn", "sklearnex", "cuml")
_module = "ensemble"
- _estimators = {
+ _estimators: ClassVar[dict[str, str]] = {
"classification": "RandomForestClassifier",
"regression": "RandomForestRegressor",
}
@@ -2700,17 +2712,17 @@ def _get_distributions(self) -> dict[str, BaseDistribution]:
else:
criterion = ["squared_error", "absolute_error", "poisson"]
- dist = dict(
- n_estimators=Int(10, 500, step=10),
- criterion=Cat(criterion),
- max_depth=Cat([None, *range(1, 17)]),
- min_samples_split=Int(2, 20),
- min_samples_leaf=Int(1, 20),
- max_features=Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]),
- bootstrap=Cat([True, False]),
- max_samples=Cat([None, 0.5, 0.6, 0.7, 0.8, 0.9]),
- ccp_alpha=Float(0, 0.035, step=0.005),
- )
+ dist = {
+ "n_estimators": Int(10, 500, step=10),
+ "criterion": Cat(criterion),
+ "max_depth": Cat([None, *range(1, 17)]),
+ "min_samples_split": Int(2, 20),
+ "min_samples_leaf": Int(1, 20),
+ "max_features": Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]),
+ "bootstrap": Cat([True, False]),
+ "max_samples": Cat([None, 0.5, 0.6, 0.7, 0.8, 0.9]),
+ "ccp_alpha": Float(0, 0.035, step=0.005),
+ }
if self.engine.get("estimator") == "sklearnex":
dist.pop("criterion")
@@ -2768,10 +2780,13 @@ class Ridge(ClassRegModel):
native_multilabel = True
native_multioutput = False
has_validation = None
- supports_engines = ["sklearn", "sklearnex", "cuml"]
+ supports_engines = ("sklearn", "sklearnex", "cuml")
_module = "linear_model"
- _estimators = {"classification": "RidgeClassifier", "regression": "Ridge"}
+ _estimators: ClassVar[dict[str, str]] = {
+ "classification": "RidgeClassifier",
+ "regression": "Ridge",
+ }
def _get_distributions(self) -> dict[str, BaseDistribution]:
"""Get the predefined hyperparameter distributions.
@@ -2782,10 +2797,10 @@ def _get_distributions(self) -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- dist = dict(
- alpha=Float(1e-3, 10, log=True),
- solver=Cat(["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]),
- )
+ dist = {
+ "alpha": Float(1e-3, 10, log=True),
+ "solver": Cat(["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]),
+ }
if self._goal is Goal.regression:
if self.engine.get("estimator") == "sklearnex":
@@ -2838,10 +2853,13 @@ class StochasticGradientDescent(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = "max_iter"
- supports_engines = ["sklearn"]
+ supports_engines = ("sklearn",)
_module = "linear_model"
- _estimators = {"classification": "SGDClassifier", "regression": "SGDRegressor"}
+ _estimators: ClassVar[dict[str, str]] = {
+ "classification": "SGDClassifier",
+ "regression": "SGDRegressor",
+ }
def _get_distributions(self) -> dict[str, BaseDistribution]:
"""Get the predefined hyperparameter distributions.
@@ -2864,18 +2882,18 @@ def _get_distributions(self) -> dict[str, BaseDistribution]:
"squared_epsilon_insensitive",
]
- return dict(
- loss=Cat(loss if self._goal is Goal.classification else loss[-4:]),
- penalty=Cat([None, "l1", "l2", "elasticnet"]),
- alpha=Float(1e-4, 1.0, log=True),
- l1_ratio=Float(0.1, 0.9, step=0.1),
- max_iter=Int(500, 1500, step=50),
- epsilon=Float(1e-4, 1.0, log=True),
- learning_rate=Cat(["constant", "invscaling", "optimal", "adaptive"]),
- eta0=Float(1e-2, 10, log=True),
- power_t=Float(0.1, 0.9, step=0.1),
- average=Cat([True, False]),
- )
+ return {
+ "loss": Cat(loss if self._goal is Goal.classification else loss[-4:]),
+ "penalty": Cat([None, "l1", "l2", "elasticnet"]),
+ "alpha": Float(1e-4, 1.0, log=True),
+ "l1_ratio": Float(0.1, 0.9, step=0.1),
+ "max_iter": Int(500, 1500, step=50),
+ "epsilon": Float(1e-4, 1.0, log=True),
+ "learning_rate": Cat(["constant", "invscaling", "optimal", "adaptive"]),
+ "eta0": Float(1e-2, 10, log=True),
+ "power_t": Float(0.1, 0.9, step=0.1),
+ "average": Cat([True, False]),
+ }
class SupportVectorMachine(ClassRegModel):
@@ -2920,10 +2938,10 @@ class SupportVectorMachine(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = None
- supports_engines = ["sklearn", "sklearnex", "cuml"]
+ supports_engines = ("sklearn", "sklearnex", "cuml")
_module = "svm"
- _estimators = {"classification": "SVC", "regression": "SVR"}
+ _estimators: ClassVar[dict[str, str]] = {"classification": "SVC", "regression": "SVR"}
def _get_parameters(self, trial: Trial) -> dict:
"""Get the trial's hyperparameters.
@@ -2965,7 +2983,8 @@ def _get_est(self, params: dict[str, Any]) -> Predictor:
return self._est_class(
probability=params.pop("probability", True),
random_state=params.pop("random_state", self.random_state),
- **params)
+ **params,
+ )
else:
return super()._get_est(params)
@@ -2978,15 +2997,15 @@ def _get_distributions(self) -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- dist = dict(
- C=Float(1e-3, 100, log=True),
- kernel=Cat(["linear", "poly", "rbf", "sigmoid"]),
- degree=Int(2, 5),
- gamma=Cat(["scale", "auto"]),
- coef0=Float(-1.0, 1.0),
- epsilon=Float(1e-3, 100, log=True),
- shrinking=Cat([True, False]),
- )
+ dist = {
+ "C": Float(1e-3, 100, log=True),
+ "kernel": Cat(["linear", "poly", "rbf", "sigmoid"]),
+ "degree": Int(2, 5),
+ "gamma": Cat(["scale", "auto"]),
+ "coef0": Float(-1.0, 1.0),
+ "epsilon": Float(1e-3, 100, log=True),
+ "shrinking": Cat([True, False]),
+ }
if self._goal is Goal.classification:
dist.pop("epsilon")
@@ -3039,10 +3058,13 @@ class XGBoost(ClassRegModel):
native_multilabel = False
native_multioutput = False
has_validation = "n_estimators"
- supports_engines = ["xgboost"]
+ supports_engines = ("xgboost",)
_module = "xgboost"
- _estimators = {"classification": "XGBClassifier", "regression": "XGBRegressor"}
+ _estimators: ClassVar[dict[str, str]] = {
+ "classification": "XGBClassifier",
+ "regression": "XGBRegressor",
+ }
@property
def trials(self) -> pd.DataFrame:
@@ -3177,14 +3199,14 @@ def _get_distributions() -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- return dict(
- n_estimators=Int(20, 500, step=10),
- learning_rate=Float(0.01, 1.0, log=True),
- max_depth=Int(1, 20),
- gamma=Float(0, 1.0),
- min_child_weight=Int(1, 10),
- subsample=Float(0.5, 1.0, step=0.1),
- colsample_bytree=Float(0.4, 1.0, step=0.1),
- reg_alpha=Float(1e-4, 100, log=True),
- reg_lambda=Float(1e-4, 100, log=True),
- )
+ return {
+ "n_estimators": Int(20, 500, step=10),
+ "learning_rate": Float(0.01, 1.0, log=True),
+ "max_depth": Int(1, 20),
+ "gamma": Float(0, 1.0),
+ "min_child_weight": Int(1, 10),
+ "subsample": Float(0.5, 1.0, step=0.1),
+ "colsample_bytree": Float(0.4, 1.0, step=0.1),
+ "reg_alpha": Float(1e-4, 100, log=True),
+ "reg_lambda": Float(1e-4, 100, log=True),
+ }
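
A note on the pattern repeated across these model files: annotating `_estimators` with `typing.ClassVar` and turning `supports_engines` into a tuple both address the shared-mutable-class-attribute pitfall behind ruff's RUF012 rule. A small illustration with a hypothetical class:

    from typing import ClassVar


    class Model:
        # Intentionally class-level and shared; ClassVar documents that and
        # keeps type checkers from treating it as an instance attribute.
        _estimators: ClassVar[dict[str, str]] = {"classification": "SVC"}

        # Immutable, so a tuple is a safer class default than a list.
        supports_engines = ("sklearn", "sklearnex")


    a, b = Model(), Model()
    a._estimators["regression"] = "SVR"
    print(b._estimators)  # the mutation is visible on every instance
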
diff --git a/atom/models/custom.py b/atom/models/custom.py
index d9082b3e4..85bd79b72 100644
--- a/atom/models/custom.py
+++ b/atom/models/custom.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
diff --git a/atom/models/ensembles.py b/atom/models/ensembles.py
index fe7b7cbba..5c8e023dc 100644
--- a/atom/models/ensembles.py
+++ b/atom/models/ensembles.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -9,7 +7,7 @@
from __future__ import annotations
-from typing import Any
+from typing import Any, ClassVar
from atom.basemodel import ClassRegModel
from atom.utils.types import Model, Predictor
@@ -34,10 +32,10 @@ class Stacking(ClassRegModel):
has_validation = None
native_multilabel = False
native_multioutput = False
- supports_engines: list[str] = []
+ supports_engines = ()
_module = "atom.ensembles"
- _estimators = {
+ _estimators: ClassVar[dict[str, str]] = {
"classification": "StackingClassifier",
"regression": "StackingRegressor",
}
@@ -64,8 +62,7 @@ def _get_est(self, params: dict[str, Any]) -> Predictor:
"""
return self._est_class(
estimators=[
- (m.name, m.export_pipeline() if m.scaler else m.estimator)
- for m in self._models
+ (m.name, m.export_pipeline() if m.scaler else m.estimator) for m in self._models
],
n_jobs=params.pop("n_jobs", self.n_jobs),
**params,
@@ -90,10 +87,13 @@ class Voting(ClassRegModel):
has_validation = None
native_multilabel = False
native_multioutput = False
- supports_engines: list[str] = []
+ supports_engines = ()
_module = "atom.ensembles"
- _estimators = {"classification": "VotingClassifier", "regression": "VotingRegressor"}
+ _estimators: ClassVar[dict[str, str]] = {
+ "classification": "VotingClassifier",
+ "regression": "VotingRegressor",
+ }
def __init__(self, models: list[Model], **kwargs):
self._models = models
@@ -126,8 +126,7 @@ def _get_est(self, params: dict[str, Any]) -> Predictor:
"""
return self._est_class(
estimators=[
- (m.name, m.export_pipeline() if m.scaler else m.estimator)
- for m in self._models
+ (m.name, m.export_pipeline() if m.scaler else m.estimator) for m in self._models
],
n_jobs=params.pop("n_jobs", self.n_jobs),
**params,
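
Both `_get_est` methods above feed `(name, estimator)` pairs to the underlying scikit-learn ensemble, exporting the full pipeline whenever the member model was trained on scaled data. A minimal standalone sketch of the same construction (the member models and data are placeholders):

    from sklearn.datasets import make_classification
    from sklearn.ensemble import StackingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC

    X, y = make_classification(random_state=0)

    # Mirrors (m.name, m.export_pipeline() if m.scaler else m.estimator).
    estimators = [
        ("lr", LogisticRegression(max_iter=1000)),
        ("svm", make_pipeline(StandardScaler(), SVC())),  # scaled member
    ]

    stack = StackingClassifier(estimators=estimators, n_jobs=1)
    print(stack.fit(X, y).score(X, y))
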
diff --git a/atom/models/ts.py b/atom/models/ts.py
index 4f7322d76..6443e4866 100644
--- a/atom/models/ts.py
+++ b/atom/models/ts.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -9,7 +7,7 @@
from __future__ import annotations
-from typing import Any
+from typing import Any, ClassVar
from optuna.distributions import BaseDistribution
from optuna.distributions import CategoricalDistribution as Cat
@@ -79,10 +77,10 @@ class ARIMA(ForecastModel):
native_multilabel = False
native_multioutput = True
has_validation = None
- supports_engines = ["sktime"]
+ supports_engines = ("sktime",)
_module = "sktime.forecasting.arima"
- _estimators = {"forecast": "ARIMA"}
+ _estimators: ClassVar[dict[str, str]] = {"forecast": "ARIMA"}
_order = ("p", "d", "q")
_sorder = ("P", "D", "Q", "S")
@@ -146,18 +144,18 @@ def _get_distributions(self) -> dict[str, BaseDistribution]:
"""
methods = ["newton", "nm", "bfgs", "lbfgs", "powell", "cg", "ncg", "basinhopping"]
- dist = dict(
- p=Int(0, 2),
- d=Int(0, 1),
- q=Int(0, 2),
- P=Int(0, 2),
- D=Int(0, 1),
- Q=Int(0, 2),
- S=Cat([0, 4, 6, 7, 12]),
- method=Cat(methods),
- maxiter=Int(50, 200, step=10),
- with_intercept=Cat([True, False]),
- )
+ dist = {
+ "p": Int(0, 2),
+ "d": Int(0, 1),
+ "q": Int(0, 2),
+ "P": Int(0, 2),
+ "D": Int(0, 1),
+ "Q": Int(0, 2),
+ "S": Cat([0, 4, 6, 7, 12]),
+ "method": Cat(methods),
+ "maxiter": Int(50, 200, step=10),
+ "with_intercept": Cat([True, False]),
+ }
# Drop order and seasonal_order params if specified by user
if "order" in self._est_params:
@@ -180,8 +178,8 @@ class AutoARIMA(ForecastModel):
is based on the commonly-used R function.
AutoARIMA works by conducting differencing tests (i.e.,
- Kwiatkowski–Phillips–Schmidt–Shin, Augmented Dickey-Fuller or
- Phillips–Perron) to determine the order of differencing, d, and
+ Kwiatkowski-Phillips-Schmidt-Shin, Augmented Dickey-Fuller or
+ Phillips-Perron) to determine the order of differencing, d, and
then fitting models within defined ranges. AutoARIMA also seeks
to identify the optimal P and Q hyperparameters after conducting
    the Canova-Hansen test to determine the optimal order of seasonal
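
For reference, a minimal usage sketch of the sktime forecaster wrapped here (it assumes pmdarima is installed; the dataset and settings are purely illustrative):

    from sktime.datasets import load_airline
    from sktime.forecasting.arima import AutoARIMA

    y = load_airline()  # monthly univariate series

    # Differencing tests select d and D; the (p, q, P, Q) ranges are searched.
    forecaster = AutoARIMA(sp=12, suppress_warnings=True)
    forecaster.fit(y)
    print(forecaster.predict(fh=[1, 2, 3]))  # three steps ahead
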
@@ -221,10 +219,10 @@ class AutoARIMA(ForecastModel):
native_multilabel = False
native_multioutput = True
has_validation = None
- supports_engines = ["sktime"]
+ supports_engines = ("sktime",)
_module = "sktime.forecasting.arima"
- _estimators = {"forecast": "AutoARIMA"}
+ _estimators: ClassVar[dict[str, str]] = {"forecast": "AutoARIMA"}
@staticmethod
def _get_distributions() -> dict[str, BaseDistribution]:
@@ -238,11 +236,11 @@ def _get_distributions() -> dict[str, BaseDistribution]:
"""
methods = ["newton", "nm", "bfgs", "lbfgs", "powell", "cg", "ncg", "basinhopping"]
- return dict(
- method=Cat(methods),
- maxiter=Int(50, 200, step=10),
- with_intercept=Cat([True, False]),
- )
+ return {
+ "method": Cat(methods),
+ "maxiter": Int(50, 200, step=10),
+ "with_intercept": Cat([True, False]),
+ }
class ExponentialSmoothing(ForecastModel):
@@ -282,10 +280,10 @@ class ExponentialSmoothing(ForecastModel):
native_multilabel = False
native_multioutput = True
has_validation = None
- supports_engines = ["sktime"]
+ supports_engines = ("sktime",)
_module = "sktime.forecasting.exp_smoothing"
- _estimators = {"forecast": "ExponentialSmoothing"}
+ _estimators: ClassVar[dict[str, str]] = {"forecast": "ExponentialSmoothing"}
@staticmethod
def _get_distributions() -> dict[str, BaseDistribution]:
@@ -299,15 +297,15 @@ def _get_distributions() -> dict[str, BaseDistribution]:
"""
methods = ["L-BFGS-B", "TNC", "SLSQP", "Powell", "trust-constr", "bh", "ls"]
- return dict(
- trend=Cat(["add", "mul", None]),
- damped_trend=Cat([True, False]),
- seasonal=Cat(["add", "mul", None]),
- sp=Cat([4, 6, 7, 12, None]),
- use_boxcox=Cat([True, False]),
- initialization_method=Cat(["estimated", "heuristic"]),
- method=Cat(methods),
- )
+ return {
+ "trend": Cat(["add", "mul", None]),
+ "damped_trend": Cat([True, False]),
+ "seasonal": Cat(["add", "mul", None]),
+ "sp": Cat([4, 6, 7, 12, None]),
+ "use_boxcox": Cat([True, False]),
+ "initialization_method": Cat(["estimated", "heuristic"]),
+ "method": Cat(methods),
+ }
class ETS(ForecastModel):
@@ -349,10 +347,10 @@ class ETS(ForecastModel):
native_multilabel = False
native_multioutput = True
has_validation = None
- supports_engines = ["sktime"]
+ supports_engines = ("sktime",)
_module = "sktime.forecasting.ets"
- _estimators = {"forecast": "AutoETS"}
+ _estimators: ClassVar[dict[str, str]] = {"forecast": "AutoETS"}
@staticmethod
def _get_distributions() -> dict[str, BaseDistribution]:
@@ -364,17 +362,17 @@ def _get_distributions() -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- return dict(
- error=Cat(["add", "mul"]),
- trend=Cat(["add", "mul", None]),
- damped_trend=Cat([True, False]),
- seasonal=Cat(["add", "mul", None]),
- sp=Cat([1, 4, 6, 7, 12]),
- initialization_method=Cat(["estimated", "heuristic"]),
- maxiter=Int(500, 2000, step=100),
- auto=Cat([True, False]),
- information_criterion=Cat(["aic", "bic", "aicc"]),
- )
+ return {
+ "error": Cat(["add", "mul"]),
+ "trend": Cat(["add", "mul", None]),
+ "damped_trend": Cat([True, False]),
+ "seasonal": Cat(["add", "mul", None]),
+ "sp": Cat([1, 4, 6, 7, 12]),
+ "initialization_method": Cat(["estimated", "heuristic"]),
+ "maxiter": Int(500, 2000, step=100),
+ "auto": Cat([True, False]),
+ "information_criterion": Cat(["aic", "bic", "aicc"]),
+ }
class NaiveForecaster(ForecastModel):
@@ -416,10 +414,10 @@ class NaiveForecaster(ForecastModel):
native_multilabel = False
native_multioutput = True
has_validation = None
- supports_engines = ["sktime"]
+ supports_engines = ("sktime",)
_module = "sktime.forecasting.naive"
- _estimators = {"forecast": "NaiveForecaster"}
+ _estimators: ClassVar[dict[str, str]] = {"forecast": "NaiveForecaster"}
@staticmethod
def _get_distributions() -> dict[str, BaseDistribution]:
@@ -431,7 +429,7 @@ def _get_distributions() -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- return dict(strategy=Cat(["last", "mean", "drift"]))
+ return {"strategy": Cat(["last", "mean", "drift"])}
class PolynomialTrend(ForecastModel):
@@ -471,10 +469,10 @@ class PolynomialTrend(ForecastModel):
native_multilabel = False
native_multioutput = True
has_validation = None
- supports_engines = ["sktime"]
+ supports_engines = ("sktime",)
_module = "sktime.forecasting.trend"
- _estimators = {"forecast": "PolynomialTrendForecaster"}
+ _estimators: ClassVar[dict[str, str]] = {"forecast": "PolynomialTrendForecaster"}
@staticmethod
def _get_distributions() -> dict[str, BaseDistribution]:
@@ -486,7 +484,7 @@ def _get_distributions() -> dict[str, BaseDistribution]:
Hyperparameter distributions.
"""
- return dict(
- degree=Int(1, 5),
- with_intercept=Cat([True, False]),
- )
+ return {
+ "degree": Int(1, 5),
+ "with_intercept": Cat([True, False]),
+ }
diff --git a/atom/nlp.py b/atom/nlp.py
index 299d3c563..28ab2e61f 100644
--- a/atom/nlp.py
+++ b/atom/nlp.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -251,7 +249,7 @@ def drop_regex(regex: str):
Regex pattern to replace.
"""
- if isinstance(X[corpus].iat[0], str):
+ if isinstance(X[corpus].iloc[0], str):
X[corpus] = X[corpus].str.replace(regex, "", regex=True)
else:
X[corpus] = X[corpus].apply(lambda x: [re.sub(regex, "", w) for w in x])
@@ -261,7 +259,7 @@ def drop_regex(regex: str):
self._log("Cleaning the corpus...", 1)
if self.decode:
- if isinstance(X[corpus].iat[0], str):
+ if isinstance(X[corpus].iloc[0], str):
X[corpus] = X[corpus].apply(lambda x: to_ascii(x))
else:
X[corpus] = X[corpus].apply(lambda doc: [to_ascii(str(w)) for w in doc])
@@ -269,7 +267,7 @@ def drop_regex(regex: str):
if self.lower_case:
self._log(" --> Converting text to lower case.", 2)
- if isinstance(X[corpus].iat[0], str):
+ if isinstance(X[corpus].iloc[0], str):
X[corpus] = X[corpus].str.lower()
else:
X[corpus] = X[corpus].apply(lambda doc: [str(w).lower() for w in doc])
@@ -312,14 +310,14 @@ def drop_regex(regex: str):
if self.drop_punctuation:
self._log(" --> Dropping punctuation from the text.", 2)
trans_table = str.maketrans("", "", punctuation) # Translation table
- if isinstance(X[corpus].iat[0], str):
+ if isinstance(X[corpus].iloc[0], str):
func = lambda doc: doc.translate(trans_table)
else:
func = lambda doc: [str(w).translate(trans_table) for w in doc]
X[corpus] = X[corpus].apply(func)
# Drop empty tokens from every document
- if not isinstance(X[corpus].iat[0], str):
+ if not isinstance(X[corpus].iloc[0], str):
X[corpus] = X[corpus].apply(lambda doc: [w for w in doc if w])
return X
@@ -505,7 +503,7 @@ def pos(tag: str) -> wordnet.ADJ | wordnet.ADV | wordnet.VERB | wordnet.NOUN:
self._log("Normalizing the corpus...", 1)
# If the corpus is not tokenized, separate by space
- if isinstance(X[corpus].iat[0], str):
+ if isinstance(X[corpus].iloc[0], str):
X[corpus] = X[corpus].apply(lambda row: row.split())
stopwords = set()
@@ -514,7 +512,7 @@ def pos(tag: str) -> wordnet.ADJ | wordnet.ADV | wordnet.VERB | wordnet.NOUN:
self.stopwords = "english"
# Get stopwords from the NLTK library
- check_nltk_module("corpora/stopwords", self.verbose < 2)
+ check_nltk_module("corpora/stopwords", quiet=self.verbose < 2)
stopwords = set(nltk.corpus.stopwords.words(self.stopwords.lower()))
    # Join predefined with custom stopwords
@@ -536,9 +534,9 @@ def pos(tag: str) -> wordnet.ADJ | wordnet.ADV | wordnet.VERB | wordnet.NOUN:
if self.lemmatize:
self._log(" --> Applying lemmatization.", 2)
- check_nltk_module("corpora/wordnet", self.verbose < 2)
- check_nltk_module("taggers/averaged_perceptron_tagger", self.verbose < 2)
- check_nltk_module("corpora/omw-1.4", self.verbose < 2)
+ check_nltk_module("corpora/wordnet", quiet=self.verbose < 2)
+ check_nltk_module("taggers/averaged_perceptron_tagger", quiet=self.verbose < 2)
+ check_nltk_module("corpora/omw-1.4", quiet=self.verbose < 2)
wnl = WordNetLemmatizer()
f = lambda row: [wnl.lemmatize(w, pos(tag)) for w, tag in nltk.pos_tag(row)]
@@ -734,8 +732,8 @@ def replace_ngrams(row: list[str], ngram: tuple[str]) -> list[str]:
self._log("Tokenizing the corpus...", 1)
- if isinstance(X[corpus].iat[0], str):
- check_nltk_module("tokenizers/punkt", self.verbose < 2)
+ if isinstance(X[corpus].iloc[0], str):
+ check_nltk_module("tokenizers/punkt", quiet=self.verbose < 2)
X[corpus] = X[corpus].apply(lambda row: nltk.word_tokenize(row))
ngrams = {
@@ -916,7 +914,7 @@ def __init__(
*,
return_sparse: Bool = True,
device: str = "cpu",
- engine: Engine = {"data": "numpy", "estimator": "sklearn"},
+ engine: Engine | None = None,
verbose: Verbose = 0,
logger: str | Path | Logger | None = None,
**kwargs,
@@ -949,14 +947,14 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
corpus = get_corpus(X)
    # Convert a sequence of tokens to space-separated string
- if not isinstance(X[corpus].iat[0], str):
+ if not isinstance(X[corpus].iloc[0], str):
X[corpus] = X[corpus].apply(lambda row: " ".join(row))
- strategies = dict(
- bow="CountVectorizer",
- tfidf="TfidfVectorizer",
- hashing="HashingVectorizer",
- )
+ strategies = {
+ "bow": "CountVectorizer",
+ "tfidf": "TfidfVectorizer",
+ "hashing": "HashingVectorizer",
+ }
estimator = self._get_est_class(
name=strategies[self.strategy],
@@ -1001,7 +999,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
self._log("Vectorizing the corpus...", 1)
# Convert a sequence of tokens to space-separated string
- if not isinstance(X[corpus].iat[0], str):
+ if not isinstance(X[corpus].iloc[0], str):
X[corpus] = X[corpus].apply(lambda row: " ".join(row))
matrix = self._estimator.transform(X[corpus])
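
The `quiet=self.verbose < 2` changes above spell out the boolean flag at every call site, which is what ruff's FBT003 rule asks for. A before/after with a hypothetical stand-in for ATOM's helper:

    def check_nltk_module(module: str, *, quiet: bool = False) -> None:
        """Hypothetical stand-in for ATOM's NLTK download helper."""
        if not quiet:
            print(f"Checking {module}...")


    # check_nltk_module("corpora/stopwords", True)  # what does True mean here?
    check_nltk_module("corpora/stopwords", quiet=True)  # intent is explicit
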
diff --git a/atom/pipeline.py b/atom/pipeline.py
index 197b0b257..4e3179e5c 100644
--- a/atom/pipeline.py
+++ b/atom/pipeline.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -159,7 +157,7 @@ def __getattr__(self, item: str):
try:
return getattr(self._final_estimator, item)
except (AttributeError, IndexError):
- raise AttributeError(f"'Pipeline' object has no attribute '{item}'.")
+ raise AttributeError(f"'Pipeline' object has no attribute '{item}'.") from None
def __sklearn_is_fitted__(self):
"""Whether the pipeline has been fitted."""
@@ -205,7 +203,8 @@ def _final_estimator(self) -> Literal["passthrough"] | Estimator | None:
def _can_transform(self) -> bool:
"""Check if the pipeline can use the transform method."""
return (
- self._final_estimator is None or self._final_estimator == "passthrough"
+ self._final_estimator is None
+ or self._final_estimator == "passthrough"
or hasattr(self._final_estimator, "transform")
)
@@ -218,6 +217,7 @@ def _can_inverse_transform(self) -> bool:
def _iter(
self,
+ *,
with_final: Bool = True,
filter_passthrough: Bool = True,
filter_train_only: Bool = True,
@@ -251,7 +251,7 @@ def _iter(
Transformer or predictor instance.
"""
- it = super()._iter(with_final, filter_passthrough)
+ it = super()._iter(with_final=with_final, filter_passthrough=filter_passthrough)
if filter_train_only:
return (x for x in it if not getattr(x[-1], "_train_only", False))
else:
@@ -294,9 +294,11 @@ def _fit(
self.steps: list[tuple[str, Estimator]] = list(self.steps)
self._validate_steps()
- for (step_idx, name, transformer) in self._iter(False, False, False):
+ for step, name, transformer in self._iter(
+ with_final=False, filter_passthrough=False, filter_train_only=False
+ ):
if transformer is None or transformer == "passthrough":
- with _print_elapsed_time("Pipeline", self._log_message(step_idx)):
+ with _print_elapsed_time("Pipeline", self._log_message(step)):
continue
# Don't clone when caching is disabled to preserve backward compatibility
@@ -316,13 +318,13 @@ def _fit(
transformer=cloned,
X=X,
y=y,
- message=self._log_message(step_idx),
+ message=self._log_message(step),
**fit_params_steps.get(name, {}),
)
# Replace the estimator of the step with the fitted
# estimator (necessary when loading from cache)
- self.steps[step_idx] = (name, fitted_transformer)
+ self.steps[step] = (name, fitted_transformer)
return X, y
@@ -546,7 +548,7 @@ def predict(self, X: XSelector, **predict_params) -> np.ndarray:
Predicted classes with shape=(n_samples,).
"""
- for _, name, transformer in self._iter(with_final=False):
+ for _, _, transformer in self._iter(with_final=False):
with adjust_verbosity(transformer, self.verbose):
X, _ = self._mem_transform(transformer, X)
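
The bare `*` added to `_iter` makes its three flags keyword-only, which is why the call sites in `_fit` and `predict` now name them. A reduced sketch of the mechanics:

    def iter_steps(steps: list[str], *, with_final: bool = True) -> list[str]:
        """Keyword-only flag, as in the new Pipeline._iter signature."""
        return steps if with_final else steps[:-1]


    print(iter_steps(["scaler", "svm"], with_final=False))  # ['scaler']
    # iter_steps(["scaler", "svm"], False) raises TypeError because the
    # flag can no longer be passed positionally.
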
diff --git a/atom/plots/__init__.py b/atom/plots/__init__.py
index 6ba274fc2..63e510477 100644
--- a/atom/plots/__init__.py
+++ b/atom/plots/__init__.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
diff --git a/atom/plots/basefigure.py b/atom/plots/basefigure.py
index f4bbc7557..f83b3fa85 100644
--- a/atom/plots/basefigure.py
+++ b/atom/plots/basefigure.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -10,7 +8,7 @@
from __future__ import annotations
from itertools import cycle
-from typing import Any, Literal
+from typing import Any, ClassVar, Literal
import matplotlib.pyplot as plt
import plotly.express as px
@@ -59,14 +57,15 @@ class BaseFigure:
"""
- _marker = ["circle", "x", "diamond", "pentagon", "star", "hexagon"]
- _dash = ["solid", "dashdot", "dash", "dot", "longdash", "longdashdot"]
- _shape = ["", "/", "x", "\\", "-", "|", "+", "."]
+ _marker: ClassVar[list[str]] = ["circle", "x", "diamond", "pentagon", "star"]
+ _dash: ClassVar[list[str]] = ["solid", "dashdot", "dash", "dot", "longdash"]
+ _shape: ClassVar[list[str]] = ["", "/", "x", "\\", "-", "|", "+", "."]
def __init__(
self,
rows: IntLargerZero = 1,
cols: IntLargerZero = 1,
+ *,
horizontal_spacing: FloatZeroToOneExc = 0.05,
vertical_spacing: FloatZeroToOneExc = 0.07,
palette: str | Sequence[str] = "Prism",
@@ -98,7 +97,7 @@ def __init__(
self.figure, _ = plt.subplots()
self.groups: list[str] = []
- self.style: Style = dict(palette={}, marker={}, dash={}, shape={})
+ self.style: Style = {"palette": {}, "marker": {}, "dash": {}, "shape": {}}
self.marker = cycle(self._marker)
self.dash = cycle(self._dash)
self.shape = cycle(self._shape)
@@ -257,23 +256,25 @@ def get_axes(
# Update the figure with the new axes
self.figure.update_layout(
{
- f"xaxis{self.axes}": dict(
- domain=(x_pos, rnd(x_pos + ax_size)), anchor=f"y{self.axes}"
- ),
- f"yaxis{self.axes}": dict(
- domain=(y_pos, rnd(y_pos + ay_size)), anchor=f"x{self.axes}"
- ),
+ f"xaxis{self.axes}": {
+ "domain": (x_pos, rnd(x_pos + ax_size)),
+ "anchor": f"y{self.axes}",
+ },
+ f"yaxis{self.axes}": {
+ "domain": (y_pos, rnd(y_pos + ay_size)),
+ "anchor": f"x{self.axes}",
+ },
}
)
# Place a colorbar right of the axes
if coloraxis:
if title := coloraxis.pop("title", None):
- coloraxis["colorbar_title"] = dict(
- text=title,
- side="right",
- font_size=coloraxis.pop("font_size"),
- )
+ coloraxis["colorbar_title"] = {
+ "text": title,
+ "side": "right",
+ "font_size": coloraxis.pop("font_size"),
+ }
coloraxis["colorbar_x"] = rnd(x_pos + ax_size) + ax_size / 40
coloraxis["colorbar_xanchor"] = "left"
@@ -281,9 +282,7 @@ def get_axes(
coloraxis["colorbar_yanchor"] = "middle"
coloraxis["colorbar_len"] = ay_size * 0.9
coloraxis["colorbar_thickness"] = ax_size * 30 # Default width in pixels
- self.figure.update_layout(
- {f"coloraxis{coloraxis.pop('axes', self.axes)}": coloraxis}
- )
+ self.figure.update_layout({f"coloraxis{coloraxis.pop('axes', self.axes)}": coloraxis})
xaxis = f"x{self.axes if self.axes > 1 else ''}"
yaxis = f"y{self.axes if self.axes > 1 else ''}"
diff --git a/atom/plots/baseplot.py b/atom/plots/baseplot.py
index 44656aa7f..8e43f4315 100644
--- a/atom/plots/baseplot.py
+++ b/atom/plots/baseplot.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -13,7 +11,7 @@
from collections.abc import Iterator
from contextlib import contextmanager
from pathlib import Path
-from typing import Any, Literal, overload
+from typing import Any, ClassVar, Literal, overload
import matplotlib.pyplot as plt
import plotly.express as px
@@ -44,8 +42,8 @@ class BasePlot(BaseTransformer, BaseTracker, metaclass=ABCMeta):
"""
_fig = BaseFigure()
- _custom_layout: dict[str, Any] = {}
- _custom_traces: dict[str, Any] = {}
+ _custom_layout: ClassVar[dict[str, Any]] = {}
+ _custom_traces: ClassVar[dict[str, Any]] = {}
_aesthetics = Aesthetics(
palette=list(PALETTE),
title_fontsize=24,
@@ -193,7 +191,7 @@ def _get_show(show: IntLargerZero | None, maximum: IntLargerZero = 200) -> Int:
@staticmethod
def _get_set(
- rows: str | Sequence[str] | dict[str, RowSelector]
+ rows: str | Sequence[str] | dict[str, RowSelector],
) -> Iterator[tuple[str, RowSelector]]:
"""Get the row selection.
@@ -224,7 +222,7 @@ def _get_set(
yield from rows_c.items()
- def _get_metric(self, metric: MetricSelector, max_one: Bool = False) -> list[str]:
+ def _get_metric(self, metric: MetricSelector, *, max_one: Bool = False) -> list[str]:
"""Check and return the provided metric index.
Parameters
@@ -278,6 +276,7 @@ def _get_metric(self, metric: MetricSelector, max_one: Bool = False) -> list[str
def _get_plot_models(
self,
models: ModelsSelector,
+ *,
max_one: Bool = False,
ensembles: Bool = True,
check_fitted: Bool = True,
@@ -321,32 +320,39 @@ def _get_plot_models(
return models_c
else:
- return [self] # type: ignore
+ return [self] # type: ignore[list-item]
@overload
def _get_figure(
self,
backend: Literal["plotly"] = ...,
+ *,
create_figure: Literal[True] = ...,
- ) -> go.Figure: ...
+ ) -> go.Figure:
+ ...
@overload
def _get_figure(
self,
backend: Literal["matplotlib"],
+ *,
create_figure: Literal[True] = ...,
- ) -> plt.Figure: ...
+ ) -> plt.Figure:
+ ...
@overload
def _get_figure(
self,
backend: PlotBackend,
+ *,
create_figure: Literal[False],
- ) -> None: ...
+ ) -> None:
+ ...
def _get_figure(
self,
backend: PlotBackend = "plotly",
+ *,
create_figure: Bool = True,
) -> go.Figure | plt.Figure | None:
"""Return an existing figure if in canvas, else a new figure.
@@ -415,26 +421,26 @@ def _draw_line(
"""
return go.Scatter(
- line=dict(
- width=self.line_width,
- color=BasePlot._fig.get_elem(parent),
- dash=BasePlot._fig.get_elem(child, "dash"),
- ),
- marker=dict(
- symbol=BasePlot._fig.get_elem(child, "marker"),
- size=self.marker_size,
- color=BasePlot._fig.get_elem(parent),
- line=dict(width=1, color="rgba(255, 255, 255, 0.9)"),
- ),
+ line={
+ "width": self.line_width,
+ "color": BasePlot._fig.get_elem(parent),
+ "dash": BasePlot._fig.get_elem(child, "dash"),
+ },
+ marker={
+ "symbol": BasePlot._fig.get_elem(child, "marker"),
+ "size": self.marker_size,
+ "color": BasePlot._fig.get_elem(parent),
+ "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"},
+ },
hovertemplate=kwargs.pop(
"hovertemplate",
- f"(%{{x}}, %{{y}}){parent}{f' - {child}' if child else ''}"
+ f"(%{{x}}, %{{y}}){parent}{f' - {child}' if child else ''}",
),
name=kwargs.pop("name", child or parent),
legendgroup=kwargs.pop("legendgroup", parent),
legendgrouptitle=kwargs.pop(
"legendgrouptitle",
- dict(text=parent, font_size=self.label_fontsize) if child else None,
+ {"text": parent, "font_size": self.label_fontsize} if child else None,
),
showlegend=BasePlot._fig.showlegend(f"{parent}-{child}", legend),
**kwargs,
@@ -469,7 +475,7 @@ def _draw_straight_line(y: Scalar | str, xaxis: str, yaxis: str):
y1=1 if y == "diagonal" else y,
xref=f"{xaxis} domain",
yref=f"{yaxis} domain" if y == "diagonal" else yaxis,
- line=dict(width=1, color="black", dash="dash"),
+ line={"width": 1, "color": "black", "dash": "dash"},
opacity=0.6,
layer="below",
)
@@ -526,12 +532,14 @@ def _plot(
if isinstance(ax, tuple):
fig.update_layout(
{
- f"{ax[0]}_title": dict(
- text=kwargs.get("xlabel"), font_size=self.label_fontsize
- ),
- f"{ax[1]}_title": dict(
- text=kwargs.get("ylabel"), font_size=self.label_fontsize
- ),
+ f"{ax[0]}_title": {
+ "text": kwargs.get("xlabel"),
+ "font_size": self.label_fontsize,
+ },
+ f"{ax[1]}_title": {
+ "text": kwargs.get("ylabel"),
+ "font_size": self.label_fontsize,
+ },
f"{ax[0]}_range": kwargs.get("xlim"),
f"{ax[1]}_range": kwargs.get("ylim"),
f"{ax[0]}_automargin": True,
@@ -557,50 +565,52 @@ def _plot(
else:
title = {"text": title, **default_title}
- fig.update_layout(dict(annotations=fig.layout.annotations + (title,)))
+ fig.update_layout({"annotations": (*fig.layout.annotations, title)})
if not BasePlot._fig.is_canvas and kwargs.get("plotname"):
- default_title = dict(
- x=0.5,
- y=1,
- pad=dict(t=15, b=15),
- xanchor="center",
- yanchor="top",
- xref="paper",
- font_size=self.title_fontsize,
- )
+ default_title = {
+ "x": 0.5,
+ "y": 1,
+ "pad": {"t": 15, "b": 15},
+ "xanchor": "center",
+ "yanchor": "top",
+ "xref": "paper",
+ "font_size": self.title_fontsize,
+ }
if isinstance(title := kwargs.get("title"), dict):
title = default_title | title
else:
title = {"text": title, **default_title}
- default_legend = dict(
- traceorder="grouped",
- groupclick=kwargs.get("groupclick", "toggleitem"),
- font_size=self.label_fontsize,
- bgcolor="rgba(255, 255, 255, 0.5)",
- )
+ default_legend = {
+ "traceorder": "grouped",
+ "groupclick": kwargs.get("groupclick", "toggleitem"),
+ "font_size": self.label_fontsize,
+ "bgcolor": "rgba(255, 255, 255, 0.5)",
+ }
if isinstance(legend := kwargs.get("legend"), str):
position = {}
if legend == "upper left":
- position = dict(x=0.01, y=0.99, xanchor="left", yanchor="top")
+ position = {"x": 0.01, "y": 0.99, "xanchor": "left", "yanchor": "top"}
elif legend == "lower left":
- position = dict(x=0.01, y=0.01, xanchor="left", yanchor="bottom")
+ position = {"x": 0.01, "y": 0.01, "xanchor": "left", "yanchor": "bottom"}
elif legend == "upper right":
- position = dict(x=0.99, y=0.99, xanchor="right", yanchor="top")
+ position = {"x": 0.99, "y": 0.99, "xanchor": "right", "yanchor": "top"}
elif legend == "lower right":
- position = dict(x=0.99, y=0.01, xanchor="right", yanchor="bottom")
+ position = {"x": 0.99, "y": 0.01, "xanchor": "right", "yanchor": "bottom"}
elif legend == "upper center":
- position = dict(x=0.5, y=0.99, xanchor="center", yanchor="top")
+ position = {"x": 0.5, "y": 0.99, "xanchor": "center", "yanchor": "top"}
elif legend == "lower center":
- position = dict(x=0.5, y=0.01, xanchor="center", yanchor="bottom")
+ position = {"x": 0.5, "y": 0.01, "xanchor": "center", "yanchor": "bottom"}
elif legend == "center left":
- position = dict(x=0.01, y=0.5, xanchor="left", yanchor="middle")
+ position = {"x": 0.01, "y": 0.5, "xanchor": "left", "yanchor": "middle"}
elif legend == "center right":
- position = dict(x=0.99, y=0.5, xanchor="right", yanchor="middle")
+ position = {"x": 0.99, "y": 0.5, "xanchor": "right", "yanchor": "middle"}
elif legend == "center":
- position = dict(x=0.5, y=0.5, xanchor="center", yanchor="middle")
+ position = {"x": 0.5, "y": 0.5, "xanchor": "center", "yanchor": "middle"}
+
legend = default_legend | position
+
elif isinstance(legend, dict):
legend = default_legend | legend
@@ -611,9 +621,9 @@ def _plot(
title=title,
legend=legend,
showlegend=bool(kwargs.get("legend")),
- hoverlabel=dict(font_size=self.label_fontsize),
+ hoverlabel={"font_size": self.label_fontsize},
font_size=self.tick_fontsize,
- margin=dict(l=50, b=50, r=0, t=25 + space1 + space2, pad=0),
+ margin={"l": 50, "b": 50, "r": 0, "t": 25 + space1 + space2, "pad": 0},
width=kwargs["figsize"][0],
height=kwargs["figsize"][1],
)
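
The legend handling above layers user input over `default_legend` with the `|` merge operator (PEP 584, Python 3.9+), where keys from the right-hand operand win. In isolation:

    default_legend = {"traceorder": "grouped", "font_size": 16}
    position = {"x": 0.01, "y": 0.99, "xanchor": "left", "yanchor": "top"}

    legend = default_legend | position  # defaults kept, position added
    print(legend["traceorder"], legend["x"])  # grouped 0.01

    override = default_legend | {"font_size": 24}  # right operand wins
    print(override["font_size"])  # 24
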
diff --git a/atom/plots/dataplot.py b/atom/plots/dataplot.py
index 03070e2fc..e221e0eaf 100644
--- a/atom/plots/dataplot.py
+++ b/atom/plots/dataplot.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -47,14 +45,14 @@ class DataPlot(BasePlot, metaclass=ABCMeta):
@crash
def plot_components(
- self,
- show: IntLargerZero | None = None,
- *,
- title: str | dict[str, Any] | None = None,
- legend: Legend | dict[str, Any] | None = "lower right",
- figsize: tuple[IntLargerZero, IntLargerZero] | None = None,
- filename: str | Path | None = None,
- display: Bool | None = True,
+ self,
+ show: IntLargerZero | None = None,
+ *,
+ title: str | dict[str, Any] | None = None,
+ legend: Legend | dict[str, Any] | None = "lower right",
+ figsize: tuple[IntLargerZero, IntLargerZero] | None = None,
+ filename: str | Path | None = None,
+ display: Bool | None = True,
) -> go.Figure | None:
"""Plot the explained variance ratio per component.
@@ -141,12 +139,12 @@ def plot_components(
fig.add_trace(
go.Bar(
x=variance,
- y=[f"pca{str(i)}" for i in range(len(variance))],
+ y=[f"pca{i}" for i in range(len(variance))],
orientation="h",
- marker=dict(
- color=[f"rgba({color[4:-1]}, {o})" for o in opacity],
- line=dict(width=2, color=color),
- ),
+ marker={
+ "color": [f"rgba({color[4:-1]}, {o})" for o in opacity],
+ "line": {"width": 2, "color": color},
+ },
hovertemplate="%{x}",
name=f"Variance retained: {variance[:self.pca_._comps].sum():.3f}",
legendgroup="components",
@@ -156,7 +154,7 @@ def plot_components(
)
)
- fig.update_layout({f"yaxis{yaxis[1:]}": dict(categoryorder="total ascending")})
+ fig.update_layout({f"yaxis{yaxis[1:]}": {"categoryorder": "total ascending"}})
return self._plot(
ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
@@ -257,13 +255,13 @@ def plot_correlation(
fig = self._get_figure()
xaxis, yaxis = BasePlot._fig.get_axes(
x=(0, 0.87),
- coloraxis=dict(
- colorscale="rdbu_r",
- cmin=-1,
- cmax=1,
- title=f"{method} correlation",
- font_size=self.label_fontsize,
- ),
+ coloraxis={
+ "colorscale": "rdbu_r",
+ "cmin": -1,
+ "cmax": 1,
+ "title": f"{method} correlation",
+ "font_size": self.label_fontsize,
+ },
)
fig.add_trace(
@@ -419,10 +417,10 @@ def plot_distribution(
x=series,
y=series.index,
orientation="h",
- marker=dict(
- color=f"rgba({color[4:-1]}, 0.2)",
- line=dict(width=2, color=color),
- ),
+ marker={
+ "color": f"rgba({color[4:-1]}, 0.2)",
+ "line": {"width": 2, "color": color},
+ },
hovertemplate="%{x}",
name=f"{columns_c[0]}: {len(series)} classes",
showlegend=BasePlot._fig.showlegend("dist", legend),
@@ -449,14 +447,14 @@ def plot_distribution(
go.Histogram(
x=self.branch.dataset[col],
histnorm="probability density",
- marker=dict(
- color=f"rgba({BasePlot._fig.get_elem(col)[4:-1]}, 0.2)",
- line=dict(width=2, color=BasePlot._fig.get_elem(col)),
- ),
+ marker={
+ "color": f"rgba({BasePlot._fig.get_elem(col)[4:-1]}, 0.2)",
+ "line": {"width": 2, "color": BasePlot._fig.get_elem(col)},
+ },
nbinsx=40,
name="dist",
legendgroup=col,
- legendgrouptitle=dict(text=col, font_size=self.label_fontsize),
+ legendgrouptitle={"text": col, "font_size": self.label_fontsize},
showlegend=BasePlot._fig.showlegend(f"{col}-dist", legend),
xaxis=xaxis,
yaxis=yaxis,
@@ -494,7 +492,7 @@ def plot_distribution(
)
)
- fig.update_layout(dict(barmode="overlay"))
+ fig.update_layout({"barmode": "overlay"})
return self._plot(
ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
@@ -621,7 +619,7 @@ def get_text(column: Series) -> Series:
Corpus of tokens.
"""
- if isinstance(column.iat[0], str):
+ if isinstance(column.iloc[0], str):
return column.apply(lambda row: row.split())
else:
return column
@@ -654,13 +652,13 @@ def get_text(column: Series) -> Series:
fig.add_trace(
go.Bar(
- x=(data := series[-self._get_show(show, len(series)):]),
+ x=(data := series[-self._get_show(show, len(series)) :]),
y=data.index,
orientation="h",
- marker=dict(
- color=f"rgba({BasePlot._fig.get_elem(ngram_c)[4:-1]}, 0.2)",
- line=dict(width=2, color=BasePlot._fig.get_elem(ngram_c)),
- ),
+ marker={
+ "color": f"rgba({BasePlot._fig.get_elem(ngram_c)[4:-1]}, 0.2)",
+ "line": {"width": 2, "color": BasePlot._fig.get_elem(ngram_c)},
+ },
hovertemplate="%{x}",
name=f"Total {ngram_c}: {len(series)}",
legendgroup=ngram_c,
@@ -683,13 +681,13 @@ def get_text(column: Series) -> Series:
@crash
def plot_pca(
- self,
- *,
- title: str | dict[str, Any] | None = None,
- legend: Legend | dict[str, Any] | None = None,
- figsize: tuple[IntLargerZero, IntLargerZero] = (900, 600),
- filename: str | Path | None = None,
- display: Bool | None = True,
+ self,
+ *,
+ title: str | dict[str, Any] | None = None,
+ legend: Legend | dict[str, Any] | None = None,
+ figsize: tuple[IntLargerZero, IntLargerZero] = (900, 600),
+ filename: str | Path | None = None,
+ display: Bool | None = True,
) -> go.Figure | None:
"""Plot the explained variance ratio vs number of components.
@@ -768,13 +766,13 @@ def plot_pca(
x=tuple(range(1, self.pca_.n_features_in_ + 1)),
y=np.cumsum(self.pca_.explained_variance_ratio_),
mode="lines+markers",
- line=dict(width=self.line_width, color=BasePlot._fig.get_elem("pca")),
- marker=dict(
- symbol=symbols,
- size=sizes,
- line=dict(width=1, color="rgba(255, 255, 255, 0.9)"),
- opacity=1,
- ),
+ line={"width": self.line_width, "color": BasePlot._fig.get_elem("pca")},
+ marker={
+ "symbol": symbols,
+ "size": sizes,
+ "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"},
+ "opacity": 1,
+ },
hovertemplate="%{y}",
showlegend=False,
xaxis=xaxis,
@@ -1020,22 +1018,22 @@ def plot_relationships(
xaxis, yaxis = BasePlot._fig.get_axes(
x=(x_pos, rnd(x_pos + size)),
y=(y_pos, rnd(y_pos + size)),
- coloraxis=dict(
- colorscale=PALETTE.get(color, "Blues"),
- cmin=0,
- cmax=len(self.branch.dataset),
- showscale=False,
- )
+ coloraxis={
+ "colorscale": PALETTE.get(color, "Blues"),
+ "cmin": 0,
+ "cmax": len(self.branch.dataset),
+ "showscale": False,
+ },
)
if x == y:
fig.add_trace(
go.Histogram(
x=self.branch.dataset[columns_c[x]],
- marker=dict(
- color=f"rgba({color[4:-1]}, 0.2)",
- line=dict(width=2, color=color),
- ),
+ marker={
+ "color": f"rgba({color[4:-1]}, 0.2)",
+ "line": {"width": 2, "color": color},
+ },
name=columns_c[x],
showlegend=False,
xaxis=xaxis,
@@ -1048,7 +1046,7 @@ def plot_relationships(
x=sample(columns_c[y]),
y=sample(columns_c[x]),
mode="markers",
- marker=dict(color=color),
+ marker={"color": color},
hovertemplate="(%{x}, %{y})",
showlegend=False,
xaxis=xaxis,
@@ -1188,13 +1186,13 @@ def plot_rfecv(
x=list(x),
y=mean,
mode="lines+markers",
- line=dict(width=self.line_width, color=BasePlot._fig.get_elem("rfecv")),
- marker=dict(
- symbol=symbols,
- size=sizes,
- line=dict(width=1, color="rgba(255, 255, 255, 0.9)"),
- opacity=1,
- ),
+ line={"width": self.line_width, "color": BasePlot._fig.get_elem("rfecv")},
+ marker={
+ "symbol": symbols,
+ "size": sizes,
+ "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"},
+ "opacity": 1,
+ },
name=ylabel,
legendgroup="rfecv",
showlegend=BasePlot._fig.showlegend("rfecv", legend),
@@ -1210,7 +1208,7 @@ def plot_rfecv(
x=tuple(x),
y=mean + std,
mode="lines",
- line=dict(width=1, color=BasePlot._fig.get_elem("rfecv")),
+ line={"width": 1, "color": BasePlot._fig.get_elem("rfecv")},
hovertemplate="%{y}upper bound",
legendgroup="rfecv",
showlegend=False,
@@ -1221,7 +1219,7 @@ def plot_rfecv(
x=tuple(x),
y=mean - std,
mode="lines",
- line=dict(width=1, color=BasePlot._fig.get_elem("rfecv")),
+ line={"width": 1, "color": BasePlot._fig.get_elem("rfecv")},
fill="tonexty",
fillcolor=f"rgba{BasePlot._fig.get_elem('rfecv')[3:-1]}, 0.2)",
hovertemplate="%{y}lower bound",
@@ -1334,7 +1332,7 @@ def plot_wordcloud(
def get_text(column):
"""Get the complete corpus as one long string."""
- if isinstance(column.iat[0], str):
+ if isinstance(column.iloc[0], str):
return " ".join(column)
else:
return " ".join([" ".join(row) for row in column])
diff --git a/atom/plots/hyperparametertuningplot.py b/atom/plots/hyperparametertuningplot.py
index d8e4163d2..832397dfb 100644
--- a/atom/plots/hyperparametertuningplot.py
+++ b/atom/plots/hyperparametertuningplot.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -69,8 +67,7 @@ def _check_hyperparams(models: list[Model]) -> list[Model]:
"""
if not (models_c := [m for m in models if m._study is not None]):
raise PermissionError(
- "This plot method is only available for "
- "models that ran hyperparameter tuning."
+ "This plot method is only available for models that ran hyperparameter tuning."
)
return models_c
@@ -371,7 +368,7 @@ def plot_hyperparameter_importance(
models_c = self._get_plot_models(models, ensembles=False)
models_c = self._check_hyperparams(models_c)
metric_c = self._get_metric(metric, max_one=True)[0]
- params_c = len(set([k for m in models_c for k in m._ht["distributions"]]))
+ params_c = len({k for m in models_c for k in m._ht["distributions"]})
show_c = self._get_show(show, params_c)
fig = self._get_figure()
@@ -385,10 +382,10 @@ def plot_hyperparameter_importance(
x=np.array(list(importances.values())) / sum(importances.values()),
y=list(importances),
orientation="h",
- marker=dict(
- color=f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
- line=dict(width=2, color=BasePlot._fig.get_elem(m.name)),
- ),
+ marker={
+ "color": f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
+ "line": {"width": 2, "color": BasePlot._fig.get_elem(m.name)},
+ },
hovertemplate="%{x}",
name=m.name,
legendgroup=m.name,
@@ -400,7 +397,7 @@ def plot_hyperparameter_importance(
fig.update_layout(
{
- f"yaxis{yaxis[1:]}": dict(categoryorder="total ascending"),
+ f"yaxis{yaxis[1:]}": {"categoryorder": "total ascending"},
"bargroupgap": 0.05,
}
)
@@ -526,15 +523,13 @@ def plot_hyperparameters(
xaxis, yaxis = BasePlot._fig.get_axes(
x=(x_pos, rnd(x_pos + size)),
y=(y_pos, rnd(y_pos + size)),
- coloraxis=dict(
- axes="99",
- colorscale=PALETTE.get(
- BasePlot._fig.get_elem(model.name), "Blues"
- ),
- cmin=model.trials[metric_c].min(),
- cmax=model.trials[metric_c].max(),
- showscale=False,
- )
+ coloraxis={
+ "axes": "99",
+ "colorscale": PALETTE.get(BasePlot._fig.get_elem(model.name), "Blues"),
+ "cmin": model.trials[metric_c].min(),
+ "cmax": model.trials[metric_c].max(),
+ "showscale": False,
+ },
)
fig.add_trace(
@@ -542,13 +537,13 @@ def plot_hyperparameters(
x=model.trials[params_c[y]],
y=model.trials[params_c[x + 1]],
mode="markers",
- marker=dict(
- size=self.marker_size,
- color=BasePlot._fig.get_elem(model.name),
- line=dict(width=1, color="rgba(255, 255, 255, 0.9)"),
- ),
+ marker={
+ "size": self.marker_size,
+ "color": BasePlot._fig.get_elem(model.name),
+ "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"},
+ },
customdata=list(
- zip(list(model.trials.index), model.trials[metric_c])
+ zip(model.trials.index, model.trials[metric_c], strict=True)
),
hovertemplate=(
f"{params_c[y]}:%{{x}}
"
@@ -567,10 +562,10 @@ def plot_hyperparameters(
x=model.trials[params_c[y]],
y=model.trials[params_c[x + 1]],
z=model.trials[metric_c],
- contours=dict(
- showlabels=True,
- labelfont=dict(size=self.tick_fontsize, color="white")
- ),
+ contours={
+ "showlabels": True,
+ "labelfont": {"size": self.tick_fontsize, "color": "white"},
+ },
coloraxis="coloraxis99",
hoverinfo="skip",
showlegend=False,
@@ -719,7 +714,7 @@ def sort_mixed_types(values: list[str]) -> list[str]:
for elem in values:
try:
numbers.append(it(float(elem)))
- except (TypeError, ValueError):
+ except (TypeError, ValueError): # noqa: PERF203
categorical.append(str(elem))
return list(map(str, sorted(numbers))) + sorted(categorical)
@@ -739,7 +734,7 @@ def sort_mixed_types(values: list[str]) -> list[str]:
)
# Clean and sort dimensions for nicer view
- dims = [dims[0]] + sorted(dims[1:], key=lambda x: params_c.index(x["label"]))
+ dims = [dims[0], *sorted(dims[1:], key=lambda x: params_c.index(x["label"]))]
for d in dims:
if "ticktext" in d:
# Skip processing for logarithmic params
@@ -756,25 +751,25 @@ def sort_mixed_types(values: list[str]) -> list[str]:
fig = self._get_figure()
xaxis, yaxis = BasePlot._fig.get_axes(
- coloraxis=dict(
- colorscale=PALETTE.get(BasePlot._fig.get_elem(model.name), "Blues"),
- cmin=min(dims[0]["values"]),
- cmax=max(dims[0]["values"]),
- title=metric_c,
- font_size=self.label_fontsize,
- )
+ coloraxis={
+ "colorscale": PALETTE.get(BasePlot._fig.get_elem(model.name), "Blues"),
+ "cmin": min(dims[0]["values"]),
+ "cmax": max(dims[0]["values"]),
+ "title": metric_c,
+ "font_size": self.label_fontsize,
+ }
)
fig.add_trace(
go.Parcoords(
dimensions=dims,
- line=dict(
- color=dims[0]["values"],
- coloraxis=f"coloraxis{xaxis[1:]}",
- ),
- unselected=dict(line=dict(color="gray", opacity=0.5)),
+ line={
+ "color": dims[0]["values"],
+ "coloraxis": f"coloraxis{xaxis[1:]}",
+ },
+ unselected={"line": {"color": "gray", "opacity": 0.5}},
labelside="bottom",
- labelfont=dict(size=self.label_fontsize),
+ labelfont={"size": self.label_fontsize},
)
)
@@ -912,12 +907,12 @@ def plot_pareto_front(
x=model.trials[metric_c[y]],
y=model.trials[metric_c[x + 1]],
mode="markers",
- marker=dict(
- size=self.marker_size,
- color=model.trials.index,
- colorscale="Teal",
- line=dict(width=1, color="rgba(255, 255, 255, 0.9)"),
- ),
+ marker={
+ "size": self.marker_size,
+ "color": model.trials.index,
+ "colorscale": "Teal",
+ "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"},
+ },
customdata=model.trials.index,
hovertemplate="(%{x}, %{y})Trial %{customdata}",
xaxis=xaxis,
@@ -1065,12 +1060,12 @@ def plot_slice(
x=model.trials[params_c[y]],
y=model.trials[metric_c[x]],
mode="markers",
- marker=dict(
- size=self.marker_size,
- color=model.trials.index,
- colorscale="Teal",
- line=dict(width=1, color="rgba(255, 255, 255, 0.9)"),
- ),
+ marker={
+ "size": self.marker_size,
+ "color": model.trials.index,
+ "colorscale": "Teal",
+ "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"},
+ },
customdata=model.trials.index,
hovertemplate="(%{x}, %{y})Trial %{customdata}",
xaxis=xaxis,
@@ -1211,7 +1206,7 @@ def plot_terminator_improvement(
self._draw_line(
x=m.trials.index,
y=info.improvements,
- error_y=dict(type="data", array=info.errors),
+ error_y={"type": "data", "array": info.errors},
mode="markers+lines",
parent=m.name,
legend=legend,
@@ -1333,7 +1328,7 @@ def plot_timeline(
date_start = trial.datetime_start or date_complete
# Create nice representation of scores and params for hover
- s = [f'{m}: {trial.values[i]}' for i, m in enumerate(self._metric.keys())]
+ s = [f"{m}: {trial.values[i]}" for i, m in enumerate(self._metric.keys())]
p = [f" --> {k}: {v}" for k, v in trial.params.items()]
info.append(
@@ -1346,7 +1341,7 @@ def plot_timeline(
f"Trial: {trial.number}
"
f"{'
'.join(s)}"
f"Parameters:
{'
'.join(p)}"
- )
+ ),
)
)
@@ -1362,10 +1357,10 @@ def plot_timeline(
textposition="none",
hovertemplate=f"%{{text}}{m.name}",
orientation="h",
- marker=dict(
- color=f"rgba({_cm[state.name][4:-1]}, 0.2)",
- line=dict(width=2, color=_cm[state.name]),
- ),
+ marker={
+ "color": f"rgba({_cm[state.name][4:-1]}, 0.2)",
+ "line": {"width": 2, "color": _cm[state.name]},
+ },
showlegend=BasePlot._fig.showlegend(_cm[state.name], legend),
xaxis=xaxis,
yaxis=yaxis,
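
`zip(..., strict=True)` (Python 3.10+) makes the trial-index/metric pairing above fail loudly if the iterables ever differ in length instead of silently truncating, which is what ruff's B905 rule enforces; where truncation is acceptable, this diff suppresses the rule with `# noqa: B905` instead. A toy demonstration:

    index = [0, 1, 2]
    scores = [0.71, 0.74]  # one element short

    print(list(zip(index, scores)))  # [(0, 0.71), (1, 0.74)] - silent truncation
    try:
        list(zip(index, scores, strict=True))
    except ValueError as e:
        print(e)  # zip() argument 2 is shorter than argument 1
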
diff --git a/atom/plots/predictionplot.py b/atom/plots/predictionplot.py
index 5a7c3a817..fe4f7d038 100644
--- a/atom/plots/predictionplot.py
+++ b/atom/plots/predictionplot.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -22,6 +20,7 @@
import plotly.graph_objects as go
from beartype import beartype
from joblib import Parallel, delayed
+from numpy.random import default_rng
from plotly.colors import unconvert_from_RGB_255, unlabel_rgb
from scipy import stats
from scipy.stats.mstats import mquantiles
@@ -198,11 +197,11 @@ def plot_calibration(
fig.add_trace(
go.Histogram(
x=y_pred,
- xbins=dict(start=0, end=1, size=1. / n_bins),
- marker=dict(
- color=f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
- line=dict(width=2, color=BasePlot._fig.get_elem(m.name)),
- ),
+ xbins={"start": 0, "end": 1, "size": 1.0 / n_bins},
+ marker={
+ "color": f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
+ "line": {"width": 2, "color": BasePlot._fig.get_elem(m.name)},
+ },
name=m.name,
legendgroup=m.name,
showlegend=False,
@@ -355,13 +354,13 @@ def plot_confusion_matrix(
if len(models_c) == 1:
xaxis, yaxis = BasePlot._fig.get_axes(
x=(0, 0.87),
- coloraxis=dict(
- colorscale="Blues",
- cmin=0,
- cmax=100,
- title="Percentage of samples",
- font_size=self.label_fontsize,
- ),
+ coloraxis={
+ "colorscale": "Blues",
+ "cmin": 0,
+ "cmax": 100,
+ "title": "Percentage of samples",
+ "font_size": self.label_fontsize,
+ },
)
else:
xaxis, yaxis = BasePlot._fig.get_axes()
@@ -373,17 +372,6 @@ def plot_confusion_matrix(
cm = confusion_matrix(y_true, y_pred)
if len(models_c) == 1: # Create matrix heatmap
- xaxis, yaxis = BasePlot._fig.get_axes(
- x=(0, 0.87),
- coloraxis=dict(
- colorscale="Blues",
- cmin=0,
- cmax=100,
- title="Percentage of samples",
- font_size=self.label_fontsize,
- ),
- )
-
# Get mapping from branch or use unique values
ticks = m.branch.mapping.get(
target_c, np.unique(m.branch.dataset[target_c]).astype(str)
@@ -393,14 +381,16 @@ def plot_confusion_matrix(
go.Heatmap(
x=ticks,
y=ticks,
- z=100. * cm / cm.sum(axis=1)[:, np.newaxis],
+ z=100.0 * cm / cm.sum(axis=1)[:, np.newaxis],
coloraxis=f"coloraxis{xaxis[1:]}",
text=cm,
customdata=labels,
texttemplate="%{text}
(%{z:.2f}%)",
- textfont=dict(size=self.label_fontsize),
+ textfont={"size": self.label_fontsize},
hovertemplate=(
- "%{customdata}" if self.task.is_binary else ""
+ "%{customdata}"
+ if self.task.is_binary
+ else ""
"Predicted label:%{x}
True label:%{y}
Percentage:%{z}"
""
),
@@ -425,10 +415,10 @@ def plot_confusion_matrix(
x=cm.ravel(),
y=labels.ravel(),
orientation="h",
- marker=dict(
- color=f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
- line=dict(width=2, color=BasePlot._fig.get_elem(m.name)),
- ),
+ marker={
+ "color": f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
+ "line": {"width": 2, "color": BasePlot._fig.get_elem(m.name)},
+ },
hovertemplate="%{x}",
name=m.name,
legendgroup=m.name,
@@ -687,6 +677,7 @@ def plot_errors(
# Fit the points using linear regression
from atom.models import OrdinaryLeastSquares
+
model = OrdinaryLeastSquares(goal=self._goal)
estimator = model._get_est({}).fit(bk.DataFrame(y_true), y_pred)
@@ -928,17 +919,17 @@ def plot_feature_importance(
"Invalid value for the models parameter. Estimator "
f"{m._est_class.__name__} has no scores_, feature_importances_ "
"nor coef_ attribute."
- )
+ ) from None
fig.add_trace(
go.Bar(
x=fi,
y=fi.index,
orientation="h",
- marker=dict(
- color=f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
- line=dict(width=2, color=BasePlot._fig.get_elem(m.name)),
- ),
+ marker={
+ "color": f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
+ "line": {"width": 2, "color": BasePlot._fig.get_elem(m.name)},
+ },
hovertemplate="%{x}",
name=m.name,
legendgroup=m.name,
@@ -950,13 +941,13 @@ def plot_feature_importance(
fig.update_layout(
{
- f"yaxis{yaxis[1:]}": dict(categoryorder="total ascending"),
+ f"yaxis{yaxis[1:]}": {"categoryorder": "total ascending"},
"bargroupgap": 0.05,
}
)
# Unique number of features over all branches
- n_fxs = len(set([fx for m in models_c for fx in m.branch.features]))
+ n_fxs = len({fx for m in models_c for fx in m.branch.features})
BasePlot._fig.used_models.extend(models_c)
return self._plot(
@@ -979,8 +970,8 @@ def plot_forecast(
fh: RowSelector | ForecastingHorizon = "test",
X: XSelector | None = None,
target: TargetSelector = 0,
- plot_interval: Bool = True,
*,
+ plot_interval: Bool = True,
title: str | dict[str, Any] | None = None,
legend: Legend | dict[str, Any] | None = "upper left",
figsize: tuple[IntLargerZero, IntLargerZero] = (900, 600),
@@ -1086,11 +1077,11 @@ def plot_forecast(
x=self._get_plot_index(getattr(self, ds)),
y=getattr(self, ds)[target_c],
mode="lines+markers",
- line=dict(
- width=2,
- color="black",
- dash=BasePlot._fig.get_elem(ds, "dash"),
- ),
+ line={
+ "width": 2,
+ "color": "black",
+ "dash": BasePlot._fig.get_elem(ds, "dash"),
+ },
opacity=0.6,
name=ds,
showlegend=False if models else BasePlot._fig.showlegend(ds, legend),
@@ -1102,11 +1093,8 @@ def plot_forecast(
# Draw predictions
for m in models_c:
# TODO: Fix the way we get fh
- # if isinstance(fh, str):
- # # Get fh and corresponding X from data set
- # datasets = self._get_set(fh, max_one=False)
- # fh = bk.concat([getattr(m, ds) for ds in datasets]).index
- # X = m.X.loc[fh]
+ if isinstance(fh, str):
+ pass
y_pred = m.predict(fh, X)
if self.task.is_multioutput:
@@ -1142,7 +1130,7 @@ def plot_forecast(
x=self._get_plot_index(y_pred),
y=y.iloc[:, 1],
mode="lines",
- line=dict(width=1, color=BasePlot._fig.get_elem(m.name)),
+ line={"width": 1, "color": BasePlot._fig.get_elem(m.name)},
hovertemplate=f"%{{y}}{m.name} - upper bound",
legendgroup=m.name,
showlegend=False,
@@ -1153,7 +1141,7 @@ def plot_forecast(
x=self._get_plot_index(y_pred),
y=y.iloc[:, 0],
mode="lines",
- line=dict(width=1, color=BasePlot._fig.get_elem(m.name)),
+ line={"width": 1, "color": BasePlot._fig.get_elem(m.name)},
fill="tonexty",
fillcolor=f"rgba{BasePlot._fig.get_elem(m.name)[3:-1]}, 0.2)",
hovertemplate=f"%{{y}}{m.name} - lower bound",
@@ -1161,7 +1149,7 @@ def plot_forecast(
showlegend=False,
xaxis=xaxis,
yaxis=yaxis,
- )
+ ),
]
)
@@ -1403,7 +1391,7 @@ def plot_learning_curve(
y=y[group],
mode="lines+markers",
marker_symbol="circle",
- error_y=dict(type="data", array=std[group], visible=True),
+ error_y={"type": "data", "array": std[group], "visible": True},
parent=group,
child=self._metric[met].name,
legend=legend,
@@ -1421,7 +1409,7 @@ def plot_learning_curve(
x=x[group],
y=np.add(y[group], std[group]),
mode="lines",
- line=dict(width=1, color=BasePlot._fig.get_elem(group)),
+ line={"width": 1, "color": BasePlot._fig.get_elem(group)},
hovertemplate="%{y}upper bound",
legendgroup=group,
showlegend=False,
@@ -1432,7 +1420,7 @@ def plot_learning_curve(
x=x[group],
y=np.subtract(y[group], std[group]),
mode="lines",
- line=dict(width=1, color=BasePlot._fig.get_elem(group)),
+ line={"width": 1, "color": BasePlot._fig.get_elem(group)},
fill="tonexty",
fillcolor=fillcolor,
hovertemplate="%{y}lower bound",
@@ -1685,11 +1673,11 @@ class is always the positive one.
else:
xaxis, yaxis = BasePlot._fig.get_axes(
x=(0, 0.87),
- coloraxis=dict(
- colorscale="Reds",
- title="Normalized feature importance",
- font_size=self.label_fontsize,
- )
+ coloraxis={
+ "colorscale": "Reds",
+ "title": "Normalized feature importance",
+ "font_size": self.label_fontsize,
+ },
)
for m in models_c:
@@ -1722,7 +1710,7 @@ class is always the positive one.
# Semi-partial correlation matrix
with np.errstate(divide="ignore"):
V_sqrt = np.sqrt(np.diag(V))[..., None]
- Vi_sqrt = np.sqrt(np.abs(diag - Vi ** 2 / diag[..., None])).T
+ Vi_sqrt = np.sqrt(np.abs(diag - Vi**2 / diag[..., None])).T
semi_partial_correlation = partial_corr / V_sqrt / Vi_sqrt
# X covariates are removed
@@ -1740,12 +1728,12 @@ class is always the positive one.
x=parshap["train"],
y=parshap["test"],
mode="markers+text",
- marker=dict(
- color=color,
- size=self.marker_size,
- coloraxis=f"coloraxis{xaxis[1:]}",
- line=dict(width=1, color="rgba(255, 255, 255, 0.9)"),
- ),
+ marker={
+ "color": color,
+ "size": self.marker_size,
+ "coloraxis": f"coloraxis{xaxis[1:]}",
+ "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"},
+ },
text=m.branch.features,
textposition="top center",
customdata=(data := None if isinstance(color, str) else list(color)),
@@ -1937,7 +1925,7 @@ def plot_partial_dependence(
# Create new axes
if not axes:
- for i, col in enumerate(cols):
+ for i in range(len(cols)):
# Calculate the distance between subplots
offset = divide(0.025, len(cols) - 1)
@@ -1957,7 +1945,8 @@ def plot_partial_dependence(
X=m.branch.X_test,
features=col,
kind="both" if "individual" in kind else "average",
- ) for col in cols
+ )
+ for col in cols
)
# Compute deciles for ticks (only if line plots)
@@ -1968,7 +1957,7 @@ def plot_partial_dependence(
X_col = _safe_indexing(m.branch.X_test, fx, axis=1)
deciles[fx] = mquantiles(X_col, prob=np.arange(0.1, 1.0, 0.1))
- for i, (ax, fxs, pred) in enumerate(zip(axes, cols, predictions)):
+ for i, (ax, fxs, pred) in enumerate(zip(axes, cols, predictions)): # noqa: B905
# Draw line or contour plot
if len(pred["values"]) == 1:
# For both average and individual: draw ticks on the horizontal axis
@@ -1981,7 +1970,7 @@ def plot_partial_dependence(
y0=0,
y1=0.05,
yref=f"{axes[0][1]} domain",
- line=dict(width=1, color=BasePlot._fig.get_elem(m.name)),
+ line={"width": 1, "color": BasePlot._fig.get_elem(m.name)},
opacity=0.6,
layer="below",
)
@@ -1993,7 +1982,7 @@ def plot_partial_dependence(
x=pred["values"][0],
y=pred["average"][target_c].ravel(),
mode="lines",
- line=dict(width=2, color=color),
+ line={"width": 2, "color": color},
name=m.name,
legendgroup=m.name,
showlegend=BasePlot._fig.showlegend(m.name, legend),
@@ -2005,7 +1994,7 @@ def plot_partial_dependence(
# Draw all individual (per sample) lines (ICE)
if "individual" in kind:
# Select up to 50 random samples to plot
- idx = np.random.choice(
+ idx = default_rng().choice(
list(range(len(pred["individual"][target_c]))),
size=min(len(pred["individual"][target_c]), 50),
replace=False,
@@ -2016,7 +2005,7 @@ def plot_partial_dependence(
x=pred["values"][0],
y=sample,
mode="lines",
- line=dict(width=0.5, color=color),
+ line={"width": 0.5, "color": color},
name=m.name,
legendgroup=m.name,
showlegend=BasePlot._fig.showlegend(m.name, legend),
@@ -2032,10 +2021,13 @@ def plot_partial_dependence(
x=pred["values"][0],
y=pred["values"][1],
z=pred["average"][target_c],
- contours=dict(
- showlabels=True,
- labelfont=dict(size=self.tick_fontsize, color="white")
- ),
+ contours={
+ "showlabels": True,
+ "labelfont": {
+ "size": self.tick_fontsize,
+ "color": "white",
+ },
+ },
hovertemplate="x:%{x}
y:%{y}
z:%{z}",
hoverongaps=False,
colorscale=colorscale,
@@ -2182,13 +2174,13 @@ def plot_permutation_importance(
fig.update_layout(
{
- f"yaxis{yaxis[1:]}": dict(categoryorder="total ascending"),
+ f"yaxis{yaxis[1:]}": {"categoryorder": "total ascending"},
"boxmode": "group",
}
)
# Unique number of features over all branches
- n_fxs = len(set([fx for m in models_c for fx in m.branch.features]))
+ n_fxs = len({fx for m in models_c for fx in m.branch.features})
BasePlot._fig.used_models.extend(models_c)
return self._plot(
@@ -2207,9 +2199,9 @@ def plot_permutation_importance(
def plot_pipeline(
self,
models: ModelsSelector = None,
+ *,
draw_hyperparameter_tuning: bool = True,
color_branches: bool | None = None,
- *,
title: str | dict[str, Any] | None = None,
legend: Legend | dict[str, Any] | None = None,
figsize: tuple[IntLargerZero, IntLargerZero] | None = None,
@@ -2437,10 +2429,7 @@ def add_wire(x, y):
if model.scaler:
add_wire(x_pos[-3], check_y((d.here[0], d.here[1] - offset)))
d.add(
- RoundBox(w=7)
- .label("Scaler", color="k")
- .color(branch["color"])
- .drop("E")
+ RoundBox(w=7).label("Scaler", color="k").color(branch["color"]).drop("E")
)
offset = 0
@@ -2761,17 +2750,20 @@ def plot_probabilities(
x=(x := np.linspace(0, 1, 100)),
y=stats.gaussian_kde(hist)(x),
mode="lines",
- line=dict(
- width=2,
- color=BasePlot._fig.get_elem(m.name),
- dash=BasePlot._fig.get_elem(str(v), "dash"),
- ),
+ line={
+ "width": 2,
+ "color": BasePlot._fig.get_elem(m.name),
+ "dash": BasePlot._fig.get_elem(str(v), "dash"),
+ },
fill="tonexty",
fillcolor=f"rgba{BasePlot._fig.get_elem(m.name)[3:-1]}, 0.2)",
- fillpattern=dict(shape=BasePlot._fig.get_elem(str(v), "shape")),
+ fillpattern={"shape": BasePlot._fig.get_elem(str(v), "shape")},
name=f"{col}={v}",
legendgroup=m.name,
- legendgrouptitle=dict(text=m.name, font_size=self.label_fontsize),
+ legendgrouptitle={
+ "text": m.name,
+ "font_size": self.label_fontsize,
+ },
showlegend=BasePlot._fig.showlegend(f"{m.name}-{v}", legend),
xaxis=xaxis,
yaxis=yaxis,
@@ -2911,10 +2903,10 @@ def plot_residuals(
go.Histogram(
y=res,
bingroup="residuals",
- marker=dict(
- color=f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
- line=dict(width=2, color=BasePlot._fig.get_elem(m.name)),
- ),
+ marker={
+ "color": f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
+ "line": {"width": 2, "color": BasePlot._fig.get_elem(m.name)},
+ },
name=m.name,
legendgroup=m.name,
showlegend=False,
@@ -3072,10 +3064,10 @@ def get_std(model: Model, metric: str) -> Scalar:
x=[m.results[met] for m in models_c],
y=[m.name for m in models_c],
orientation="h",
- marker=dict(
- color=f"rgba({color[4:-1]}, 0.2)",
- line=dict(width=2, color=color),
- ),
+ marker={
+ "color": f"rgba({color[4:-1]}, 0.2)",
+ "line": {"width": 2, "color": color},
+ },
hovertemplate=f"%{{x}}<extra>{met}</extra>",
name=met,
legendgroup=met,
@@ -3109,15 +3101,15 @@ def get_std(model: Model, metric: str) -> Scalar:
go.Bar(
x=[m._best_score(met) for m in models_c],
y=[m.name for m in models_c],
- error_x=dict(
- type="data",
- array=[get_std(m, met) for m in models_c],
- ),
+ error_x={
+ "type": "data",
+ "array": [get_std(m, met) for m in models_c],
+ },
orientation="h",
- marker=dict(
- color=f"rgba({color[4:-1]}, 0.2)",
- line=dict(width=2, color=color),
- ),
+ marker={
+ "color": f"rgba({color[4:-1]}, 0.2)",
+ "line": {"width": 2, "color": color},
+ },
hovertemplate="%{x}",
name=met,
legendgroup=met,
@@ -3129,7 +3121,7 @@ def get_std(model: Model, metric: str) -> Scalar:
fig.update_layout(
{
- f"yaxis{yaxis[1:]}": dict(categoryorder="total ascending"),
+ f"yaxis{yaxis[1:]}": {"categoryorder": "total ascending"},
"bargroupgap": 0.05,
"boxmode": "group",
}
@@ -3372,7 +3364,7 @@ def plot_successive_halving(
y=y[group],
mode="lines+markers",
marker_symbol="circle",
- error_y=dict(type="data", array=std[group], visible=True),
+ error_y={"type": "data", "array": std[group], "visible": True},
parent=group,
child=self._metric[met].name,
legend=legend,
@@ -3390,7 +3382,7 @@ def plot_successive_halving(
x=x[group],
y=np.add(y[group], std[group]),
mode="lines",
- line=dict(width=1, color=BasePlot._fig.get_elem(group)),
+ line={"width": 1, "color": BasePlot._fig.get_elem(group)},
hovertemplate="%{y}<extra>upper bound</extra>",
legendgroup=group,
showlegend=False,
@@ -3401,7 +3393,7 @@ def plot_successive_halving(
x=x[group],
y=np.subtract(y[group], std[group]),
mode="lines",
- line=dict(width=1, color=BasePlot._fig.get_elem(group)),
+ line={"width": 1, "color": BasePlot._fig.get_elem(group)},
fill="tonexty",
fillcolor=fillcolor,
hovertemplate="%{y}<extra>lower bound</extra>",
@@ -3413,7 +3405,7 @@ def plot_successive_halving(
]
)
- fig.update_layout({f"xaxis{yaxis[1:]}": dict(dtick=1, autorange="reversed")})
+ fig.update_layout({f"xaxis{yaxis[1:]}": {"dtick": 1, "autorange": "reversed"}})
BasePlot._fig.used_models.extend(models_c)
return self._plot(
diff --git a/atom/plots/shapplot.py b/atom/plots/shapplot.py
index 0c7756e8d..81ab3552a 100644
--- a/atom/plots/shapplot.py
+++ b/atom/plots/shapplot.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -771,7 +769,7 @@ def plot_shap_waterfall(
"""Plot SHAP's waterfall plot.
The SHAP value of a feature represents the impact of the
- evidence provided by that feature on the model’s output. The
+ evidence provided by that feature on the model's output. The
waterfall plot is designed to visually display how the SHAP
values (evidence) of each feature move the model output from
our prior expectation under the background data distribution,
diff --git a/atom/training.py b/atom/training.py
index 2132d0d74..ac57c8f7a 100644
--- a/atom/training.py
+++ b/atom/training.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -43,14 +41,45 @@ class Direct(BaseEstimator, BaseTrainer):
"""
def __init__(
- self, models, metric, est_params, n_trials, ht_params, n_bootstrap,
- parallel, errors, n_jobs, device, engine, backend, memory, verbose,
- warnings, logger, experiment, random_state,
+ self,
+ models,
+ metric,
+ est_params,
+ n_trials,
+ ht_params,
+ n_bootstrap,
+ parallel,
+ errors,
+ n_jobs,
+ device,
+ engine,
+ backend,
+ memory,
+ verbose,
+ warnings,
+ logger,
+ experiment,
+ random_state,
):
super().__init__(
- models, metric, est_params, n_trials, ht_params, n_bootstrap,
- parallel, errors, n_jobs, device, engine, backend, memory,
- verbose, warnings, logger, experiment, random_state,
+ models,
+ metric,
+ est_params,
+ n_trials,
+ ht_params,
+ n_bootstrap,
+ parallel,
+ errors,
+ n_jobs,
+ device,
+ engine,
+ backend,
+ memory,
+ verbose,
+ warnings,
+ logger,
+ experiment,
+ random_state,
)
@composed(crash, method_to_log)
@@ -88,15 +117,47 @@ class SuccessiveHalving(BaseEstimator, BaseTrainer):
"""
def __init__(
- self, models, metric, skip_runs, est_params, n_trials, ht_params,
- n_bootstrap, parallel, errors, n_jobs, device, engine, backend,
- memory, verbose, warnings, logger, experiment, random_state,
+ self,
+ models,
+ metric,
+ skip_runs,
+ est_params,
+ n_trials,
+ ht_params,
+ n_bootstrap,
+ parallel,
+ errors,
+ n_jobs,
+ device,
+ engine,
+ backend,
+ memory,
+ verbose,
+ warnings,
+ logger,
+ experiment,
+ random_state,
):
self.skip_runs = skip_runs
super().__init__(
- models, metric, est_params, n_trials, ht_params, n_bootstrap,
- parallel, errors, n_jobs, device, engine, backend, memory,
- verbose, warnings, logger, experiment, random_state,
+ models,
+ metric,
+ est_params,
+ n_trials,
+ ht_params,
+ n_bootstrap,
+ parallel,
+ errors,
+ n_jobs,
+ device,
+ engine,
+ backend,
+ memory,
+ verbose,
+ warnings,
+ logger,
+ experiment,
+ random_state,
)
@composed(crash, method_to_log)
@@ -131,7 +192,7 @@ def run(self, *arrays):
run = 0
models = ClassMap()
og_models = ClassMap(copy(m) for m in self._models)
- while len(self._models) > 2 ** self.skip_runs - 1:
+ while len(self._models) > 2**self.skip_runs - 1:
# Create the new set of models for the run
for m in self._models:
m._name += str(len(self._models))
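A worked illustration of the loop guard, assuming each run keeps the best half of the models: with 8 models and `skip_runs=1`, runs execute with 8, 4, and 2 models and then stop, because `1 > 2**1 - 1` is false; the final single-model run is skipped.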
@@ -170,15 +231,47 @@ class TrainSizing(BaseEstimator, BaseTrainer):
"""
def __init__(
- self, models, metric, train_sizes, est_params, n_trials, ht_params,
- n_bootstrap, parallel, errors, n_jobs, device, engine, backend,
- memory, verbose, warnings, logger, experiment, random_state
+ self,
+ models,
+ metric,
+ train_sizes,
+ est_params,
+ n_trials,
+ ht_params,
+ n_bootstrap,
+ parallel,
+ errors,
+ n_jobs,
+ device,
+ engine,
+ backend,
+ memory,
+ verbose,
+ warnings,
+ logger,
+ experiment,
+ random_state,
):
self.train_sizes = train_sizes
super().__init__(
- models, metric, est_params, n_trials, ht_params, n_bootstrap,
- parallel, errors, n_jobs, device, engine, backend, memory,
- verbose, warnings, logger, experiment, random_state,
+ models,
+ metric,
+ est_params,
+ n_trials,
+ ht_params,
+ n_bootstrap,
+ parallel,
+ errors,
+ n_jobs,
+ device,
+ engine,
+ backend,
+ memory,
+ verbose,
+ warnings,
+ logger,
+ experiment,
+ random_state,
)
@composed(crash, method_to_log)
@@ -344,21 +437,22 @@ class DirectClassifier(Direct):
`#!python device="gpu"` to use the GPU. Read more in the
[user guide][gpu-acceleration].
- engine: dict, default={"data": "numpy", "estimator": "sklearn"}
+ engine: dict or None, default=None
Execution engine to use for [data][data-acceleration] and
[estimators][estimator-acceleration]. The value should be a
dictionary with keys `data` and/or `estimator`, with their
- corresponding choice as values. Choose from:
+ corresponding choice as values. If None, the default values
+ are used. Choose from:
- "data":
- - "numpy"
+ - "numpy" (default)
- "pyarrow"
- "modin"
- "estimator":
- - "sklearn"
+ - "sklearn" (default)
- "sklearnex"
- "cuml"
@@ -457,7 +551,7 @@ def __init__(
errors: Literal["raise", "skip", "keep"] = "skip",
n_jobs: NJobs = 1,
device: str = "cpu",
- engine: Engine = {"data": "numpy", "estimator": "sklearn"},
+ engine: Engine | None = None,
backend: Backend = "loky",
memory: Bool | str | Path | Memory = False,
verbose: Verbose = 0,
@@ -467,9 +561,24 @@ def __init__(
random_state: IntLargerEqualZero | None = None,
):
super().__init__(
- models, metric, est_params, n_trials, ht_params, n_bootstrap,
- parallel, errors, n_jobs, device, engine, backend, memory,
- verbose, warnings, logger, experiment, random_state,
+ models,
+ metric,
+ est_params,
+ n_trials,
+ ht_params,
+ n_bootstrap,
+ parallel,
+ errors,
+ n_jobs,
+ device,
+ engine,
+ backend,
+ memory,
+ verbose,
+ warnings,
+ logger,
+ experiment,
+ random_state,
)
@@ -576,21 +685,22 @@ class DirectForecaster(Direct):
`#!python device="gpu"` to use the GPU. Read more in the
[user guide][gpu-acceleration].
- engine: dict, default={"data": "numpy", "estimator": "sklearn"}
+ engine: dict or None, default=None
Execution engine to use for [data][data-acceleration] and
[estimators][estimator-acceleration]. The value should be a
dictionary with keys `data` and/or `estimator`, with their
- corresponding choice as values. Choose from:
+ corresponding choice as values. If None, the default values
+ are used. Choose from:
- "data":
- - "numpy"
+ - "numpy" (default)
- "pyarrow"
- "modin"
- "estimator":
- - "sklearn"
+ - "sklearn" (default)
- "sklearnex"
- "cuml"
@@ -686,7 +796,7 @@ def __init__(
errors: Literal["raise", "skip", "keep"] = "skip",
n_jobs: NJobs = 1,
device: str = "cpu",
- engine: Engine = {"data": "numpy", "estimator": "sklearn"},
+ engine: Engine | None = None,
backend: Backend = "loky",
memory: Bool | str | Path | Memory = False,
verbose: Verbose = 0,
@@ -696,9 +806,24 @@ def __init__(
random_state: IntLargerEqualZero | None = None,
):
super().__init__(
- models, metric, est_params, n_trials, ht_params, n_bootstrap,
- parallel, errors, n_jobs, device, engine, backend, memory,
- verbose, warnings, logger, experiment, random_state,
+ models,
+ metric,
+ est_params,
+ n_trials,
+ ht_params,
+ n_bootstrap,
+ parallel,
+ errors,
+ n_jobs,
+ device,
+ engine,
+ backend,
+ memory,
+ verbose,
+ warnings,
+ logger,
+ experiment,
+ random_state,
)
@@ -804,21 +929,22 @@ class DirectRegressor(Direct):
`#!python device="gpu"` to use the GPU. Read more in the
[user guide][gpu-acceleration].
- engine: dict, default={"data": "numpy", "estimator": "sklearn"}
+ engine: dict or None, default=None
Execution engine to use for [data][data-acceleration] and
[estimators][estimator-acceleration]. The value should be a
dictionary with keys `data` and/or `estimator`, with their
- corresponding choice as values. Choose from:
+ corresponding choice as values. If None, the default values
+ are used. Choose from:
- "data":
- - "numpy"
+ - "numpy" (default)
- "pyarrow"
- "modin"
- "estimator":
- - "sklearn"
+ - "sklearn" (default)
- "sklearnex"
- "cuml"
@@ -917,7 +1043,7 @@ def __init__(
errors: Literal["raise", "skip", "keep"] = "skip",
n_jobs: NJobs = 1,
device: str = "cpu",
- engine: Engine = {"data": "numpy", "estimator": "sklearn"},
+ engine: Engine | None = None,
backend: Backend = "loky",
memory: Bool | str | Path | Memory = False,
verbose: Verbose = 0,
@@ -927,9 +1053,24 @@ def __init__(
random_state: IntLargerEqualZero | None = None,
):
super().__init__(
- models, metric, est_params, n_trials, ht_params, n_bootstrap,
- parallel, errors, n_jobs, device, engine, backend, memory,
- verbose, warnings, logger, experiment, random_state,
+ models,
+ metric,
+ est_params,
+ n_trials,
+ ht_params,
+ n_bootstrap,
+ parallel,
+ errors,
+ n_jobs,
+ device,
+ engine,
+ backend,
+ memory,
+ verbose,
+ warnings,
+ logger,
+ experiment,
+ random_state,
)
@@ -1042,21 +1183,22 @@ class SuccessiveHalvingClassifier(SuccessiveHalving):
`#!python device="gpu"` to use the GPU. Read more in the
[user guide][gpu-acceleration].
- engine: dict, default={"data": "numpy", "estimator": "sklearn"}
+ engine: dict or None, default=None
Execution engine to use for [data][data-acceleration] and
[estimators][estimator-acceleration]. The value should be a
dictionary with keys `data` and/or `estimator`, with their
- corresponding choice as values. Choose from:
+ corresponding choice as values. If None, the default values
+ are used. Choose from:
- "data":
- - "numpy"
+ - "numpy" (default)
- "pyarrow"
- "modin"
- "estimator":
- - "sklearn"
+ - "sklearn" (default)
- "sklearnex"
- "cuml"
@@ -1156,7 +1298,7 @@ def __init__(
errors: Literal["raise", "skip", "keep"] = "skip",
n_jobs: NJobs = 1,
device: str = "cpu",
- engine: Engine = {"data": "numpy", "estimator": "sklearn"},
+ engine: Engine | None = None,
backend: Backend = "loky",
memory: Bool | str | Path | Memory = False,
verbose: Verbose = 0,
@@ -1166,9 +1308,25 @@ def __init__(
random_state: IntLargerEqualZero | None = None,
):
super().__init__(
- models, metric, skip_runs, est_params, n_trials, ht_params,
- n_bootstrap, parallel, errors, n_jobs, device, engine, backend,
- memory, verbose, warnings, logger, experiment, random_state,
+ models,
+ metric,
+ skip_runs,
+ est_params,
+ n_trials,
+ ht_params,
+ n_bootstrap,
+ parallel,
+ errors,
+ n_jobs,
+ device,
+ engine,
+ backend,
+ memory,
+ verbose,
+ warnings,
+ logger,
+ experiment,
+ random_state,
)
@@ -1277,21 +1435,22 @@ class SuccessiveHalvingForecaster(SuccessiveHalving):
`#!python device="gpu"` to use the GPU. Read more in the
[user guide][gpu-acceleration].
- engine: dict, default={"data": "numpy", "estimator": "sklearn"}
+ engine: dict or None, default=None
Execution engine to use for [data][data-acceleration] and
[estimators][estimator-acceleration]. The value should be a
dictionary with keys `data` and/or `estimator`, with their
- corresponding choice as values. Choose from:
+ corresponding choice as values. If None, the default values
+ are used. Choose from:
- "data":
- - "numpy"
+ - "numpy" (default)
- "pyarrow"
- "modin"
- "estimator":
- - "sklearn"
+ - "sklearn" (default)
- "sklearnex"
- "cuml"
@@ -1388,7 +1547,7 @@ def __init__(
errors: Literal["raise", "skip", "keep"] = "skip",
n_jobs: NJobs = 1,
device: str = "cpu",
- engine: Engine = {"data": "numpy", "estimator": "sklearn"},
+ engine: Engine | None = None,
backend: Backend = "loky",
memory: Bool | str | Path | Memory = False,
verbose: Verbose = 0,
@@ -1398,9 +1557,25 @@ def __init__(
random_state: IntLargerEqualZero | None = None,
):
super().__init__(
- models, metric, skip_runs, est_params, n_trials, ht_params,
- n_bootstrap, parallel, errors, n_jobs, device, engine, backend,
- memory, verbose, warnings, logger, experiment, random_state,
+ models,
+ metric,
+ skip_runs,
+ est_params,
+ n_trials,
+ ht_params,
+ n_bootstrap,
+ parallel,
+ errors,
+ n_jobs,
+ device,
+ engine,
+ backend,
+ memory,
+ verbose,
+ warnings,
+ logger,
+ experiment,
+ random_state,
)
@@ -1509,21 +1684,22 @@ class SuccessiveHalvingRegressor(SuccessiveHalving):
`#!python device="gpu"` to use the GPU. Read more in the
[user guide][gpu-acceleration].
- engine: dict, default={"data": "numpy", "estimator": "sklearn"}
+ engine: dict or None, default=None
Execution engine to use for [data][data-acceleration] and
[estimators][estimator-acceleration]. The value should be a
dictionary with keys `data` and/or `estimator`, with their
- corresponding choice as values. Choose from:
+ corresponding choice as values. If None, the default values
+ are used. Choose from:
- "data":
- - "numpy"
+ - "numpy" (default)
- "pyarrow"
- "modin"
- "estimator":
- - "sklearn"
+ - "sklearn" (default)
- "sklearnex"
- "cuml"
@@ -1623,7 +1799,7 @@ def __init__(
errors: Literal["raise", "skip", "keep"] = "skip",
n_jobs: NJobs = 1,
device: str = "cpu",
- engine: Engine = {"data": "numpy", "estimator": "sklearn"},
+ engine: Engine | None = None,
backend: Backend = "loky",
memory: Bool | str | Path | Memory = False,
verbose: Verbose = 0,
@@ -1633,9 +1809,25 @@ def __init__(
random_state: IntLargerEqualZero | None = None,
):
super().__init__(
- models, metric, skip_runs, est_params, n_trials, ht_params,
- n_bootstrap, parallel, errors, n_jobs, device, engine, backend,
- memory, verbose, warnings, logger, experiment, random_state,
+ models,
+ metric,
+ skip_runs,
+ est_params,
+ n_trials,
+ ht_params,
+ n_bootstrap,
+ parallel,
+ errors,
+ n_jobs,
+ device,
+ engine,
+ backend,
+ memory,
+ verbose,
+ warnings,
+ logger,
+ experiment,
+ random_state,
)
@@ -1753,21 +1945,22 @@ class TrainSizingClassifier(TrainSizing):
`#!python device="gpu"` to use the GPU. Read more in the
[user guide][gpu-acceleration].
- engine: dict, default={"data": "numpy", "estimator": "sklearn"}
+ engine: dict or None, default=None
Execution engine to use for [data][data-acceleration] and
[estimators][estimator-acceleration]. The value should be a
dictionary with keys `data` and/or `estimator`, with their
- corresponding choice as values. Choose from:
+ corresponding choice as values. If None, the default values
+ are used. Choose from:
- "data":
- - "numpy"
+ - "numpy" (default)
- "pyarrow"
- "modin"
- "estimator":
- - "sklearn"
+ - "sklearn" (default)
- "sklearnex"
- "cuml"
@@ -1867,7 +2060,7 @@ def __init__(
errors: Literal["raise", "skip", "keep"] = "skip",
n_jobs: NJobs = 1,
device: str = "cpu",
- engine: Engine = {"data": "numpy", "estimator": "sklearn"},
+ engine: Engine | None = None,
backend: Backend = "loky",
memory: Bool | str | Path | Memory = False,
verbose: Verbose = 0,
@@ -1877,9 +2070,25 @@ def __init__(
random_state: IntLargerEqualZero | None = None,
):
super().__init__(
- models, metric, train_sizes, est_params, n_trials, ht_params,
- n_bootstrap, parallel, errors, n_jobs, device, engine, backend,
- memory, verbose, warnings, logger, experiment, random_state,
+ models,
+ metric,
+ train_sizes,
+ est_params,
+ n_trials,
+ ht_params,
+ n_bootstrap,
+ parallel,
+ errors,
+ n_jobs,
+ device,
+ engine,
+ backend,
+ memory,
+ verbose,
+ warnings,
+ logger,
+ experiment,
+ random_state,
)
@@ -1994,21 +2203,22 @@ class TrainSizingForecaster(TrainSizing):
`#!python device="gpu"` to use the GPU. Read more in the
[user guide][gpu-acceleration].
- engine: dict, default={"data": "numpy", "estimator": "sklearn"}
+ engine: dict or None, default=None
Execution engine to use for [data][data-acceleration] and
[estimators][estimator-acceleration]. The value should be a
dictionary with keys `data` and/or `estimator`, with their
- corresponding choice as values. Choose from:
+ corresponding choice as values. If None, the default values
+ are used. Choose from:
- "data":
- - "numpy"
+ - "numpy" (default)
- "pyarrow"
- "modin"
- "estimator":
- - "sklearn"
+ - "sklearn" (default)
- "sklearnex"
- "cuml"
@@ -2105,7 +2315,7 @@ def __init__(
errors: Literal["raise", "skip", "keep"] = "skip",
n_jobs: NJobs = 1,
device: str = "cpu",
- engine: Engine = {"data": "numpy", "estimator": "sklearn"},
+ engine: Engine | None = None,
backend: Backend = "loky",
memory: Bool | str | Path | Memory = False,
verbose: Verbose = 0,
@@ -2115,9 +2325,25 @@ def __init__(
random_state: IntLargerEqualZero | None = None,
):
super().__init__(
- models, metric, train_sizes, est_params, n_trials, ht_params,
- n_bootstrap, parallel, errors, n_jobs, device, engine, backend,
- memory, verbose, warnings, logger, experiment, random_state,
+ models,
+ metric,
+ train_sizes,
+ est_params,
+ n_trials,
+ ht_params,
+ n_bootstrap,
+ parallel,
+ errors,
+ n_jobs,
+ device,
+ engine,
+ backend,
+ memory,
+ verbose,
+ warnings,
+ logger,
+ experiment,
+ random_state,
)
@@ -2231,21 +2457,22 @@ class TrainSizingRegressor(TrainSizing):
`#!python device="gpu"` to use the GPU. Read more in the
[user guide][gpu-acceleration].
- engine: dict, default={"data": "numpy", "estimator": "sklearn"}
+ engine: dict or None, default=None
Execution engine to use for [data][data-acceleration] and
[estimators][estimator-acceleration]. The value should be a
dictionary with keys `data` and/or `estimator`, with their
- corresponding choice as values. Choose from:
+ corresponding choice as values. If None, the default values
+ are used. Choose from:
- "data":
- - "numpy"
+ - "numpy" (default)
- "pyarrow"
- "modin"
- "estimator":
- - "sklearn"
+ - "sklearn" (default)
- "sklearnex"
- "cuml"
@@ -2345,7 +2572,7 @@ def __init__(
errors: Literal["raise", "skip", "keep"] = "skip",
n_jobs: NJobs = 1,
device: str = "cpu",
- engine: Engine = {"data": "numpy", "estimator": "sklearn"},
+ engine: Engine | None = None,
backend: Backend = "loky",
memory: Bool | str | Path | Memory = False,
verbose: Verbose = 0,
@@ -2355,7 +2582,23 @@ def __init__(
random_state: IntLargerEqualZero | None = None,
):
super().__init__(
- models, metric, train_sizes, est_params, n_trials, ht_params,
- n_bootstrap, parallel, errors, n_jobs, device, engine, backend,
- memory, verbose, warnings, logger, experiment, random_state,
+ models,
+ metric,
+ train_sizes,
+ est_params,
+ n_trials,
+ ht_params,
+ n_bootstrap,
+ parallel,
+ errors,
+ n_jobs,
+ device,
+ engine,
+ backend,
+ memory,
+ verbose,
+ warnings,
+ logger,
+ experiment,
+ random_state,
)
diff --git a/atom/utils/__init__.py b/atom/utils/__init__.py
index 35293f08a..978fb6325 100644
--- a/atom/utils/__init__.py
+++ b/atom/utils/__init__.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
diff --git a/atom/utils/constants.py b/atom/utils/constants.py
index 18af27e1f..ee5df4220 100644
--- a/atom/utils/constants.py
+++ b/atom/utils/constants.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
diff --git a/atom/utils/types.py b/atom/utils/types.py
index e811019bb..48c6ea8ec 100644
--- a/atom/utils/types.py
+++ b/atom/utils/types.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -54,12 +52,19 @@ class Sequence(Protocol[_T]):
"""
- def __len__(self) -> int: ...
- def __iter__(self) -> Iterator[_T]: ...
+ def __len__(self) -> int:
+ ...
+
+ def __iter__(self) -> Iterator[_T]:
+ ...
+
@overload
- def __getitem__(self, __i: SupportsIndex, /) -> _T: ...
+ def __getitem__(self, __i: SupportsIndex, /) -> _T:
+ ...
+
@overload
- def __getitem__(self, __s: slice, /) -> Sequence[_T]: ...
+ def __getitem__(self, __s: slice, /) -> Sequence[_T]:
+ ...
@classmethod
def __class_getitem__(cls, item: Any) -> Annotated[Any, Is]:
@@ -67,7 +72,7 @@ def __class_getitem__(cls, item: Any) -> Annotated[Any, Is]:
return Annotated[
cls,
Is[lambda lst: isinstance(lst, sequence_t)]
- & Is[lambda lst: all(is_bearable(i, item) for i in lst)]
+ & Is[lambda lst: all(is_bearable(i, item) for i in lst)],
]
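With this hook, `Sequence[int]` resolves to an `Annotated` alias whose beartype validators check both the container type and every element. A sketch, assuming `list` is registered in `sequence_t`:

    from beartype.door import is_bearable

    is_bearable([1, 2, 3], Sequence[int])  # True
    is_bearable([1, "a"], Sequence[int])   # False: an element fails the check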
@@ -100,8 +105,11 @@ class Style(TypedDict):
class SkScorer(Protocol):
"""Protocol for sklearn's scorers."""
- def __call__(self, *args, **kwargs): ...
- def _score(self, *args, **kwargs): ...
+ def __call__(self, *args, **kwargs):
+ ...
+
+ def _score(self, *args, **kwargs):
+ ...
@runtime_checkable
@@ -121,24 +129,33 @@ class Scorer(SkScorer, Protocol):
class Estimator(Protocol):
"""Protocol for sklearn-like estimators."""
- def __init__(self, *args, **kwargs): ...
- def get_params(self, *args, **kwargs): ...
- def set_params(self, *args, **kwargs): ...
+ def __init__(self, *args, **kwargs):
+ ...
+
+ def get_params(self, *args, **kwargs):
+ ...
+
+ def set_params(self, *args, **kwargs):
+ ...
@runtime_checkable
class Transformer(Estimator, Protocol):
"""Protocol for sklearn-like transformers."""
- def transform(self, *args, **kwargs): ...
+ def transform(self, *args, **kwargs):
+ ...
@runtime_checkable
class Predictor(Estimator, Protocol):
"""Protocol for sklearn-like predictors."""
- def fit(self, *args, **kwargs): ...
- def predict(self, *args, **kwargs): ...
+ def fit(self, *args, **kwargs):
+ ...
+
+ def predict(self, *args, **kwargs):
+ ...
@runtime_checkable
@@ -149,7 +166,8 @@ class Model(Protocol):
_metric: ClassMap
_ht: dict[str, Any]
- def predict(self, *args, **kwargs) -> Pandas: ...
+ def predict(self, *args, **kwargs) -> Pandas:
+ ...
# Variable types for type hinting ================================== >>
@@ -215,18 +233,9 @@ def predict(self, *args, **kwargs) -> Pandas: ...
ModelsSelector: TypeAlias = ModelSelector | Segment | Sequence[ModelSelector] | None
MetricFunction: TypeAlias = Callable[[Sequence[Scalar], Sequence[Scalar]], Scalar]
MetricConstructor: TypeAlias = (
- str
- | MetricFunction
- | Scorer
- | Sequence[str | MetricFunction | Scorer]
- | None
-)
-MetricSelector: TypeAlias = (
- IntLargerEqualZero
- | str
- | Sequence[IntLargerEqualZero | str]
- | None
+ str | MetricFunction | Scorer | Sequence[str | MetricFunction | Scorer] | None
)
+MetricSelector: TypeAlias = IntLargerEqualZero | str | Sequence[IntLargerEqualZero | str] | None
# Allowed values for BaseTransformer parameter
NJobs: TypeAlias = Annotated[Int, Is[lambda x: x != 0]]
@@ -236,16 +245,10 @@ def predict(self, *args, **kwargs) -> Pandas: ...
Verbose: TypeAlias = Literal[0, 1, 2]
# Data cleaning parameters
-NumericalStrats: TypeAlias = Literal[
- "drop", "mean", "median", "knn", "iterative", "most_frequent"
-]
+NumericalStrats: TypeAlias = Literal["drop", "mean", "median", "knn", "iterative", "most_frequent"]
CategoricalStrats: TypeAlias = Literal["drop", "most_frequent"]
DiscretizerStrats: TypeAlias = Literal["uniform", "quantile", "kmeans", "custom"]
-Bins: TypeAlias = (
- IntLargerOne
- | Sequence[Scalar]
- | dict[str, IntLargerOne | Sequence[Scalar]]
-)
+Bins: TypeAlias = IntLargerOne | Sequence[Scalar] | dict[str, IntLargerOne | Sequence[Scalar]]
NormalizerStrats: TypeAlias = Literal["yeojohnson", "boxcox", "quantile"]
PrunerStrats: TypeAlias = Literal[
"zscore", "iforest", "ee", "lof", "svm", "dbscan", "hdbscan", "optics"
@@ -271,9 +274,7 @@ def predict(self, *args, **kwargs) -> Pandas: ...
# Runner parameters
NItems: TypeAlias = (
- IntLargerEqualZero
- | dict[str, IntLargerEqualZero]
- | Sequence[IntLargerEqualZero]
+ IntLargerEqualZero | dict[str, IntLargerEqualZero] | Sequence[IntLargerEqualZero]
)
# Allowed values for method selection
@@ -281,8 +282,13 @@ def predict(self, *args, **kwargs) -> Pandas: ...
"decision_function", "predict", "predict_log_proba", "predict_proba", "score"
]
PredictionMethodsTS: TypeAlias = Literal[
- "predict", "predict_interval", "predict_proba", "predict_quantiles",
- "predict_residuals", "predict_var", "score"
+ "predict",
+ "predict_interval",
+ "predict_proba",
+ "predict_quantiles",
+ "predict_residuals",
+ "predict_var",
+ "score",
]
# Plotting parameters
@@ -290,12 +296,18 @@ def predict(self, *args, **kwargs) -> Pandas: ...
ParamsSelector: TypeAlias = str | Segment | Sequence[IntLargerEqualZero | str]
TargetSelector: TypeAlias = IntLargerEqualZero | str
TargetsSelector: TypeAlias = TargetSelector | tuple[TargetSelector, ...]
-Kind: TypeAlias = Literal[
- "average", "individual", "average+individual", "individual+average"
-]
+Kind: TypeAlias = Literal["average", "individual", "average+individual", "individual+average"]
Legend: TypeAlias = Literal[
- "upper left", "lower left", "upper right", "lower right", "upper center",
- "lower center", "center left", "center right", "center", "out",
+ "upper left",
+ "lower left",
+ "upper right",
+ "lower right",
+ "upper center",
+ "lower center",
+ "center left",
+ "center right",
+ "center",
+ "out",
]
# Mlflow stages
diff --git a/atom/utils/utils.py b/atom/utils/utils.py
index c76e51a76..07a258d9c 100644
--- a/atom/utils/utils.py
+++ b/atom/utils/utils.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -76,6 +74,7 @@
# Classes ========================================================== >>
+
class NotFittedError(ValueError, AttributeError):
"""Exception called when the instance is not yet fitted.
@@ -124,7 +123,7 @@ def infer_task(self, y: Pandas) -> Task:
return Task.multiclass_multioutput_classification
elif isinstance(y.iloc[0], sequence_t):
return Task.multilabel_classification
- elif y.nunique() == 1:
+ elif y.nunique() == 1: # noqa: PD101
raise ValueError(f"Only found 1 target value: {y.unique()[0]}")
elif y.nunique() == 2:
return Task.binary_classification
@@ -473,7 +472,7 @@ def __init__(self, scorer: Scorer, task: Task):
self.task = task
@property
- def __name__(self) -> str:
+ def __name__(self) -> str: # noqa: A003
"""Return the scorer's name."""
return self.scorer.name
@@ -510,35 +509,17 @@ class Table:
Parameters
----------
headers: sequence
- Name of each column in the table. If an element is a tuple,
- the second element should be the position of the text in the
- cell (left or right).
+ Name of each column in the table.
spaces: sequence
Width of each column. Should have the same length as `headers`.
- default_pos: str, default="right"
- Default position of the text in the cell.
-
"""
- def __init__(
- self,
- headers: Sequence[str | tuple[str, str]],
- spaces: Sequence[Int],
- default_pos: str = "right",
- ):
- self.headers = []
- self.positions = []
- for header in headers:
- if isinstance(header, tuple):
- self.headers.append(header[0])
- self.positions.append(header[1])
- else:
- self.headers.append(header)
- self.positions.append(default_pos)
-
+ def __init__(self, headers: Sequence[str], spaces: Sequence[Int]):
+ self.headers = headers
self.spaces = spaces
+ self.positions = ["left"] + (len(headers) - 1) * ["right"]
@staticmethod
def to_cell(text: Scalar | str, position: str, space: Int) -> str:
@@ -550,7 +531,7 @@ def to_cell(text: Scalar | str, position: str, space: Int) -> str:
Value to add to the cell.
position: str
- Position of text in cell. Choose from: right, left.
+ Position of the text in the cell. Choose from: right, left.
space: int
Maximum char length in the cell.
@@ -563,7 +544,7 @@ def to_cell(text: Scalar | str, position: str, space: Int) -> str:
"""
text = str(text)
if len(text) > space:
- text = text[:space - 2] + ".."
+ text = text[: space - 2] + ".."
if position == "right":
return text.rjust(space)
@@ -579,7 +560,7 @@ def print_header(self) -> str:
New row with column names.
"""
- return self.print({k: k for k in self.headers})
+ return self.pprint({k: k for k in self.headers})
def print_line(self) -> str:
"""Print a line with dashes.
@@ -593,9 +574,9 @@ def print_line(self) -> str:
New row with dashes.
"""
- return self.print({k: "-" * s for k, s in zip(self.headers, self.spaces)})
+ return self.pprint({k: "-" * s for k, s in zip(self.headers, self.spaces, strict=True)})
- def print(self, sequence: dict[str, Any] | pd.Series) -> str:
+ def pprint(self, sequence: dict[str, Any] | pd.Series) -> str:
"""Convert a sequence to a nice formatted table row.
Parameters
@@ -610,8 +591,8 @@ def print(self, sequence: dict[str, Any] | pd.Series) -> str:
"""
out = []
- for header, pos, space in zip(self.headers, self.positions, self.spaces):
- out.append(self.to_cell(rnd(sequence.get(header, "---")), pos, space))
+ for h, p, s in zip(self.headers, self.positions, self.spaces, strict=True):
+ out.append(self.to_cell(rnd(sequence.get(h, "---")), p, s))
return "| " + " | ".join(out) + " |"
@@ -671,10 +652,7 @@ def __call__(self, study: Study, trial: FrozenTrial):
if estimator := trial_info["estimator"]:
# Mlflow only accepts params with char length <=250
mlflow.log_params(
- {
- k: v for k, v in estimator.get_params().items()
- if len(str(v)) <= 250
- }
+ {k: v for k, v in estimator.get_params().items() if len(str(v)) <= 250}
)
mlflow.sklearn.log_model(
@@ -688,17 +666,14 @@ def __call__(self, study: Study, trial: FrozenTrial):
)
else:
mlflow.log_params(
- {
- k: v for k, v in trial.params.items()
- if len(str(v)) <= 250
- }
+ {k: v for k, v in trial.params.items() if len(str(v)) <= 250}
)
if self.n_jobs == 1:
# Print overview of trials
trial_info["time_trial"] = time_to_str(trial_info["time_trial"])
trial_info["time_ht"] = time_to_str(trial_info["time_ht"])
- self.T._log(self._table.print(trial_info), 2)
+ self.T._log(self._table.pprint(trial_info), 2)
def create_table(self) -> Table:
"""Create the trial table.
@@ -709,7 +684,7 @@ def create_table(self) -> Table:
Object to display the trial overview.
"""
- headers = [("trial", "left")] + list(self.T._ht["distributions"])
+ headers = ["trial", *self.T._ht["distributions"]]
for m in self.T._metric:
headers.extend([m.name, "best_" + m.name])
headers.extend(["time_trial", "time_ht", "state"])
@@ -727,13 +702,10 @@ def create_table(self) -> Table:
spaces.append(max(7, len(name), options))
spaces.extend(
- [
- max(7, len(column))
- for column in headers[1 + len(self.T._ht["distributions"]):-1]
- ]
+ [max(7, len(column)) for column in headers[1 + len(self.T._ht["distributions"]) : -1]]
)
- return Table(headers, spaces + [8])
+ return Table(headers, [*spaces, 8])
class PlotCallback:
@@ -759,12 +731,8 @@ class PlotCallback:
max_len = 15 # Maximum trials to show at once in the plot
def __init__(self, name: str, metric: list[str], aesthetics: Aesthetics):
- self.y1: dict[int, deque] = {
- i: deque(maxlen=self.max_len) for i in range(len(metric))
- }
- self.y2: dict[int, deque] = {
- i: deque(maxlen=self.max_len) for i in range(len(metric))
- }
+ self.y1: dict[int, deque] = {i: deque(maxlen=self.max_len) for i in range(len(metric))}
+ self.y2: dict[int, deque] = {i: deque(maxlen=self.max_len) for i in range(len(metric))}
traces: list[go.Scatter] = []
colors = cycle(aesthetics.palette)
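The collapsed one-liners keep a rolling window of trial scores per metric; a standalone sketch of the `maxlen` behavior:

    from collections import deque

    window = deque(maxlen=3)
    for trial in range(5):
        window.append(trial)
    list(window)  # [2, 3, 4] -- oldest scores are dropped silently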
@@ -774,13 +742,13 @@ def __init__(self, name: str, metric: list[str], aesthetics: Aesthetics):
[
go.Scatter(
mode="lines+markers",
- line=dict(width=aesthetics.line_width, color=color),
- marker=dict(
- symbol="circle",
- size=aesthetics.marker_size,
- line=dict(width=1, color="white"),
- opacity=1,
- ),
+ line={"width": aesthetics.line_width, "color": color},
+ marker={
+ "symbol": "circle",
+ "size": aesthetics.marker_size,
+ "line": {"width": 1, "color": "white"},
+ "opacity": 1,
+ },
name=met,
legendgroup=met,
xaxis="x2",
@@ -788,66 +756,72 @@ def __init__(self, name: str, metric: list[str], aesthetics: Aesthetics):
),
go.Scatter(
mode="lines+markers",
- line=dict(width=aesthetics.line_width, color=color),
- marker=dict(
- line=dict(width=1, color="rgba(255, 255, 255, 0.9)"),
- symbol="circle",
- size=aesthetics.marker_size,
- opacity=1,
- ),
+ line={"width": aesthetics.line_width, "color": color},
+ marker={
+ "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"},
+ "symbol": "circle",
+ "size": aesthetics.marker_size,
+ "opacity": 1,
+ },
name=met,
legendgroup=met,
showlegend=False,
xaxis="x2",
yaxis="y2",
- )
+ ),
]
)
self.figure = go.FigureWidget(
data=traces,
- layout=dict(
- xaxis1=dict(domain=(0, 1), anchor="y1", showticklabels=False),
- yaxis1=dict(
- domain=(0.31, 1.0),
- title=dict(text="Score", font_size=aesthetics.label_fontsize),
- anchor="x1",
- ),
- xaxis2=dict(
- domain=(0, 1),
- title=dict(text="Trial", font_size=aesthetics.label_fontsize),
- anchor="y2",
- ),
- yaxis2=dict(
- domain=(0, 0.29),
- title=dict(text="d", font_size=aesthetics.label_fontsize),
- anchor="x2",
- ),
- title=dict(
- text=f"Hyperparameter tuning for {name}",
- x=0.5,
- y=1,
- pad=dict(t=15, b=15),
- xanchor="center",
- yanchor="top",
- xref="paper",
- font_size=aesthetics.title_fontsize,
- ),
- legend=dict(
- x=0.99,
- y=0.99,
- xanchor="right",
- yanchor="top",
- font_size=aesthetics.label_fontsize,
- bgcolor="rgba(255, 255, 255, 0.5)",
- ),
- hovermode="x unified",
- hoverlabel=dict(font_size=aesthetics.label_fontsize),
- font_size=aesthetics.tick_fontsize,
- margin=dict(l=0, b=0, r=0, t=25 + aesthetics.title_fontsize, pad=0),
- width=900,
- height=800,
- )
+ layout={
+ "xaxis1": {"domain": (0, 1), "anchor": "y1", "showticklabels": False},
+ "yaxis1": {
+ "domain": (0.31, 1.0),
+ "title": {"text": "Score", "font_size": aesthetics.label_fontsize},
+ "anchor": "x1",
+ },
+ "xaxis2": {
+ "domain": (0, 1),
+ "title": {"text": "Trial", "font_size": aesthetics.label_fontsize},
+ "anchor": "y2",
+ },
+ "yaxis2": {
+ "domain": (0, 0.29),
+ "title": {"text": "d", "font_size": aesthetics.label_fontsize},
+ "anchor": "x2",
+ },
+ "title": {
+ "text": f"Hyperparameter tuning for {name}",
+ "x": 0.5,
+ "y": 1,
+ "pad": {"t": 15, "b": 15},
+ "xanchor": "center",
+ "yanchor": "top",
+ "xref": "paper",
+ "font_size": aesthetics.title_fontsize,
+ },
+ "legend": {
+ "x": 0.99,
+ "y": 0.99,
+ "xanchor": "right",
+ "yanchor": "top",
+ "font_size": aesthetics.label_fontsize,
+ "bgcolor": "rgba(255, 255, 255, 0.5)",
+ },
+ "hovermode": "x unified",
+ "hoverlabel": {"font_size": aesthetics.label_fontsize},
+ "font_size": aesthetics.tick_fontsize,
+ "margin": {
+ "l": 0,
+ "b": 0,
+ "r": 0,
+ "t": 25 + aesthetics.title_fontsize,
+ "pad": 0,
+ },
+ "width": 900,
+ "height": 800,
+ },
)
display(self.figure)
@@ -874,9 +848,9 @@ def __call__(self, study: Study, trial: FrozenTrial):
self.y2[i].append(None)
# Update trace data
- self.figure.data[i * 2].x = list(x[:len(self.y1[i])])
+ self.figure.data[i * 2].x = list(x[: len(self.y1[i])])
self.figure.data[i * 2].y = list(self.y1[i])
- self.figure.data[i * 2 + 1].x = list(x[:len(self.y1[i])])
+ self.figure.data[i * 2 + 1].x = list(x[: len(self.y1[i])])
self.figure.data[i * 2 + 1].y = list(self.y2[i])
@@ -947,11 +921,11 @@ def explainer(self) -> Explainer:
"""
# Pass masker as np.array and feature names separately for modin frames
- kwargs = dict(
- masker=self.branch.X_train.to_numpy(),
- feature_names=list(self.branch.features),
- seed=self.random_state,
- )
+ kwargs = {
+ "masker": self.branch.X_train.to_numpy(),
+ "feature_names": list(self.branch.features),
+ "seed": self.random_state,
+ }
try: # Fails when model does not fit standard explainers (e.g., ensembles)
return Explainer(self.estimator, **kwargs)
except TypeError:
@@ -1005,7 +979,7 @@ def get_explanation(
raise ValueError(
"Failed to get shap's explainer for estimator "
f"{self.estimator} with task {self.task}. Exception: {ex}"
- )
+ ) from None
# Remember shap values in the _shap_values attribute
self._shap_values = bk.concat(
@@ -1061,7 +1035,7 @@ def _get_data(self, key: Any) -> Any:
try:
return self.__data[key]
except IndexError:
- raise KeyError(key)
+ raise KeyError(key) from None
else:
for data in self.__data:
if self._conv(getattr(data, self.__key)) == self._conv(key):
@@ -1107,7 +1081,6 @@ def __setitem__(self, key: Any, value: Any):
try:
self.__data = [e if self[key] == e else value for e in self.__data]
except KeyError:
- assert key == getattr(value, self.__key)
self.append(value)
def __delitem__(self, key: Any):
@@ -1134,7 +1107,7 @@ def __reversed__(self) -> Iterator[Any]:
"""Reverse order of the mapping."""
yield from reversed(list(self.__data))
- def __eq__(self, other: Any) -> bool:
+ def __eq__(self, other: object) -> bool:
"""Compare equality of the instances."""
return self.__data == other
@@ -1189,6 +1162,7 @@ def index(self, key: Any) -> Any:
# Functions ======================================================== >>
+
def flt(x: Any) -> Any:
"""Return item from sequence with just that item.
@@ -1532,7 +1506,7 @@ def check_dependency(name: str):
)
-def check_nltk_module(module: str, quiet: bool):
+def check_nltk_module(module: str, *, quiet: bool):
"""Check if a module for the NLTK package is avaialble.
If the module isn't available, it's downloaded.
@@ -1724,14 +1698,12 @@ def get_corpus(df: DataFrame) -> str:
try:
corpus = next(col for col in df.columns if col.lower() == "corpus")
- if not is_bearable(df[corpus].iat[0], (str, Sequence[str])):
- raise TypeError(
- "The corpus should consist of a string or sequence of strings."
- )
+ if not is_bearable(df[corpus].iloc[0], (str, Sequence[str])):
+ raise TypeError("The corpus should consist of a string or sequence of strings.")
else:
return corpus
- except StopIteration:
- raise ValueError("The provided dataset does not contain a column named corpus.")
+ except StopIteration as ex:
+ raise ValueError("The provided dataset does not contain a column named corpus.") from ex
def time_to_str(t: Scalar) -> str:
@@ -1782,7 +1754,7 @@ def n_cols(data: XSelector | YSelector) -> int:
return array.ndim # Can be zero when input is a dict
-def to_pyarrow(column: Series, inverse: bool = False) -> Dtype:
+def to_pyarrow(column: Series, *, inverse: bool = False) -> Dtype:
"""Get the pyarrow dtype corresponding to a series.
Parameters
@@ -1817,7 +1789,8 @@ def to_df(
index: Axes | None = ...,
columns: Axes | None = ...,
dtype: DtypeArg | None = ...,
-) -> None: ...
+) -> None:
+ ...
@overload
@@ -1826,7 +1799,8 @@ def to_df(
index: Axes | None = ...,
columns: Axes | None = ...,
dtype: DtypeArg | None = ...,
-) -> DataFrame: ...
+) -> DataFrame:
+ ...
def to_df(
@@ -1863,7 +1837,7 @@ def to_df(
if not isinstance(data, bk.DataFrame):
# Assign default column names (dict already has column names)
if not isinstance(data, dict | Pandas) and columns is None:
- columns = [f"x{str(i)}" for i in range(n_cols(data))]
+ columns = [f"x{i}" for i in range(n_cols(data))]
if hasattr(data, "to_pandas") and bk.__name__ == "pandas":
# Convert cuML to pandas
@@ -1875,7 +1849,7 @@ def to_df(
columns=columns,
)
else:
- data_c = pd.DataFrame(data, index, columns) # type: ignore
+ data_c = pd.DataFrame(data, index, columns) # type: ignore[arg-type, misc]
else:
data_c = data
@@ -1896,7 +1870,8 @@ def to_series(
index: Axes | None = ...,
name: Hashable | None = ...,
dtype: Dtype | None = ...,
-) -> None: ...
+) -> None:
+ ...
@overload
@@ -1905,7 +1880,8 @@ def to_series(
index: Axes | None = ...,
name: Hashable | None = ...,
dtype: Dtype | None = ...,
-) -> Series: ...
+) -> Series:
+ ...
def to_series(
@@ -1967,7 +1943,8 @@ def to_pandas(
columns: Axes | None = ...,
name: str | None = ...,
dtype: DtypeArg | None = ...,
-) -> None: ...
+) -> None:
+ ...
@overload
@@ -1977,7 +1954,8 @@ def to_pandas(
columns: Axes | None = ...,
name: str | None = ...,
dtype: DtypeArg | None = ...,
-) -> Pandas: ...
+) -> Pandas:
+ ...
def to_pandas(
@@ -2017,13 +1995,14 @@ def to_pandas(
"""
if n_cols(data) == 1:
- return to_series(data, index=index, name=name, dtype=dtype) # type: ignore
+ return to_series(data, index=index, name=name, dtype=dtype) # type: ignore[misc, arg-type]
else:
return to_df(data, index=index, columns=columns, dtype=dtype)
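The reflowed `... -> None:` overload bodies change nothing for the type checker: `data=None` is still typed as returning None, and anything else as a Series or DataFrame. A sketch of the contract:

    to_pandas(None)      # typed (and returned) as None
    to_pandas([[1, 2]])  # typed as Pandas; 2-D input, so a DataFrame here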
def check_is_fitted(
obj: Any,
+ *,
exception: Bool = True,
attributes: str | Sequence[str] | None = None,
) -> bool:
@@ -2118,34 +2097,34 @@ def get_custom_scorer(metric: str | MetricFunction | Scorer) -> Scorer:
"""
if isinstance(metric, str):
- custom_acronyms = dict(
- ap="average_precision",
- ba="balanced_accuracy",
- auc="roc_auc",
- logloss="neg_log_loss",
- ev="explained_variance",
- me="max_error",
- mae="neg_mean_absolute_error",
- mse="neg_mean_squared_error",
- rmse="neg_root_mean_squared_error",
- msle="neg_mean_squared_log_error",
- mape="neg_mean_absolute_percentage_error",
- medae="neg_median_absolute_error",
- poisson="neg_mean_poisson_deviance",
- gamma="neg_mean_gamma_deviance",
- )
+ custom_acronyms = {
+ "ap": "average_precision",
+ "ba": "balanced_accuracy",
+ "auc": "roc_auc",
+ "logloss": "neg_log_loss",
+ "ev": "explained_variance",
+ "me": "max_error",
+ "mae": "neg_mean_absolute_error",
+ "mse": "neg_mean_squared_error",
+ "rmse": "neg_root_mean_squared_error",
+ "msle": "neg_mean_squared_log_error",
+ "mape": "neg_mean_absolute_percentage_error",
+ "medae": "neg_median_absolute_error",
+ "poisson": "neg_mean_poisson_deviance",
+ "gamma": "neg_mean_gamma_deviance",
+ }
- custom_scorers = dict(
- tn=true_negatives,
- fp=false_positives,
- fn=false_negatives,
- tp=true_positives,
- fpr=false_positive_rate,
- tpr=true_positive_rate,
- tnr=true_negative_rate,
- fnr=false_negative_rate,
- mcc=matthews_corrcoef,
- )
+ custom_scorers = {
+ "tn": true_negatives,
+ "fp": false_positives,
+ "fn": false_negatives,
+ "tp": true_positives,
+ "fpr": false_positive_rate,
+ "tpr": true_positive_rate,
+ "tnr": true_negative_rate,
+ "fnr": false_negative_rate,
+ "mcc": matthews_corrcoef,
+ }
metric = metric.lower()
if metric in get_scorer_names():
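Given the literal mappings above, lookups behave like this sketch (assuming acronyms are resolved before falling back to `get_scorer`):

    get_custom_scorer("auc")   # acronym -> sklearn's "roc_auc" scorer
    get_custom_scorer("rmse")  # -> "neg_root_mean_squared_error"
    get_custom_scorer("tp")    # wraps the custom true_positives function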
@@ -2190,6 +2169,7 @@ def get_custom_scorer(metric: str | MetricFunction | Scorer) -> Scorer:
# Pipeline functions =============================================== >>
+
def name_cols(
array: TReturn,
original_df: DataFrame,
@@ -2230,12 +2210,13 @@ def name_cols(
lambda c: np.array_equal(
a1=c,
a2=col,
- equal_nan=is_numeric_dtype(c) and np.issubdtype(col.dtype, np.number)),
+ equal_nan=is_numeric_dtype(c) and np.issubdtype(col.dtype, np.number),
+ ),
)
- if any(mask) and mask[mask].index.values[0] not in temp_cols:
+ if any(mask) and mask[mask].index[0] not in temp_cols:
# If the column is equal, use the existing name
- temp_cols.append(mask[mask].index.values[0])
+ temp_cols.append(mask[mask].index[0])
else:
# If the column is new, use a default name
counter = 0
@@ -2285,9 +2266,7 @@ def get_col_order(
# Add all derivative columns: columns that originate from another
# and start with its progenitor name, e.g., one-hot encoded columns
- columns.extend(
- [c for c in df.columns if c.startswith(f"{col}_") and c not in og_columns]
- )
+ columns.extend([c for c in df.columns if c.startswith(f"{col}_") and c not in og_columns])
# Add remaining new columns (non-derivatives)
columns.extend([col for col in df.columns if col not in columns])
@@ -2338,13 +2317,13 @@ def reorder_cols(
# Force new indices on old dataset for merge
try:
original_df.index = df.index
- except ValueError: # Length mismatch
+ except ValueError as ex: # Length mismatch
raise IndexError(
f"Length of values ({len(df)}) does not match length of "
f"index ({len(original_df)}). This usually happens when "
"transformations that drop rows aren't applied on all "
"the columns."
- )
+ ) from ex
columns = get_col_order(df, original_df.columns.tolist(), col_names)
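The added `from ex` preserves the original traceback as `__cause__` (the pattern ruff's B904 check asks for), while `from None`, used elsewhere in this diff, suppresses the implicit context entirely. A generic sketch:

    try:
        {}["missing"]
    except KeyError as ex:
        raise ValueError("lookup failed") from ex  # __cause__ is the KeyError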
@@ -2631,6 +2610,7 @@ def fit_transform_one(
# Patches ========================================================== >>
+
def fit_and_score(*args, **kwargs) -> dict[str, Any]:
"""Wrap sklearn's _fit_and_score function.
@@ -2668,6 +2648,7 @@ def wrapper(*args, **kwargs) -> Float | dict[str, Float]:
# Decorators ======================================================= >>
+
def cache(f: Callable) -> Callable:
"""Cache method utility.
@@ -2775,7 +2756,7 @@ def decorator(f: Callable) -> Callable:
def crash(
f: Callable,
- cache: dict[str, Exception | None] = {"last_exception": None},
+ cache: dict[str, Exception | None] = {"last_exception": None}, # noqa: B006
) -> Callable:
"""Save program crashes to log file.
@@ -2856,6 +2837,7 @@ def wrapper(
# Custom scorers =================================================== >>
+
def true_negatives(y_true: Sequence[Int], y_pred: Sequence[Int]) -> Int:
"""Outcome where the model correctly predicts the negative class."""
return confusion_matrix(y_true, y_pred).ravel()[0]
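The index into the raveled matrix relies on sklearn's binary layout `[tn, fp, fn, tp]`; a quick check:

    true_negatives([0, 1, 0, 1], [0, 1, 1, 1])  # one correct negative -> 1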
diff --git a/docs_sources/contributing.md b/docs_sources/contributing.md
index 528ce8709..6894372fe 100644
--- a/docs_sources/contributing.md
+++ b/docs_sources/contributing.md
@@ -77,7 +77,7 @@ maybe an issue for your problem already exists, and the discussion
might inform you of workarounds readily available.
We want to fix all the issues as soon as possible, but before fixing a
-bug we need to reproduce and confirm it. In order to reproduce bugs we
+bug, we need to reproduce and confirm it. In order to reproduce bugs, we
will systematically ask you to provide a minimal reproduction scenario
using the custom issue template.
@@ -90,15 +90,14 @@ and accept your changes.
* Update the documentation so all of your changes are reflected there.
* Adhere to [PEP 8](https://peps.python.org/pep-0008/) standards.
-* Use a maximum of 91 characters per line. Try to keep docstrings below
+* Use a maximum of 99 characters per line. Try to keep docstrings below
74 characters.
* Update the project unit tests to test your code changes as thoroughly
as possible.
* Make sure that your code is properly commented with docstrings and
comments explaining your rationale behind non-obvious coding practices.
* Run [isort](https://pycqa.github.io/isort/): `isort atom tests`.
-* Run [flake8](https://github.com/pycqa/flake8): `flake8 --show-source --statistics atom tests`.
-* Run [pydocstyle](https://www.pydocstyle.org/en/stable/): `pydocstyle atom tests`.
+* Run [ruff](https://docs.astral.sh/ruff/): `ruff check --fix atom tests`.
* Run [mypy](https://www.mypy-lang.org/): `mypy atom tests`.
If your contribution requires a new library dependency:
diff --git a/docs_sources/dependencies.md b/docs_sources/dependencies.md
index 1c20d94a8..9ddf6b39f 100644
--- a/docs_sources/dependencies.md
+++ b/docs_sources/dependencies.md
@@ -43,6 +43,7 @@ packages are necessary for its correct functioning.
* **[pandas[parquet]](https://pandas.pydata.org/)** (>=2.1.2)
* **[plotly](https://plotly.com/python/)** (>=5.15.0)
* **[ray[serve]](https://docs.ray.io/en/latest/)** (>=2.7.1)
+* **[requests](https://requests.readthedocs.io/en/latest/)** (>=2.31.0)
* **[scikit-learn](https://scikit-learn.org/stable/)** (>=1.3.1)
* **[scikit-learn-intelex](https://github.com/intel/scikit-learn-intelex)** (>=2023.2.1)
* **[scipy](https://www.scipy.org/)** (>=1.10.1)
@@ -74,17 +75,17 @@ additional libraries. You can install all the optional dependencies using
The development dependencies are not installed with the package, and are
not required for any of its functionalities. These libraries are only
necessary to [contribute][contributing] to the project. Install them
-running `pdm install --dev` (don't forget to install [pdm](https://pdm-project.org/latest/)
-with `pip install -U pdm`).
+running `pdm install --dev` (remember to install [pdm](https://pdm-project.org/latest/) with
+`pip install -U pdm`).
**Linting**
* **[isort](https://pycqa.github.io/isort/)** (>=5.12.0)
-* **[flake8](https://github.com/pycqa/flake8)** (>=6.0.0)
-* **[flake8-pyproject](https://github.com/john-hen/Flake8-pyproject)** (>=1.2.3)
-* **[pydocstyle](https://www.pydocstyle.org/en/stable/)** (>=6.3.0)
* **[mypy](https://www.mypy-lang.org/)** (>=1.6.1)
* **[pandas_stubs](https://pypi.org/project/pandas-stubs/)** (>=2.1.1.230928)
+* **[pre-commit](https://pre-commit.com/)** (>=3.5.0)
+* **[pre-commit-hooks](https://github.com/pre-commit/pre-commit-hooks)** (>=4.5.0)
+* **[ruff](https://docs.astral.sh/ruff/)** (>=0.1.7)
* **[types-requests](https://github.com/python/typeshed)** (>=2.31.0.10)
**Testing**
@@ -92,6 +93,7 @@ with `pip install -U pdm`).
* **[nbmake](https://github.com/treebeardtech/nbmake)** (>=1.4.1)
* **[pytest](https://docs.pytest.org/en/latest/)** (>=7.2.1)
* **[pytest-cov](https://pytest-cov.readthedocs.io/en/latest/)** (>=4.0.0)
+* **[pytest-mock](https://github.com/pytest-dev/pytest-mock/)** (>=3.12.0)
* **[pytest-xdist](https://github.com/pytest-dev/pytest-xdist)** (>=3.2.0)
* **[scikeras](https://github.com/adriangb/scikeras)** (>=0.11.0)
* **[tensorflow](https://www.tensorflow.org/learn)** (>=2.13.0)
diff --git a/docs_sources/scripts/autodocs.py b/docs_sources/scripts/autodocs.py
index cf230829a..4a88f2be0 100644
--- a/docs_sources/scripts/autodocs.py
+++ b/docs_sources/scripts/autodocs.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -11,11 +9,17 @@
import importlib
import json
-import os
from dataclasses import dataclass
from inspect import (
- Parameter, getdoc, getmembers, getsourcelines, isclass, isfunction,
- ismethod, isroutine, signature,
+ Parameter,
+ getdoc,
+ getmembers,
+ getsourcelines,
+ isclass,
+ isfunction,
+ ismethod,
+ isroutine,
+ signature,
)
from typing import Any, Optional
from collections.abc import Callable
@@ -24,7 +28,7 @@
import yaml
from mkdocs.config.defaults import MkDocsConfig
-from atom.utils.utils import Goal, Task
+from atom.utils.utils import Goal
# Variables ======================================================== >>
@@ -245,6 +249,7 @@
# Classes ========================================================== >>
+
@dataclass
class DummyTrainer:
"""Dummy trainer class to call model instances."""
@@ -304,7 +309,7 @@ class AutoDocs:
r"\Z",
)
- def __init__(self, obj: Callable, method: Optional[Callable] = None):
+ def __init__(self, obj: Callable, method: Callable | None = None):
if method:
self.obj = getattr(obj, method)
self.method = method
@@ -419,9 +424,9 @@ def get_signature(self) -> str:
for k, v in params.items():
if k not in ("cls", "self") and not k.startswith("_"):
if v.default == Parameter.empty:
- if '**' in str(v):
+ if "**" in str(v):
sign.append(f"**{k}") # Add ** to kwargs
- elif '*' in str(v):
+ elif "*" in str(v):
sign.append(f"*{k}") # Add * to args
else:
sign.append(k)
@@ -433,7 +438,7 @@ def get_signature(self) -> str:
sign = f"({', '.join(sign)})"
- f = self.obj.__module__.replace('.', '/') # Module and filename sep by /
+ f = self.obj.__module__.replace(".", "/") # Module and filename sep by /
if "atom" in self.obj.__module__:
url = f"https://github.com/tvdboom/ATOM/blob/master/{f}.py"
elif "sklearn" in self.obj.__module__:
@@ -442,7 +447,7 @@ def get_signature(self) -> str:
url = ""
anchor = f""
- module = self.obj.__module__ + '.' if obj != "method" else ""
+ module = self.obj.__module__ + "." if obj != "method" else ""
obj = f"{obj}"
name = f"{self.obj.__name__}"
if url:
@@ -479,7 +484,7 @@ def get_description(self) -> str:
"""
pattern = f".*?(?={'|'.join(self.blocks)})"
- match = re.match(pattern, self.doc[len(self.get_summary()):], re.S)
- match = re.match(pattern, self.doc[len(self.get_summary()):], re.S)
+ match = re.match(pattern, self.doc[len(self.get_summary()) :], re.S)
return match.group() if match else ""
def get_see_also(self) -> str:
@@ -555,7 +560,8 @@ def get_table(self, blocks: list) -> str:
attrs = include
else:
attrs = [
- m for m, _ in getmembers(self.obj, lambda x: not isroutine(x))
+ m
+ for m, _ in getmembers(self.obj, lambda x: not isroutine(x))
if not m.startswith("_")
and not any(re.fullmatch(p, m) for p in config.get("exclude", []))
]
@@ -610,7 +616,7 @@ def get_table(self, blocks: list) -> str:
pass
# Get the body corresponding to the header
- pattern = f"(?<={re.escape(header)}\n).*?(?=\n\w|\n\*|\n\[|\Z)"
+ pattern = f"(?<={re.escape(header)}\n).*?(?=\n\\w|\n\\*|\n\\[|\\Z)"
body = re.search(pattern, match, re.S | re.M).group()
header = header.replace("*", r"\*") # Use literal * for args/kwargs
@@ -731,7 +737,8 @@ def get_methods(self, config: dict) -> str:
methods = include
else:
methods = [
- m for m, _ in getmembers(self.obj, predicate=predicate)
+ m
+ for m, _ in getmembers(self.obj, predicate=predicate)
if not m.startswith("_") and not any(re.fullmatch(p, m) for p in exclude)
]
@@ -768,6 +775,7 @@ def get_methods(self, config: dict) -> str:
# Functions ======================================================== >>
+
def render(markdown: str, **kwargs) -> str:
"""Render the markdown page.
@@ -792,14 +800,14 @@ def render(markdown: str, **kwargs) -> str:
"""
autodocs = None
- while match := re.search("(:: )(\w.*?)(?=::|\n\n|\Z)", markdown, re.S):
+ while match := re.search("(:: )(\\w.*?)(?=::|\n\n|\\Z)", markdown, re.S):
command = yaml.safe_load(match.group(2))
# Commands should always be dicts with the configuration as a list in values
if isinstance(command, str):
if ":" in command:
autodocs = AutoDocs.get_obj(command)
- markdown = markdown[:match.start()] + markdown[match.end():]
+ markdown = markdown[: match.start()] + markdown[match.end() :]
continue
else:
command = {command: None} # Has no options specified
@@ -833,7 +841,7 @@ def render(markdown: str, **kwargs) -> str:
else:
text = ""
- markdown = markdown[:match.start()] + text + markdown[match.end():]
+ markdown = markdown[: match.start()] + text + markdown[match.end() :]
# Change the custom autorefs now to use [self-...][]
markdown = custom_autorefs(markdown, autodocs)
@@ -930,7 +938,7 @@ def clean_search(config: MkDocsConfig):
Object containing the search index.
"""
- with open(f"{config.data['site_dir']}/search/search_index.json", 'r') as f:
+ with open(f"{config.data['site_dir']}/search/search_index.json") as f:
search = json.load(f)
for elem in search["docs"]:
@@ -938,9 +946,11 @@ def clean_search(config: MkDocsConfig):
elem["text"] = re.sub(r"window\.PLOTLYENV.*?\)\s*?}\s*?", "", elem["text"], flags=re.S)
# Remove mkdocs-jupyter css
- elem["text"] = re.sub(r"\(function \(global, factory.*?(?=Example:)", "", elem["text"], flags=re.S)
+ elem["text"] = re.sub(
+ r"\(function \(global, factory.*?(?=Example:)", "", elem["text"], flags=re.S
+ )
- with open(f"{config.data['site_dir']}/search/search_index.json", 'w') as f:
+ with open(f"{config.data['site_dir']}/search/search_index.json", "w") as f:
json.dump(search, f)
@@ -981,7 +991,7 @@ def custom_autorefs(markdown: str, autodocs: Optional[AutoDocs] = None) -> str:
text = match.group()
if not link:
# Only adapt when has form [anchor][]
- link = anchor.replace(' ', '-').replace('.', '').lower()
+ link = anchor.replace(" ", "-").replace(".", "").lower()
text = f"[{anchor}][{link}]"
if link in CUSTOM_URLS:
# Replace keyword with custom url
@@ -990,7 +1000,7 @@ def custom_autorefs(markdown: str, autodocs: Optional[AutoDocs] = None) -> str:
link = link.replace("self", autodocs.obj.__name__.lower())
text = f"[{anchor}][{link}]"
- result += markdown[start:match.start()] + text
+ result += markdown[start : match.start()] + text
start = match.end()
return result + markdown[start:]
diff --git a/docs_sources/scripts/autorun.py b/docs_sources/scripts/autorun.py
index df31bc4a0..173b12f8f 100644
--- a/docs_sources/scripts/autorun.py
+++ b/docs_sources/scripts/autorun.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -9,7 +7,6 @@
import ast
import os
-import re
import shutil
import sys
from base64 import b64encode
@@ -123,7 +120,7 @@ def latest_file() -> str | None:
end_line = node.end_lineno
# Get complete code block
- block = lines[node.lineno - 1: end_line]
+ block = lines[node.lineno - 1 : end_line]
if "# hide" not in line:
output[-1].extend([draw(code) for code in block])
@@ -154,12 +151,14 @@ def latest_file() -> str | None:
output.append([]) # Add new code block
if (f := latest_file()).endswith(".html"):
- with open(f"{DIR_EXAMPLES}{f}", 'r', encoding="utf-8") as file:
+ with open(f"{DIR_EXAMPLES}{f}", encoding="utf-8") as file:
figures.append(file.read())
else:
- with open(f"{DIR_EXAMPLES}{f}", 'rb') as file:
+ with open(f"{DIR_EXAMPLES}{f}", "rb") as file:
img = b64encode(file.read()).decode("utf-8")
- figures.append(f"")
+ figures.append(
+ f""
+ )
elif i > end_line:
output[-1].append(draw(line))
@@ -224,7 +223,7 @@ def to_html(code: list[str]) -> str:
language=language,
md=md,
options=options,
- **kwargs
+ **kwargs,
)
# First line of markdown page
@@ -243,7 +242,7 @@ def to_html(code: list[str]) -> str:
render.append(source)
- except Exception as e:
- raise SuperFencesException(f"Exception raised running code:\n{src}") from e
+ except Exception as ex:
+ raise SuperFencesException(f"Exception raised running code:\n{src}") from ex
return "
".join(render)
diff --git a/pyproject.toml b/pyproject.toml
index 149cc75bb..ed1f7081e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,6 +38,7 @@ dependencies = [
"pandas[parquet]>=2.1.2",
"plotly>=5.15.0",
"ray[serve]>=2.7.1",
+ "requests>=2.31.0",
"scikit-learn>=1.3.1",
"scikit-learn-intelex>=2023.2.1; platform_machine == 'x86_64' or platform_machine == 'AMD64'",
"scipy>=1.10.1",
@@ -64,16 +65,17 @@ full = [
dev = [
# Linting
"isort>=5.12.0",
- "flake8>=6.0.0",
- "flake8-pyproject>=1.2.3", # To configure flake8 with pyproject.toml
- "pydocstyle>=6.3.0",
"mypy>=1.6.1",
"pandas_stubs>=2.1.1.230928",
+ "pre-commit>=3.5.0",
+ "pre-commit-hooks>=4.5.0",
+ "ruff>=0.1.7",
"types-requests>=2.31.0.10",
# Testing
"nbmake>=1.4.1", # To test example notebooks
"pytest>=7.2.1",
"pytest-cov>=4.0.0",
+ "pytest-mock>=3.12.0",
"pytest-xdist>=3.2.0",
"scikeras>=0.11.0",
"tensorflow>=2.13.0",
@@ -108,18 +110,6 @@ testpaths = "tests/"
python_files = "*.py"
python_functions = "test_*"
-[tool.flake8]
-max-line-length = 91
-ignore = [
- "W605", # Invalid escape sequence: \s, \w, etc... (removed because of docstrings)
- "W503", # Line break before binary operator
- "E731", # Assign a lambda expression
- "E704", # Multiple statements on one line (removed because of mypy overloads)
-]
-per-file-ignores = [
- "__init__.py: F401", # Imported but unused
-]
-
[tool.isort]
skip_gitignore = true
multi_line_output = 5
@@ -129,9 +119,76 @@ include_trailing_comma = true
ignore_comments = true
remove_redundant_aliases = true
-[tool.pydocstyle]
+[tool.ruff]
+line-length = 99
+indent-width = 4
+show-fixes = true
+extend-include = ["*.ipynb"]
+select = [
+ "F", # pyflakes
+ "E", # pycodestyle errors
+ "W", # pycodestyle warnings
+ "D", # Missing docstring in public function
+ "UP", # pyupgrade
+ "YTT", # flake8-2020
+ "BLE", # blind-except
+ "FBT", # boolean-trap
+ "B", # bugbear
+ "A", # builtins
+ "COM", # commas
+ "C4", # flake8-comprehensions
+ "T10", # debugger
+ "ISC", # implici-str-concat
+ "ICN", # import-conventions
+ "G", # flake8-logging-format
+ "INP", # no-pep420
+ "PIE", # flake8-pie
+ "T20", # print
+ "PYI", # pyi
+ "PT", # flake8-pytest-style
+ "Q", # quotes
+ "RSE", # raise
+ "TID", # tidy-imports
+ "ARG", # flake8-unused-arguments
+ "ERA", # commented-code
+ "PD", # pandas-vet
+ "PGH", # pygrep
+ "FLY", # flynt
+ "NPY", # numpy-specific-rules
+ "PERF", # performance checks
+ "RUF", # ruff
+]
+ignore = [
+ "E731", # Do not assign a `lambda` expression, use a `def`
+ "COM812", # Trailing comma missing
+ "PD901", # Avoid using the generic variable name `df` for DataFrames
+ "PD011", # Use `.to_numpy()` instead of `.values`
+ "PD009", # Use `.iloc` instead of `.iat`
+ "ARG002", # Unused method argument
+ "B023", # Function definition does not bind loop variable
+]
+
+[tool.ruff.format]
+quote-style = "double"
+indent-style = "space"
+skip-magic-trailing-comma = false
+line-ending = "auto"
+
+[tool.ruff.per-file-ignores]
+"__init__.py" = ["F401"] # Unused imports
+"tests/*" = [
+ "T20", # print found
+ "S101", # assert called
+ "PT011", # error is too broad
+]
+"baserunner.py" = ["ERA001"] # Commented-out code
+"types.py" = [
+ "D105", # Missing docstring in magic method
+ "D102", # Missing docstring in public method
+]
+
+[tool.ruff.pydocstyle]
convention = "numpy"
-match = "(?!test_|types).*.py" # Skip files named test_* and types.py
[tool.mypy]
ignore_missing_imports = true
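The `[tool.ruff]` section replaces the flake8 and pydocstyle tables in one place: rule selection, formatting, and per-file ignores. As a quick orientation, here are a few illustrative lines (not taken from the codebase) together with the rule codes from the selected sets that would flag them; most of these are autofixable:

```python
import numpy as np

rest = [1, 2]
x = dict(a=1)             # C408 (flake8-comprehensions): prefer the literal {"a": 1}
cols = [0] + list(rest)   # RUF005: prefer the unpacking [0, *rest]
n = np.random.randint(2)  # NPY002: prefer np.random.default_rng().integers(2)
print(x, cols, n)         # T201 (print found): allowed under tests/* via the ignore
```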
diff --git a/tests/__init__.py b/tests/__init__.py
index 4d61540ba..27fac305c 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
diff --git a/tests/conftest.py b/tests/conftest.py
index 832d61b74..7f9c2b28a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -9,14 +7,11 @@
from __future__ import annotations
-from pathlib import Path
-from typing import Any
-from unittest.mock import patch
+from typing import TYPE_CHECKING, Any
import numpy as np
import pandas as pd
import pytest
-from _pytest.monkeypatch import MonkeyPatch
from sklearn.base import BaseEstimator
from sklearn.datasets import (
load_breast_cancer, load_diabetes, load_wine,
@@ -27,10 +22,17 @@
from sktime.datasets import load_airline, load_longley
from sktime.split import temporal_train_test_split
-from atom.utils.types import DataFrame, Pandas, Sequence, XSelector
from atom.utils.utils import merge, n_cols, to_df, to_pandas
+if TYPE_CHECKING:
+ from pathlib import Path
+
+ from _pytest.monkeypatch import MonkeyPatch
+
+ from atom.utils.types import DataFrame, Pandas, Sequence, XSelector
+
+
class DummyTransformer(BaseEstimator):
"""Transformer class for testing name keeping of arrays.
@@ -75,9 +77,11 @@ def transform(self, X: DataFrame) -> np.ndarray:
@pytest.fixture(autouse=True)
-def change_current_dir(tmp_path: Path, monkeypatch: MonkeyPatch):
+def _change_current_dir(tmp_path: Path, monkeypatch: MonkeyPatch):
"""Change the directory of the test to a temporary dir.
+ Avoid saving test files to the working directory.
+
Parameters
----------
tmp_path: pathlib.Path
@@ -91,10 +95,20 @@ def change_current_dir(tmp_path: Path, monkeypatch: MonkeyPatch):
@pytest.fixture(autouse=True)
-def mock_mlflow_log_model():
- """Mock mlflow's log_model function."""
- with patch("mlflow.sklearn.log_model"):
- yield
+def _mock_mlflow_log_model(mocker):
+ """Mock mlflow's log_model function.
+
+ This is by far mlflow's slowest method. Mocking it reduces the
+ average test time by several seconds.
+
+ """
+ mocker.patch("mlflow.sklearn.log_model")
+
+
+@pytest.fixture()
+def random():
+ """Return numpy's default random number generator."""
+ return np.random.default_rng()
def get_train_test(
@@ -147,7 +161,7 @@ def get_train_test(
"feature 1": pd.arrays.SparseArray([1, 0, 0, 0, 0, 0, 1, 0, 1, 0]),
"feature 2": pd.arrays.SparseArray([1, 0, 1, 0, 0, 1, 0, 0, 1, 0]),
"feature 3": pd.arrays.SparseArray([1, 1, 1, 0, 0, 0, 1, 0, 0, 0]),
- }
+ },
)
# Text data
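Three conventions from the conftest changes, sketched below: autouse fixtures that return nothing get a leading underscore (flake8-pytest-style PT004), pytest-mock's `mocker` fixture replaces the hand-rolled `unittest.mock.patch` context manager, and randomness comes from a shared `random` fixture built on `np.random.default_rng` (NPY002). The patch target here is hypothetical:

```python
import numpy as np
import pytest


@pytest.fixture(autouse=True)
def _quiet_fixture(mocker):
    """Patch is undone automatically at test teardown (pytest-mock)."""
    mocker.patch("time.sleep")  # hypothetical target


@pytest.fixture()
def random():
    """Numpy's default random number generator (replaces np.random.*)."""
    return np.random.default_rng()


def test_shape(random):
    X = random.integers(2, size=(150, 10))
    assert X.shape == (150, 10)
```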
diff --git a/tests/test_api.py b/tests/test_api.py
index bfd25f90b..88a64efe2 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
diff --git a/tests/test_atom.py b/tests/test_atom.py
index 9c40a3a5f..aef7c8725 100644
--- a/tests/test_atom.py
+++ b/tests/test_atom.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -8,7 +6,7 @@
"""
import glob
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch
import numpy as np
import pandas as pd
@@ -41,6 +39,7 @@
# Test __init__ ==================================================== >>
+
def test_task_assignment():
"""Assert that the correct task is assigned."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
@@ -82,6 +81,7 @@ def test_backend_with_n_jobs_1():
# Test magic methods =============================================== >>
+
def test_repr():
"""Assert that the __repr__ method visualizes the pipeline(s)."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
@@ -96,11 +96,12 @@ def test_iter():
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
atom.clean()
atom.impute()
- assert [item for item in atom][1] == atom.pipeline[1]
+ assert list(atom) == list(atom.pipeline.named_steps.values())
# Test utility properties =========================================== >>
+
def test_branch():
"""Assert that we can get the current branch."""
atom = ATOMClassifier(X10, y10, random_state=1)
@@ -268,12 +269,13 @@ def test_unavailable_regression_properties():
# Test utility methods ============================================= >>
+
@pytest.mark.parametrize("distributions", [None, "norm", ["norm", "pearson3"]])
def test_distribution(distributions):
"""Assert that the distribution method and file are created."""
atom = ATOMClassifier(X10_str, y10, random_state=1)
- df = atom.distribution(distributions=distributions, columns=(0, 1))
- assert isinstance(df, pd.DataFrame)
+ dist = atom.distribution(distributions=distributions, columns=(0, 1))
+ assert isinstance(dist, pd.DataFrame)
@patch("sweetviz.analyze")
@@ -366,7 +368,8 @@ def test_reset():
atom.encode()
atom.run("LR", errors="raise")
atom.reset(hard=True)
- assert not atom.models and len(atom._branches) == 1
+ assert not atom.models
+ assert len(atom._branches) == 1
assert atom["x2"].dtype.name == "object" # Is reset back to str
@@ -483,6 +486,7 @@ def test_transform_not_train_only():
# Test base transformers =========================================== >>
+
def test_add_after_model():
"""Assert that an error is raised when adding after training a model."""
atom = ATOMClassifier(X_bin, y_bin, verbose=1, random_state=1)
@@ -518,11 +522,13 @@ def test_add_train_only():
"""Assert that atom accepts transformers for the train set only."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
atom.add(StandardScaler(), train_only=True)
- assert check_scaling(atom.X_train) and not check_scaling(atom.X_test)
+ assert check_scaling(atom.X_train)
+ assert not check_scaling(atom.X_test)
len_train, len_test = len(atom.train), len(atom.test)
atom.add(Pruner(), train_only=True)
- assert len(atom.train) != len_train and len(atom.test) == len_test
+ assert len(atom.train) != len_train
+ assert len(atom.test) == len_test
def test_add_complete_dataset():
@@ -692,7 +698,7 @@ def test_add_pipeline():
steps=[
("scaler", StandardScaler()),
("sfm", SelectFromModel(RandomForestClassifier())),
- ]
+ ],
)
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
atom.add(pipeline)
@@ -714,11 +720,12 @@ def test_apply():
"""Assert that a function can be applied to the dataset."""
atom = ATOMClassifier(X_bin, y_bin, shuffle=False, random_state=1)
atom.apply(np.exp, columns=0)
- assert atom.iat[0, 0] == np.exp(X_bin.iat[0, 0])
+ assert atom.iloc[0, 0] == np.exp(X_bin.iloc[0, 0])
# Test data cleaning transformers =================================== >>
+
def test_balance_wrong_task():
"""Assert that an error is raised for regression and multioutput tasks."""
# For regression tasks
@@ -782,7 +789,8 @@ def test_prune():
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
len_train, len_test = len(atom.train), len(atom.test)
atom.prune(strategy="lof")
- assert len(atom.train) != len_train and len(atom.test) == len_test
+ assert len(atom.train) != len_train
+ assert len(atom.test) == len_test
def test_scale():
@@ -794,6 +802,7 @@ def test_scale():
# Test nlp transformers ============================================ >>
+
def test_textclean():
"""Assert that the textclean method cleans the corpus."""
atom = ATOMClassifier(X_text, y10, shuffle=False, random_state=1)
@@ -825,6 +834,7 @@ def test_vectorize():
# Test feature engineering transformers ============================ >>
+
def test_feature_extraction():
"""Assert that the feature_extraction method creates datetime features."""
atom = ATOMClassifier(X10_dt, y10, random_state=1)
@@ -872,8 +882,8 @@ def test_default_solver_from_task():
assert atom.pipeline[0].rfe_.estimator_.__class__.__name__ == "DecisionTreeRegressor"
-@patch("atom.feature_engineering.SequentialFeatureSelector")
-def test_default_scoring(cls):
+@patch("atom.feature_engineering.SequentialFeatureSelector", MagicMock())
+def test_default_scoring():
"""Assert that the scoring is atom's metric when exists."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
atom.run("lr", metric="recall")
@@ -884,6 +894,7 @@ def test_default_scoring(cls):
# Test training methods ============================================ >>
+
def test_non_numerical_target_column():
"""Assert that an error is raised when the target column is categorical."""
atom = ATOMClassifier(X10, y10_str, random_state=1)
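Several tests above switch from a bare `@patch(...)` decorator to `@patch(..., MagicMock())`. Passing the replacement object explicitly means `patch` no longer injects a mock into the test's signature, which removes the unused argument that ruff's ARG checks would otherwise flag. A minimal illustration with a standard-library target:

```python
from unittest.mock import MagicMock, patch


@patch("os.getcwd")  # bare form: the created mock is injected as an argument
def test_with_injected_mock(mocked):
    mocked.return_value = "/tmp"


@patch("os.getcwd", MagicMock())  # explicit replacement: no extra argument
def test_without_injected_mock():
    pass
```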
diff --git a/tests/test_basemodel.py b/tests/test_basemodel.py
index 26b2a820a..5f71e9e75 100644
--- a/tests/test_basemodel.py
+++ b/tests/test_basemodel.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -38,11 +36,13 @@
# Test magic methods ================================== >>
+
def test_scaler():
"""Assert that a scaler is made for models that need scaling."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
atom.run(["LGB", "LDA"], est_params={"LGB": {"n_estimators": 5}})
- assert atom.lgb.scaler and not atom.lda.scaler
+ assert atom.lgb.scaler
+ assert not atom.lda.scaler
def test_str():
@@ -82,6 +82,7 @@ def test_getitem():
# Test training ==================================================== >>
+
def test_est_params_invalid_param():
"""Assert that invalid parameters in est_params are caught."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
@@ -162,7 +163,7 @@ def test_custom_distributions_meta_estimators():
"distributions": {
"order": CategoricalDistribution([(0, 1, 2, 3), (1, 0, 3, 2)]),
"base_estimator__solver": CategoricalDistribution(["lbfgs", "newton-cg"]),
- }
+ },
},
)
@@ -212,7 +213,7 @@ def test_empty_study(func):
func.return_value = [] # No successful trials
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
- atom.run(models="tree", n_trials=1, errors="raise")
+ atom.run(models="tree", n_trials=1)
assert not hasattr(atom.tree, "study")
@@ -220,7 +221,7 @@ def test_ht_with_pipeline():
"""Assert that the hyperparameter tuning works with a transformer pipeline."""
atom = ATOMClassifier(X10_str, y10, random_state=1)
atom.encode()
- atom.run("lr", n_trials=1, errors='raise')
+ atom.run("lr", n_trials=1)
assert hasattr(atom.lr, "trials")
@@ -243,13 +244,13 @@ def test_ht_with_pruning():
atom = ATOMClassifier(X_bin, y=y_bin, random_state=1)
atom.run(
models="SGD",
- n_trials=10,
+ n_trials=7,
ht_params={
"distributions": {"max_iter": IntDistribution(5, 15)},
"pruner": PatientPruner(None, patience=1),
},
)
- assert "PRUNED" in atom.sgd.trials["state"].values
+ assert "PRUNED" in atom.sgd.trials["state"].unique()
def test_sample_weight_fit():
@@ -279,11 +280,11 @@ def test_skip_duplicate_calls():
def test_trials_stored_correctly():
- """Assert that the trials attribute has the same params as the trial object."""
+ """Assert that the `trials` attribute has the same params as the trial object."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
atom.run("lr", n_trials=3, ht_params={"distributions": ["penalty", "C"]})
- assert atom.lr.trials.at[2, "penalty"] == atom.lr.study.trials[2].params["penalty"]
- assert atom.lr.trials.at[2, "C"] == atom.lr.study.trials[2].params["C"]
+ assert atom.lr.trials.loc[2, "penalty"] == atom.lr.study.trials[2].params["penalty"]
+ assert atom.lr.trials.loc[2, "C"] == atom.lr.study.trials[2].params["C"]
@patch("mlflow.log_params")
@@ -291,7 +292,7 @@ def test_nested_runs_to_mlflow(mlflow):
"""Assert that the trials are logged to mlflow as nested runs."""
atom = ATOMClassifier(X_bin, y_bin, experiment="test", random_state=1)
atom.log_ht = True
- atom.run("Tree", n_trials=1, errors='raise')
+ atom.run("Tree", n_trials=1)
assert mlflow.call_count == 2 # n_trials + fit
@@ -365,6 +366,7 @@ def test_continued_bootstrapping():
# Test utility properties ========================================== >>
+
def test_name_property():
"""Assert that the name property can be set."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
@@ -500,6 +502,7 @@ def test_results_property():
# Test data properties ============================================= >>
+
def test_pipeline_property():
"""Assert that the pipeline property returns the scaler as well."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
@@ -606,6 +609,7 @@ def test_all_property():
# Test prediction methods ========================================== >>
+
def test_predictions_from_index():
"""Assert that predictions can be made from data indices."""
atom = ATOMClassifier(X_idx, y_idx, index=True, holdout_size=0.1, random_state=1)
@@ -681,6 +685,7 @@ def test_score_with_sample_weight():
# Test utility methods ============================================= >>
+
def test_calibrate_invalid_task():
"""Assert than an error is raised when task="regression"."""
atom = ATOMRegressor(X_reg, y_reg, random_state=1)
@@ -928,7 +933,8 @@ def test_serve():
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
atom.run("MNB")
atom.mnb.serve()
- assert "200" in str(requests.get("http://127.0.0.1:8000/", json=X_bin.to_json()))
+ response = requests.get("http://127.0.0.1:8000/", json=X_bin.to_json(), timeout=5)
+ assert response.status_code == 200
serve.shutdown()
@@ -958,4 +964,4 @@ def test_transform():
atom.run("LR")
X = atom.lr.transform(X10_str)
assert len(X.columns) > 3 # Data is one-hot encoded
- assert all(-3 <= v <= 3 for v in X.values.ravel()) # Data is scaled
+ assert all(-3 <= v <= 3 for v in X.to_numpy().ravel()) # Data is scaled
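The serve test now asserts on the parsed status code and passes an explicit timeout, so a hung server fails the test quickly instead of blocking it indefinitely. A reduced sketch against a hypothetical local endpoint:

```python
import requests

# Hypothetical local endpoint; without a timeout, a hung server would
# block the test forever instead of failing it after five seconds.
response = requests.get("http://127.0.0.1:8000/", timeout=5)
assert response.status_code == 200
```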
diff --git a/tests/test_baserunner.py b/tests/test_baserunner.py
index 7f8885084..4731ff821 100644
--- a/tests/test_baserunner.py
+++ b/tests/test_baserunner.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -9,7 +7,7 @@
import glob
import sys
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch
import numpy as np
import pandas as pd
@@ -32,6 +30,7 @@
# Test magic methods =============================================== >>
+
def test_getstate_and_setstate():
"""Assert that versions are checked and a warning raised."""
atom = ATOMClassifier(X_bin, y_bin, warnings=True)
@@ -85,11 +84,11 @@ def test_getattr_invalid():
def test_setattr_to_branch():
"""Assert that branch properties can be set."""
new_dataset = merge(X_bin, y_bin)
- new_dataset.iat[0, 3] = 4 # Change one value
+ new_dataset.iloc[0, 3] = 4 # Change one value
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
atom.dataset = new_dataset
- assert atom.dataset.iat[0, 3] == 4 # Check the value is changed
+ assert atom.dataset.iloc[0, 3] == 4 # Check the value is changed
def test_setattr_normal():
@@ -175,6 +174,7 @@ def test_getitem_list():
# Test utility properties ========================================== >>
+
def test_branch_property():
"""Assert that the branch property returns the current branch."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
@@ -273,6 +273,7 @@ def test_results_property_train_sizing():
# Test _set_index ================================================== >>
+
def test_index_is_true():
"""Assert that the indices are left as is when index=True."""
atom = ATOMClassifier(X_idx, y_idx, index=True, shuffle=False, random_state=1)
@@ -360,6 +361,7 @@ def test_duplicate_indices():
# Test _get_stratify_columns ======================================= >>
+
@pytest.mark.parametrize("stratify", [True, -1, "target", [-1]])
def test_stratify_options(stratify):
"""Assert that the data can be stratified among data sets."""
@@ -391,6 +393,7 @@ def test_stratify_invalid_column_str():
# Test _get_data =================================================== >>
+
def test_input_is_y_without_arrays():
"""Assert that input y through parameter works."""
atom = ATOMForecaster(y=y_fc, random_state=1)
@@ -687,6 +690,7 @@ def test_invalid_index_forecast():
# Test utility methods ============================================= >>
+
def test_get_models_is_None():
"""Assert that all models are returned by default."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
@@ -804,7 +808,8 @@ def test_delete_default():
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
atom.run(["LR", "LDA"])
atom.delete() # All models
- assert not (atom.models or atom.metric)
+ assert not atom.models
+ assert not atom.metric
assert atom.results.empty
@@ -941,9 +946,9 @@ def test_file_is_saved():
assert glob.glob("ATOMClassifier.pkl")
-@patch("atom.baserunner.pickle")
-def test_save_data_false(cls):
- """Assert that the dataset is restored after saving with save_data=False"""
+@patch("atom.baserunner.pickle", MagicMock())
+def test_save_data_false():
+ """Assert that the dataset is restored after saving with save_data=False."""
atom = ATOMClassifier(X_bin, y_bin, holdout_size=0.1, random_state=1)
atom.save(filename="atom", save_data=False)
assert atom.dataset is not None # Dataset is restored after saving
@@ -1001,7 +1006,8 @@ def test_stacking_different_name():
atom.run(["LR", "LGB"], est_params={"LGB": {"n_estimators": 5}})
atom.stacking(name="stack_1")
atom.stacking(name="_2")
- assert hasattr(atom, "Stack_1") and hasattr(atom, "Stack_2")
+ assert hasattr(atom, "Stack_1")
+ assert hasattr(atom, "Stack_2")
def test_stacking_unknown_predefined_final_estimator():
diff --git a/tests/test_basetrainer.py b/tests/test_basetrainer.py
index 233e808f2..b9c692285 100644
--- a/tests/test_basetrainer.py
+++ b/tests/test_basetrainer.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -7,7 +5,7 @@
"""
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch
import mlflow
import pytest
@@ -29,6 +27,7 @@
# Test _prepare_parameters =========================================== >>
+
def test_model_is_predefined():
"""Assert that predefined models are accepted."""
trainer = DirectClassifier("LR", random_state=1)
@@ -317,6 +316,7 @@ def test_ht_params_invalid_key():
# Test _core_iteration ============================================= >>
+
def test_sequence_parameters():
"""Assert that every model get his corresponding parameters."""
trainer = DirectClassifier(
@@ -378,9 +378,9 @@ def test_errors_keep():
assert trainer._models == [trainer.lda]
-@patch("atom.basetransformer.ray")
-@patch("atom.basetrainer.ray")
-def test_parallel_with_ray(_, __):
+@patch("atom.basetransformer.ray", MagicMock())
+@patch("atom.basetrainer.ray", MagicMock())
+def test_parallel_with_ray():
"""Assert that parallel runs successfully with ray backend."""
trainer = DirectClassifier(
models=["LR", "LDA"],
@@ -395,8 +395,8 @@ def test_parallel_with_ray(_, __):
ray.shutdown()
-@patch("atom.basetrainer.Parallel")
-def test_parallel(_):
+@patch("atom.basetrainer.Parallel", MagicMock())
+def test_parallel():
"""Assert that parallel runs successfully."""
trainer = DirectClassifier(
models=["LR", "LDA"],
diff --git a/tests/test_basetransformer.py b/tests/test_basetransformer.py
index 8ff86674c..a738570bc 100644
--- a/tests/test_basetransformer.py
+++ b/tests/test_basetransformer.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -35,6 +33,7 @@
# Test properties ================================================== >>
+
def test_n_jobs_maximum_cores():
"""Assert that value equals n_cores if maximum is exceeded."""
base = BaseTransformer(n_jobs=1000)
@@ -123,7 +122,7 @@ def test_warnings_parameter_str():
@pytest.mark.parametrize("logger", [None, "auto", Path("test"), Logger("test")])
def test_logger_creator(logger):
"""Assert that the logger is created correctly."""
- base = BaseTransformer(logger="auto")
+ base = BaseTransformer(logger=logger)
assert isinstance(base.logger, Logger | None)
@@ -143,14 +142,13 @@ def test_experiment_creation(mlflow):
mlflow.assert_called_once()
-@patch("mlflow.set_experiment")
@patch("dagshub.auth.get_token")
@patch("requests.get")
@patch("dagshub.init")
-def test_experiment_dagshub(dagshub, request, token, _):
+def test_experiment_dagshub(dagshub, request, token):
"""Assert that the experiment can be stored in dagshub."""
token.return_value = "token"
- request.return_value.text = dict(username="user1")
+ request.return_value.text = {"username": "user1"}
BaseTransformer(experiment="dagshub:test")
dagshub.assert_called_once()
@@ -181,6 +179,7 @@ def test_device_id_invalid():
# Test _inherit ==================================================== >>
+
def test_inherit():
"""Assert that the inherit method passes the parameters correctly."""
base = BaseTransformer(random_state=2)
@@ -190,6 +189,7 @@ def test_inherit():
# Test _get_est_class ============================================== >>
+
@pytest.mark.skipif(machine() not in ("x86_64", "AMD64"), reason="Only x86 support")
def test_get_est_class_from_engine():
"""Assert that the class can be retrieved from an engine."""
@@ -205,10 +205,12 @@ def test_get_est_class_from_default():
# Test _check_input ============================================== >>
+
def test_input_is_copied():
"""Assert that the data is copied."""
X, y = BaseTransformer._check_input(X_bin, y_bin)
- assert X is not X_bin and y is not y_bin
+ assert X is not X_bin
+ assert y is not y_bin
def test_input_X_and_y_None():
@@ -226,7 +228,8 @@ def test_X_is_callable():
def test_to_pandas():
"""Assert that the data provided is converted to pandas objects."""
X, y = BaseTransformer._check_input(X_bin_array, y_bin_array)
- assert isinstance(X, pd.DataFrame) and isinstance(y, pd.Series)
+ assert isinstance(X, pd.DataFrame)
+ assert isinstance(y, pd.Series)
def test_column_order_is_retained():
@@ -254,7 +257,7 @@ def test_input_data_in_training():
train = bin_train.copy()
trainer = DirectClassifier("LR", random_state=1)
trainer.run(train, bin_test)
- train.iat[3, 2] = 99 # Change an item of the original variable
+ train.iloc[3, 2] = 99 # Change an item of the original variable
assert 99 not in trainer.dataset # Is unchanged in the pipeline
@@ -365,11 +368,13 @@ def test_target_is_none():
def test_X_empty_df():
"""Assert that X becomes an empty dataframe when provided but in y."""
X, y = BaseTransformer._check_input(y_fc, y=-1)
- assert X.empty and isinstance(y, pd.Series)
+ assert X.empty
+ assert isinstance(y, pd.Series)
# Test log ========================================================= >>
+
def test_log_severity_error():
"""Assert that an error is raised when the severity is error."""
with pytest.raises(UserWarning, match=".*user error.*"):
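A pattern repeated throughout these test diffs is splitting `assert a and b` into two statements; on failure, pytest then reports exactly which condition broke instead of only that the conjunction failed. For example:

```python
import pandas as pd

X = pd.DataFrame({"a": [1.0]})
y = pd.Series([0])

# Each assert fails independently, so the report pinpoints the broken check.
assert isinstance(X, pd.DataFrame)
assert isinstance(y, pd.Series)
```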
diff --git a/tests/test_branch.py b/tests/test_branch.py
index 5c6ff7ba7..fa4fd6269 100644
--- a/tests/test_branch.py
+++ b/tests/test_branch.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -27,6 +25,7 @@
# Test Branch ====================================================== >>
+
def test_init_empty_pipeline():
"""Assert that an empty branch has an empty pipeline."""
branch = Branch(name="main")
@@ -166,7 +165,7 @@ def test_shape_property():
def test_columns_property():
"""Assert that the columns property returns the columns of the dataset."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
- assert list(atom.branch.columns) == list(X_bin.columns) + [y_bin.name]
+ assert list(atom.branch.columns) == [*X_bin.columns, y_bin.name]
def test_n_columns_property():
@@ -203,11 +202,11 @@ def test_all_property():
def test_dataset_setter():
"""Assert that the dataset setter changes the whole dataset."""
new_dataset = merge(X_bin, y_bin)
- new_dataset.iat[0, 3] = 4 # Change one value
+ new_dataset.iloc[0, 3] = 4 # Change one value
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
atom.dataset = new_dataset
- assert atom.dataset.iat[0, 3] == 4 # Check the value is changed
+ assert atom.dataset.iloc[0, 3] == 4 # Check the value is changed
def test_train_setter():
@@ -234,8 +233,8 @@ def test_X_setter():
def test_y_setter():
"""Assert that the y setter changes the target column."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
- assert atom.y[0] == 0 # First value is 1 in original
- atom.y = [1] + list(y_bin.values[1:])
+ assert atom.y[0] == 0 # The first value is 1 in original
+ atom.y = [1, *y_bin.values[1:]]
assert atom.y[0] == 1 # First value changed to 1
@@ -243,9 +242,9 @@ def test_X_train_setter():
"""Assert that the X_train setter changes the training feature set."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
new_X_train = atom.X_train
- new_X_train.iat[0, 0] = 999
+ new_X_train.iloc[0, 0] = 999
atom.X_train = new_X_train.to_numpy() # To numpy to test dtypes are maintained
- assert atom.X_train.iat[0, 0] == 999
+ assert atom.X_train.iloc[0, 0] == 999
assert list(atom.X_train.dtypes) == list(atom.X_test.dtypes)
@@ -253,25 +252,25 @@ def test_X_test_setter():
"""Assert that the X_test setter changes the test feature set."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
new_X_test = atom.X_test
- new_X_test.iat[0, 0] = 999
+ new_X_test.iloc[0, 0] = 999
atom.X_test = new_X_test
- assert atom.X_test.iat[0, 0] == 999
+ assert atom.X_test.iloc[0, 0] == 999
def test_y_train_setter():
"""Assert that the y_train setter changes the training target column."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
- assert atom.y_train.iat[0] == 0 # First value is 1 in original
- atom.y_train = [1] + list(atom.y_train.values[1:])
- assert atom.y_train.iat[0] == 1 # First value changed to 0
+ assert atom.y_train.iloc[0] == 0 # The first value is 1 in original
+ atom.y_train = [1, *atom.y_train[1:]]
+ assert atom.y_train.iloc[0] == 1 # First value changed to 1
def test_y_test_setter():
"""Assert that the y_test setter changes the training target column."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
- assert atom.y_test.iat[0] == 1 # First value is 0 in original
- atom.y_test = [0] + list(atom.y_test[1:])
- assert atom.y_test.iat[0] == 0 # First value changed to 1
+ assert atom.y_test.iloc[0] == 1 # The first value is 0 in original
+ atom.y_test = [0, *atom.y_test[1:]]
+ assert atom.y_test.iloc[0] == 0 # First value changed to 0
def test_data_properties_to_df():
@@ -305,27 +304,27 @@ def test_setter_error_unequal_index():
def test_setter_error_unequal_columns():
"""Assert that an error is raised when the setter has unequal columns."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
+ new_X = atom.train
+ new_X.insert(0, "new_column", 1)
with pytest.raises(ValueError, match="number of columns"):
- new_X = atom.train
- new_X.insert(0, "new_column", 1)
atom.train = new_X
def test_setter_error_unequal_column_names():
"""Assert that an error is raised with different column names."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
+ new_X = atom.train.drop(columns=atom.train.columns[0])
+ new_X.insert(0, "new_column", 1)
with pytest.raises(ValueError, match="the same columns"):
- new_X = atom.train.drop(columns=atom.train.columns[0])
- new_X.insert(0, "new_column", 1)
atom.train = new_X
def test_setter_error_unequal_target_names():
"""Assert that an error is raised with different target names."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
+ new_y_train = atom.y_train
+ new_y_train.name = "different_name"
with pytest.raises(ValueError, match="the same name"):
- new_y_train = atom.y_train
- new_y_train.name = "different_name"
atom.y_train = new_y_train
@@ -565,6 +564,7 @@ def test_load_no_dir():
# Test BranchManager =============================================== >>
+
def test_branchmanager_repr():
"""Assert that the __repr__ method returns the branches."""
assert str(BranchManager()) == "BranchManager([main], og=main)"
@@ -577,7 +577,7 @@ def test_branchmanager_len():
def test_branchmanager_iter():
"""Assert that the __iter__ method iterates over the branches."""
- assert str(list(b for b in BranchManager())[0]) == "Branch(main)"
+ assert str(next(iter(BranchManager()))) == "Branch(main)"
def test_branchmanager_contains():
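The `pytest.raises` blocks above were trimmed so that only the statement expected to raise remains inside the context manager; setup code that fails in there would otherwise be mistaken for the expected error (flake8-pytest-style PT012). A self-contained sketch with a hypothetical function under test:

```python
import pytest


def validate(value: int) -> None:
    """Hypothetical stand-in for the code under test."""
    if value < 0:
        raise ValueError("negative value not allowed")


def test_validate_raises():
    bad_value = -1  # setup runs outside the raises block
    with pytest.raises(ValueError, match="negative"):
        validate(bad_value)  # only the raising call stays inside
```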
diff --git a/tests/test_data_cleaning.py b/tests/test_data_cleaning.py
index 24bbece35..0f7a49acd 100644
--- a/tests/test_data_cleaning.py
+++ b/tests/test_data_cleaning.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -30,6 +28,7 @@
# Test TransformerMixin ============================================ >>
+
def test_clone():
"""Assert that cloning the transformer keeps internal attributes."""
pruner = Pruner().fit(X_bin)
@@ -57,6 +56,7 @@ def test_inverse_transform():
# Test Balancer ==================================================== >>
+
def test_balance_multioutput_task():
"""Assert that an error is raised for multioutput tasks."""
with pytest.raises(ValueError, match=".*not support multioutput.*"):
@@ -144,6 +144,7 @@ def test_balancer_attach_attribute():
# Test Cleaner ==================================================== >>
+
def test_cleaner_convert_dtypes():
"""Assert that column dtypes are converted."""
X = X_bin.copy()
@@ -176,15 +177,15 @@ def test_cleaner_drop_invalid_column_list_types():
def test_cleaner_remove_characters_from_column_names():
"""Assert that specified chars are removed from column names."""
X, y = X_bin.copy(), y_bin.copy()
- X.columns = ["test##"] + list(X.columns[1:])
+ X.columns = ["test##", *X.columns[1:]]
y.name = "::test"
X, y = Cleaner(drop_chars="[^A-Za-z0-9]+").fit_transform(X, y)
assert X.columns[0] == "test"
assert y.name == "test"
X, y = X_class.copy(), y_multiclass.copy()
- X.columns = ["test##"] + list(X.columns[1:])
- y.columns = ["::test"] + list(y.columns[1:])
+ X.columns = ["test##", *X.columns[1:]]
+ y.columns = ["::test", *y.columns[1:]]
X, y = Cleaner(drop_chars="[^A-Za-z0-9]+").fit_transform(X, y)
assert X.columns[0] == "test"
assert y.columns[0] == "test"
@@ -234,7 +235,7 @@ def test_cleaner_multiclass_multioutput():
y = pd.DataFrame({"a": y10_str, "b": y10, "c": y10_str})
y_transformed = Cleaner().fit_transform(y=y)
assert list(y_transformed.columns) == ["a", "b", "c"]
- assert all(v in [0, 1] for v in y_transformed.values.ravel())
+ assert all(v in (0, 1) for v in y_transformed.to_numpy().ravel())
def test_cleaner_inverse_transform():
@@ -260,6 +261,7 @@ def test_cleaner_target_mapping_binary():
# Test Discretizer ================================================= >>
+
def test_missing_columns_in_dict_are_ignored():
"""Assert that only columns in the dict are transformed."""
discretizer = Discretizer(strategy="uniform", bins={"mean radius": 5})
@@ -345,6 +347,7 @@ def test_labels_custom_strategy():
# Test Encoder ===================================================== >>
+
def test_strategy_parameter_encoder():
"""Assert that the strategy parameter is set correctly."""
encoder = Encoder(strategy="invalid")
@@ -378,20 +381,20 @@ def test_encoder_custom_estimator():
"""Assert that the strategy can be a custom estimator."""
encoder = Encoder(strategy=TargetEncoder, max_onehot=None)
X = encoder.fit_transform(X10_str, y10)
- assert X.at[0, "x2"] != "a"
+ assert X.loc[0, "x2"] != "a"
def test_missing_values_are_propagated():
"""Assert that missing values are propagated."""
encoder = Encoder(max_onehot=None)
- assert np.isnan(encoder.fit_transform(X10_sn, y10).iat[0, 2])
+ assert np.isnan(encoder.fit_transform(X10_sn, y10).iloc[0, 2])
def test_unknown_classes_are_imputed():
"""Assert that unknown classes are imputed."""
encoder = Encoder()
encoder.fit(["a", "b", "b", "a"])
- assert encoder.transform(["c"]).iat[0, 0] == -1.0
+ assert encoder.transform(["c"]).iloc[0, 0] == -1.0
def test_ordinal_encoder():
@@ -406,7 +409,8 @@ def test_ordinal_features():
"""Assert that ordinal features are encoded."""
encoder = Encoder(max_onehot=None, ordinal={"x2": ["b", "a", "c"]})
X = encoder.fit_transform(X10_str2, y10)
- assert X.iat[0, 2] == 1 and X.iat[2, 2] == 0
+ assert X.iloc[0, 2] == 1
+ assert X.iloc[2, 2] == 0
def test_one_hot_encoder():
@@ -433,6 +437,7 @@ def test_kwargs_parameters():
# Test Imputer ===================================================== >>
+
@pytest.mark.parametrize("missing", [None, np.NaN, np.inf, -np.inf, 99])
def test_imputing_all_missing_values_numeric(missing):
"""Assert that all missing values are imputed in numeric columns."""
@@ -455,12 +460,12 @@ def test_imputing_all_missing_values_categorical(missing):
@pytest.mark.parametrize("max_nan_rows", [5, 0.5])
-def test_rows_too_many_nans(max_nan_rows):
+def test_rows_too_many_nans(max_nan_rows, random):
"""Assert that rows with too many missing values are dropped."""
X = X_bin.copy()
- for i in range(5): # Add 5 rows with all NaN values
+ for _ in range(5): # Add 5 rows with all NaN values
X.loc[len(X)] = [np.nan for _ in range(X.shape[1])]
- y = [np.random.randint(2) for _ in range(len(X))]
+ y = [random.integers(2) for _ in range(len(X))]
imputer = Imputer(
strat_num="mean",
strat_cat="most_frequent",
@@ -499,7 +504,7 @@ def test_imputing_numeric_number():
"""Assert that imputing a number for numerical values works."""
imputer = Imputer(strat_num=3.2)
X, y = imputer.fit_transform(X10_nan, y10)
- assert X.iat[0, 0] == 3.2
+ assert X.iloc[0, 0] == 3.2
assert X.isna().sum().sum() == 0
@@ -507,7 +512,7 @@ def test_imputing_numeric_mean():
"""Assert that imputing the mean for numerical values works."""
imputer = Imputer(strat_num="mean")
X, y = imputer.fit_transform(X10_nan, y10)
- assert X.iat[0, 0] == pytest.approx(2.577778, rel=1e-6, abs=1e-12)
+ assert X.iloc[0, 0] == pytest.approx(2.577778, rel=1e-6, abs=1e-12)
assert X.isna().sum().sum() == 0
@@ -515,7 +520,7 @@ def test_imputing_numeric_median():
"""Assert that imputing the median for numerical values works."""
imputer = Imputer(strat_num="median")
X, y = imputer.fit_transform(X10_nan, y10)
- assert X.iat[0, 0] == 3
+ assert X.iloc[0, 0] == 3
assert X.isna().sum().sum() == 0
@@ -523,7 +528,7 @@ def test_imputing_numeric_knn():
"""Assert that imputing numerical values with KNNImputer works."""
imputer = Imputer(strat_num="knn", random_state=1)
X, y = imputer.fit_transform(X10_nan, y10)
- assert X.iat[0, 0] == 3.04
+ assert X.iloc[0, 0] == 3.04
assert X.isna().sum().sum() == 0
@@ -531,7 +536,7 @@ def test_imputing_numeric_iterative():
"""Assert that imputing numerical values with IterativeImputer works."""
imputer = Imputer(strat_num="iterative")
X, y = imputer.fit_transform(X10_nan, y10)
- assert X.iat[0, 0] == pytest.approx(2.577836, rel=1e-6, abs=1e-12)
+ assert X.iloc[0, 0] == pytest.approx(2.577836, rel=1e-6, abs=1e-12)
assert X.isna().sum().sum() == 0
@@ -539,7 +544,7 @@ def test_imputing_numeric_most_frequent():
"""Assert that imputing the most_frequent for numerical values works."""
imputer = Imputer(strat_num="most_frequent")
X, y = imputer.fit_transform(X10_nan, y10)
- assert X.iat[0, 0] == 3
+ assert X.iloc[0, 0] == 3
assert X.isna().sum().sum() == 0
@@ -547,7 +552,7 @@ def test_imputing_non_numeric_string():
"""Assert that imputing a string for non-numerical values works."""
imputer = Imputer(strat_cat="missing")
X, y = imputer.fit_transform(X10_sn, y10)
- assert X.iat[0, 2] == "missing"
+ assert X.iloc[0, 2] == "missing"
assert X.isna().sum().sum() == 0
@@ -563,12 +568,13 @@ def test_imputing_non_numeric_most_frequent():
"""Assert that the most_frequent strategy for non-numerical works."""
imputer = Imputer(strat_cat="most_frequent")
X, y = imputer.fit_transform(X10_sn, y10)
- assert X.iat[0, 2] == "d"
+ assert X.iloc[0, 2] == "d"
assert X.isna().sum().sum() == 0
# Test Normalizer ======================================================= >>
+
@pytest.mark.parametrize("strategy", ["yeojohnson", "boxcox", "quantile"])
def test_normalizer_all_strategies(strategy):
"""Assert that all strategies work as intended."""
@@ -613,7 +619,7 @@ def test_normalizer_ignores_categorical_columns():
X = X_bin.copy()
X.insert(1, "categorical_col_1", ["a" for _ in range(len(X))])
X = Normalizer().fit_transform(X)
- assert list(X[X.columns.values[1]]) == ["a" for _ in range(len(X))]
+ assert list(X[X.columns[1]]) == ["a" for _ in range(len(X))]
def test_normalizer_attach_attribute():
@@ -625,6 +631,7 @@ def test_normalizer_attach_attribute():
# Test Pruner ====================================================== >>
+
def test_invalid_method_for_non_z_score():
"""Assert that an error is raised for an invalid method and strat combination."""
pruner = Pruner(strategy="iforest", method="minmax")
@@ -657,15 +664,15 @@ def test_drop_pruner():
def test_minmax_pruner():
"""Assert that the method works as intended when strategy="minmax"."""
X = Pruner(method="minmax", max_sigma=2).transform(X10)
- assert X.iat[3, 0] == 0.23 # Max of column
- assert X.iat[5, 1] == 2 # Min of column
+ assert X.iloc[3, 0] == 0.23 # Max of column
+ assert X.iloc[5, 1] == 2 # Min of column
def test_value_pruner():
"""Assert that the method works as intended when strategy=value."""
X = Pruner(method=-99, max_sigma=2).transform(X10)
- assert X.iat[3, 0] == -99
- assert X.iat[5, 1] == -99
+ assert X.iloc[3, 0] == -99
+ assert X.iloc[5, 1] == -99
def test_categorical_cols_are_ignored():
@@ -721,6 +728,7 @@ def test_pruner_attach_attribute():
# Test Scaler ====================================================== >>
+
@pytest.mark.parametrize("strategy", ["standard", "minmax", "maxabs", "robust"])
def test_scaler_all_strategies(strategy):
"""Assert that all strategies work as intended."""
@@ -745,7 +753,8 @@ def test_scaler_y_is_ignored():
def test_scaler_kwargs():
"""Assert that kwargs can be passed to the estimator."""
X = Scaler(strategy="minmax", feature_range=(1, 2)).fit_transform(X_bin)
- assert min(X.iloc[:, 0]) >= 1 and max(X.iloc[:, 0]) <= 2
+ assert min(X.iloc[:, 0]) >= 1
+ assert max(X.iloc[:, 0]) <= 2
def test_scaler_return_scaled_dataset():
@@ -773,7 +782,7 @@ def test_scaler_ignores_categorical_columns():
X = X_bin.copy()
X.insert(1, "categorical_col_1", ["a" for _ in range(len(X))])
X = Scaler().fit_transform(X)
- assert list(X[X.columns.values[1]]) == ["a" for _ in range(len(X))]
+ assert list(X[X.columns[1]]) == ["a" for _ in range(len(X))]
def test_scaler_attach_attribute():
diff --git a/tests/test_ensembles.py b/tests/test_ensembles.py
index 6374f711e..9e0645286 100644
--- a/tests/test_ensembles.py
+++ b/tests/test_ensembles.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -24,38 +22,51 @@
from .conftest import X_bin, X_reg, y_bin, y_reg
-@pytest.fixture
+@pytest.fixture()
def classifiers():
"""Get a list of classifiers for the ensemble."""
return [
("lda", LinearDiscriminantAnalysis().fit(X_bin, y_bin)),
("placeholder1", "drop"),
- ("pl", Pipeline(
- [("scaler", StandardScaler()), ("et", ExtraTreesClassifier(n_estimators=5))]
- ).fit(X_bin, y_bin)),
+ (
+ "pl",
+ Pipeline(
+ [
+ ("scaler", StandardScaler()),
+ ("et", ExtraTreesClassifier(n_estimators=5)),
+ ],
+ ).fit(X_bin, y_bin),
+ ),
]
-@pytest.fixture
+@pytest.fixture()
def regressors():
"""Get a list of regressors for the ensemble."""
return [
("ols", LinearRegression()),
("placeholder1", "drop"),
- ("pl", Pipeline(
- [("scaler", StandardScaler()), ("et", ExtraTreesRegressor(n_estimators=5))]
- )),
+ (
+ "pl",
+ Pipeline(
+ [
+ ("scaler", StandardScaler()),
+ ("et", ExtraTreesRegressor(n_estimators=5)),
+ ],
+ ),
+ ),
]
# Stacking ========================================================= >>
+
def test_stacking_classifier(classifiers):
"""Assert that stacking classifiers work."""
stack = StackingClassifier(estimators=classifiers, cv=KFold())
- assert not check_is_fitted(stack, False)
+ assert not check_is_fitted(stack, exception=False)
stack.fit(X_bin, y_bin)
- assert check_is_fitted(stack, False)
+ assert check_is_fitted(stack, exception=False)
assert len(stack.estimators_) == 2
assert stack.estimators_[0] is classifiers[0][1] # Fitted is same
assert stack.estimators_[1] is not classifiers[1][1] # Unfitted changes
@@ -64,18 +75,19 @@ def test_stacking_classifier(classifiers):
def test_stacking_regressor(regressors):
"""Assert that stacking regressors."""
stack = StackingRegressor(estimators=regressors)
- assert not check_is_fitted(stack, False)
+ assert not check_is_fitted(stack, exception=False)
stack.fit(X_reg, y_reg)
- assert check_is_fitted(stack, False)
+ assert check_is_fitted(stack, exception=False)
assert len(stack.estimators_) == 2
# Voting =========================================================== >>
+
def test_voting_initialized_fitted(classifiers):
"""Assert that the model can be fit at initialization."""
vote = VotingClassifier(estimators=classifiers)
- assert check_is_fitted(vote, False)
+ assert check_is_fitted(vote, exception=False)
def test_voting_multilabel(classifiers):
@@ -105,9 +117,9 @@ def test_voting_mixed_fit_and_not(classifiers):
estimators.append(("not_fitted_lda", LinearDiscriminantAnalysis()))
vote = VotingClassifier(estimators=estimators)
- assert not check_is_fitted(vote, False)
+ assert not check_is_fitted(vote, exception=False)
vote.fit(X_bin, y_bin)
- assert check_is_fitted(vote, False)
+ assert check_is_fitted(vote, exception=False)
assert len(vote.estimators_) == 3
assert vote.estimators_[0] is estimators[0][1] # Fitted is same
assert vote.estimators_[2] is not estimators[2][1] # Unfitted changes
@@ -123,6 +135,6 @@ def test_voting_predict(classifiers, voting):
def test_voting_regressor(regressors):
"""Assert that the regressor works."""
vote = VotingRegressor(estimators=regressors)
- assert not check_is_fitted(vote, False)
+ assert not check_is_fitted(vote, exception=False)
vote.fit(X_reg, y_reg)
- assert check_is_fitted(vote, False)
+ assert check_is_fitted(vote, exception=False)
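The ensemble tests now pass `exception=False` by keyword. Spelling out a boolean argument at the call site is the style ruff's flake8-boolean-trap (FBT) rules push toward, since a bare `check_is_fitted(stack, False)` gives no hint what the flag controls. An illustrative signature (not ATOM's actual utility):

```python
def check_is_fitted(estimator, *, exception: bool = True) -> bool:
    """Return whether the estimator is fitted (illustrative only)."""
    fitted = hasattr(estimator, "estimators_")
    if not fitted and exception:
        raise RuntimeError("This estimator is not fitted yet.")
    return fitted
```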
diff --git a/tests/test_feature_engineering.py b/tests/test_feature_engineering.py
index aac16810c..f93e1599e 100644
--- a/tests/test_feature_engineering.py
+++ b/tests/test_feature_engineering.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -24,6 +22,7 @@
# Test FeatureExtractor ============================================ >>
+
def test_invalid_features():
"""Assert that an error is raised when features are invalid."""
with pytest.raises(ValueError, match=".*an attribute of pd.Series.dt.*"):
@@ -72,16 +71,19 @@ def test_order_features():
assert X.columns.get_loc("x2_year") == 4
-@pytest.mark.parametrize("fxs", [
- ("microsecond", "%f"),
- ("second", "%S"),
- ("hour", "%H"),
- ("weekday", "%d/%m/%Y"),
- ("day", "%d/%m/%Y"),
- ("dayofyear", "%d/%m/%Y"),
- ("month", "%d/%m/%Y"),
- ("quarter", "%d/%m/%Y"),
-])
+@pytest.mark.parametrize(
+ "fxs",
+ [
+ ("microsecond", "%f"),
+ ("second", "%S"),
+ ("hour", "%H"),
+ ("weekday", "%d/%m/%Y"),
+ ("day", "%d/%m/%Y"),
+ ("dayofyear", "%d/%m/%Y"),
+ ("month", "%d/%m/%Y"),
+ ("quarter", "%d/%m/%Y"),
+ ],
+)
def test_all_cyclic_features(fxs):
"""Assert that all cyclic columns create two features."""
extractor = FeatureExtractor(features=fxs[0], fmt=fxs[1], encoding_type="cyclic")
@@ -99,6 +101,7 @@ def test_features_are_not_dropped():
# Test FeatureGenerator ============================================ >>
+
def test_n_features_above_maximum():
"""Assert that n_features becomes maximum if more than maximum."""
generator = FeatureGenerator(
@@ -158,11 +161,13 @@ def test_default_feature_names():
random_state=1,
)
X = generator.fit_transform(X, y_bin)
- assert "x30" not in X and "x32" in X
+ assert "x32" in X
+ assert "x30" not in X
# Test FeatureGrouper ============================================= >>
+
def test_operator_not_in_libraries():
"""Assert that an error is raised when an operator is not in np or stats."""
grouper = FeatureGrouper({"g1": ["mean radius", "mean texture"]}, operators="invalid")
@@ -201,6 +206,7 @@ def test_columns_are_kept():
# Test FeatureSelector ============================================= >>
+
def test_solver_parameter_empty():
"""Assert that an error is raised when solver is None."""
selector = FeatureSelector(strategy="sfm", solver=None)
@@ -276,7 +282,8 @@ def test_remove_collinear_without_y():
X["invalid"] = list(range(len(X)))
selector = FeatureSelector(max_correlation=1)
X = selector.fit_transform(X)
- assert "valid" in X and "invalid" not in X
+ assert "valid" in X
+ assert "invalid" not in X
assert hasattr(selector, "collinear")
@@ -542,7 +549,7 @@ def test_advanced_custom_objective_function():
selector = FeatureSelector(
strategy="gwo",
solver="tree_class",
- objective_function=lambda *args: 1,
+ objective_function=lambda *args: 1, # noqa: ARG005
n_iteration=1,
population_size=1,
)
diff --git a/tests/test_models.py b/tests/test_models.py
index 668951f83..c14d0e40a 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -10,7 +8,6 @@
from platform import machine
from unittest.mock import Mock, patch
-import numpy as np
import pandas as pd
import pytest
from optuna.distributions import IntDistribution
@@ -129,11 +126,12 @@ def test_models_sklearnex_regression():
@patch.dict(
- "sys.modules", {
+ "sys.modules",
+ {
"cuml": Mock(spec=["__spec__"]),
"cuml.common.device_selection": Mock(spec=["set_global_device_type"]),
"cuml.internals.memory_utils": Mock(spec=["set_global_output_type"]),
- }
+ },
)
def test_models_cuml_classification():
"""Assert that all classification models can be called with cuml."""
@@ -158,11 +156,12 @@ def test_models_cuml_classification():
@patch.dict(
- "sys.modules", {
+ "sys.modules",
+ {
"cuml": Mock(spec=["__spec__"]),
"cuml.common.device_selection": Mock(spec=["set_global_device_type"]),
"cuml.internals.memory_utils": Mock(spec=["set_global_output_type"]),
- }
+ },
)
def test_models_cuml_regression():
"""Assert that all regression models can be called with cuml."""
@@ -184,10 +183,10 @@ def test_models_cuml_regression():
)
-def test_CatNB():
+def test_CatNB(random):
"""Assert that the CatNB model works. Needs special dataset."""
- X = np.random.randint(2, size=(150, 10))
- y = np.random.randint(2, size=150)
+ X = random.integers(2, size=(150, 10))
+ y = random.integers(2, size=150)
atom = ATOMClassifier(X, y, random_state=1)
assert atom.scaled # Check scaling is True for all binary columns
@@ -213,7 +212,7 @@ def test_pruning_non_sklearn(model):
est_params={"n_estimators": 10, "max_depth": 2},
ht_params={"pruner": PatientPruner(None, patience=1)},
)
- assert "PRUNED" in atom.winner.trials["state"].values
+ assert "PRUNED" in atom.winner.trials["state"].unique()
@pytest.mark.parametrize("model", ["CatB", "LGB", "XGB"])
@@ -257,7 +256,7 @@ def test_MLP_custom_n_layers():
"hidden_layer_2": IntDistribution(0, 4),
"hidden_layer_3": IntDistribution(0, 4),
"hidden_layer_4": IntDistribution(0, 4),
- }
+ },
},
)
assert "hidden_layer_1" in atom.mlp.trials
@@ -265,6 +264,7 @@ def test_MLP_custom_n_layers():
# Test ensembles =================================================== >>
+
def test_ensemble_failed_feature_importance():
"""Assert that the Stacking model works."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
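The pruning asserts now check membership against `Series.unique()` rather than the `.values` attribute; `unique()` returns just the distinct entries as a plain numpy array, making the membership check unambiguous about what it scans. A minimal sketch:

```python
import pandas as pd

trials = pd.DataFrame({"state": ["COMPLETE", "PRUNED", "COMPLETE"]})
assert "PRUNED" in trials["state"].unique()
```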
diff --git a/tests/test_nlp.py b/tests/test_nlp.py
index 5e6b8c639..70d4c6639 100644
--- a/tests/test_nlp.py
+++ b/tests/test_nlp.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -20,6 +18,7 @@
# Test TextCleaner ================================================= >>
+
def test_corpus_is_not_present():
"""Assert that an error is raised when there is no corpus."""
with pytest.raises(ValueError, match=".*contain a column named corpus.*"):
@@ -93,6 +92,7 @@ def test_drop_empty_tokens():
# Test Tokenizer =================================================== >>
+
def test_tokenization():
"""Assert that the corpus is tokenized."""
X = Tokenizer().transform([["A test"]])
@@ -133,6 +133,7 @@ def test_no_ngrams():
# Test TextNormalizer ================================================== >>
+
def test_normalizer_space_separation():
"""Assert that the corpus is separated by space if not tokenized."""
assert TextNormalizer().transform([["b c"]])["corpus"][0] == ["b", "c"]
@@ -164,6 +165,7 @@ def test_lemmatization():
# Test Vectorizer ================================================== >>
+
def test_vectorizer_space_separation():
"""Assert that the corpus is separated by space if not tokenized."""
assert "corpus_hi" in Vectorizer().fit_transform({"corpus": [["hi"], ["hi"]]})
@@ -185,12 +187,13 @@ def test_hashing():
@patch.dict(
- "sys.modules", {
+ "sys.modules",
+ {
"cuml": MagicMock(spec=["__spec__"]),
"cuml.common.device_selection": MagicMock(spec=["set_global_device_type"]),
"cuml.internals.memory_utils": MagicMock(spec=["set_global_output_type"]),
"cuml.feature_extraction.text": MagicMock(),
- }
+ },
)
def test_gpu():
"""Assert that the gpu implementation calls the get method of matrix."""
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index 584d768ca..038a9d67f 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -19,7 +17,7 @@
from .conftest import X_bin, y_bin
-@pytest.fixture
+@pytest.fixture()
def pipeline():
"""Get a pipeline from atom with/without a final estimator."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
diff --git a/tests/test_plots.py b/tests/test_plots.py
index 6f69d07a3..923b43875 100644
--- a/tests/test_plots.py
+++ b/tests/test_plots.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -9,7 +7,7 @@
import glob
from pathlib import Path
-from unittest.mock import Mock, patch
+from unittest.mock import MagicMock, Mock, patch
import numpy as np
import pandas as pd
@@ -31,6 +29,7 @@
# Test BaseFigure ================================================== >>
+
def test_get_elem():
"""Assert that elements are assigned correctly."""
base = BaseFigure()
@@ -41,6 +40,7 @@ def test_get_elem():
# Test BasePlot ==================================================== >>
+
def test_aesthetics():
"""Assert that the aesthetics getter works."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
@@ -156,11 +156,11 @@ def test_custom_title_and_legend(func):
"""Assert that title and legend can be customized."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
atom.run("Tree", errors="raise")
- atom.plot_roc(title=dict(text="test", x=0), legend=dict(font_color="red"))
+ atom.plot_roc(title={"text": "test", "x": 0}, legend={"font_color": "red"})
func.assert_called_once()
-@pytest.mark.parametrize("legend", Legend.__args__) # type: ignore
+@pytest.mark.parametrize("legend", Legend.__args__)
def test_custom_legend_position(legend):
"""Assert that the legend position can be specified."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
@@ -222,7 +222,7 @@ def test_canvas():
atom = ATOMRegressor(X_reg, y_reg, random_state=1)
atom.run("Tree")
with atom.canvas(1, 2, title="Title", display=False) as fig:
- atom.plot_residuals(title=dict(text="Residuals plot", x=0))
+ atom.plot_residuals(title={"text": "Residuals plot", "x": 0})
atom.plot_feature_importance(title="Feature importance plot")
assert fig.__class__.__name__ == "Figure"
@@ -276,6 +276,7 @@ def test_update_traces():
# Test DataPlot ==================================================== >>
+
@pytest.mark.parametrize("show", [10, None])
def test_plot_components(show):
"""Assert that the plot_components method works."""
@@ -339,7 +340,7 @@ def test_plot_relationships():
@pytest.mark.parametrize("scoring", [None, "auc"])
def test_plot_rfecv(scoring):
- """Assert that the plot_rfecv method works """
+ """Assert that the plot_rfecv method works."""
atom = ATOMClassifier(X_bin, y_bin, n_rows=0.1, random_state=1)
# Didn't run RFECV
@@ -360,6 +361,7 @@ def test_plot_wordcloud():
# Test HyperparameterTuningPlot ==================================== >>
+
def test_check_hyperparams():
"""Assert that an error is raised when models didn't run HT."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
@@ -481,6 +483,7 @@ def test_plot_trials():
# Test PredictionPlot =================================================== >>
+
def test_plot_calibration():
"""Assert that the plot_calibration method works."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
@@ -582,9 +585,9 @@ def test_plot_parshap():
atom.dummy.plot_parshap(display=False) # Without colorbar
-@patch("atom.plots.predictionplot.Parallel")
-@patch("atom.plots.predictionplot.partial_dependence")
-def test_plot_partial_dependence(_, __):
+@patch("atom.plots.predictionplot.Parallel", MagicMock())
+@patch("atom.plots.predictionplot.partial_dependence", MagicMock())
+def test_plot_partial_dependence():
"""Assert that the plot_partial_dependence method works."""
atom = ATOMClassifier(X_label, y=y_label, stratify=False, random_state=1)
atom.run("Tree")
@@ -632,7 +635,7 @@ def test_plot_pipeline():
atom.plot_pipeline(models="invalid", display=False)
# Called from a canvas
- with pytest.raises(PermissionError, match=".*called from a canvas.*"):
+ with pytest.raises(PermissionError, match=".*a canvas.*"): # noqa: PT012
with atom.canvas(2, 1, display=False):
atom.plot_results(display=False)
atom.plot_pipeline(display=False)
@@ -742,6 +745,7 @@ def test_plot_threshold_multilabel():
# Test ShapPlot ==================================================== >>
+
def test_plot_shap_fail():
"""Assert that an error is raised when the explainer can't be created."""
atom = ATOMClassifier(X_class, y=y_multiclass, random_state=1)
@@ -809,8 +813,7 @@ def test_plot_shap_heatmap():
atom.plot_shap_heatmap(display=False)
-@pytest.mark.parametrize("feature", [0, -1, "mean texture"])
-def test_plot_shap_scatter(feature):
+def test_plot_shap_scatter():
"""Assert that the plot_shap_scatter method works."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
atom.run("LR")
diff --git a/tests/test_training.py b/tests/test_training.py
index fd44f3b87..5e7296890 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs
@@ -19,6 +17,7 @@
# Test trainers ============================================== >>
+
def test_sh_skip_runs_too_large():
"""Assert that an error is raised if skip_runs >= n_runs."""
sh = SuccessiveHalvingRegressor(models=["OLS", "BR"], skip_runs=2)
@@ -37,7 +36,7 @@ def test_models_are_restored():
random_state=1,
)
sh.run(reg_train, reg_test)
- assert "Tree" not in sh._models # Original model is deleted
+ assert "Tree" not in sh._models # The original model is deleted
assert all(m in sh.models for m in ("Tree4", "AdaB2", "LGB1"))
@@ -59,6 +58,7 @@ def test_ts_different_train_sizes_types():
# Test goals ======================================================= >>
+
def test_goals_trainers():
"""Assert that the goal of every Trainer class is set correctly."""
trainer = DirectClassifier("LR")
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 3d3e84305..20446a53b 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
"""Automated Tool for Optimized Modeling (ATOM).
Author: Mavs