add ignore features

tvdboom · Dec 7, 2023 · b194a56 · b194a56
1 parent d7a225e
commit b194a56
Show file tree

Hide file tree

Showing 331 changed files with 642 additions and 875 deletions.
diff --git a/.github/workflows/config.yml b/.github/workflows/config.yml
@@ -11,21 +11,9 @@ on:
       - development
 
 jobs:
-  isort:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Check out source repository
-        uses: actions/checkout@v3
-      - name: Set up Python environment
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.11"
-      - name: Check imports
-        uses: jamescurtin/isort-action@master
-        with:
-          sortPaths: atom tests
+  # Run the checks through pre-commit so CI uses the same tool versions as the local hooks
 
-  flake8:
+  isort:
     runs-on: ubuntu-latest
     steps:
       - name: Check out source repository
@@ -35,11 +23,11 @@ jobs:
         with:
           python-version: "3.11"
       - name: Install dependencies
-        run: pip install flake8 flake8-pyproject
-      - name: Apply linting
-        run: flake8 --show-source --statistics atom tests
+        run: pip install -U pip pre-commit
+      - name: Apply isort
+        run: pre-commit run isort --all-files
 
-  pydocstyle:
+  ruff:
     runs-on: ubuntu-latest
     steps:
       - name: Check out source repository
@@ -49,9 +37,9 @@ jobs:
         with:
           python-version: "3.11"
       - name: Install dependencies
-        run: pip install pydocstyle
-      - name: Apply docstring check
-        run: pydocstyle atom tests
+        run: pip install -U pip pre-commit
+      - name: Apply linting
+        run: pre-commit run ruff --all-files
 
   mypy:
     runs-on: ubuntu-latest
@@ -63,11 +51,9 @@ jobs:
         with:
           python-version: "3.11"
       - name: Install dependencies
-        run: |
-          pip install -U pip
-          pip install -U mypy types-requests pandas-stubs beartype
+        run: pip install -U pip pre-commit
       - name: Check type hints
-        run: mypy atom tests
+        run: pre-commit run mypy --all-files
 
   code-quality-codeql:
     runs-on: ubuntu-latest

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -13,14 +13,6 @@ repos:
       args: ["--fix"]
       files: ^atom/.*\.py$|tests/.*\.py$
 
-  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
-    hooks:
-    - id: check-yaml
-    - id: end-of-file-fixer
-    - id: mixed-line-ending
-    - id: check-merge-conflict
-
   - repo: https://github.com/pre-commit/mirrors-mypy
     rev: v1.7.1
     hooks:

diff --git a/README.md b/README.md
@@ -32,7 +32,7 @@
 **Release** | [![pdm-managed](https://img.shields.io/badge/pdm-managed-blueviolet)](https://pdm.fming.dev) [![PyPI version](https://img.shields.io/pypi/v/atom-ml)](https://pypi.org/project/atom-ml/) [![Conda Version](https://img.shields.io/conda/vn/conda-forge/atom-ml.svg)](https://anaconda.org/conda-forge/atom-ml) [![DOI](https://zenodo.org/badge/195069958.svg)](https://zenodo.org/badge/latestdoi/195069958)
 **Compatibility** | [![Python 3.10\|3.11](https://img.shields.io/badge/python-3.10%20%7C%203.11-blue?logo=python)](https://www.python.org) [![Conda Platforms](https://img.shields.io/conda/pn/conda-forge/atom-ml.svg)](https://anaconda.org/conda-forge/atom-ml)
 **Build status** | [![Build Status](https://github.com/tvdboom/ATOM/workflows/ATOM/badge.svg)](https://github.com/tvdboom/ATOM/actions) [![Azure Pipelines](https://dev.azure.com/conda-forge/feedstock-builds/_apis/build/status/atom-ml-feedstock?branchName=master)](https://dev.azure.com/conda-forge/feedstock-builds/_build/latest?definitionId=10822&branchName=master) [![codecov](https://codecov.io/gh/tvdboom/ATOM/branch/master/graph/badge.svg)](https://codecov.io/gh/tvdboom/ATOM)
-**Code analysis** | [![PEP8](https://img.shields.io/badge/code%20style-pep8-orange.svg)](https://www.python.org/dev/peps/pep-0008/) [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/) [![flake8](https://img.shields.io/badge/flake8-checked-blue)](https://flake8.pycqa.org/en/latest/) [![mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://www.mypy-lang.org/)
+**Code analysis** | [![PEP8](https://img.shields.io/badge/code%20style-pep8-orange.svg)](https://www.python.org/dev/peps/pep-0008/) [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/) [![ruff](https://img.shields.io/badge/ruff-checked-blue)](https://docs.astral.sh/ruff/) [![mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://www.mypy-lang.org/)
 
 
 <br><br>

diff --git a/atom/api.py b/atom/api.py
@@ -17,8 +17,8 @@
 
 from atom.atom import ATOM
 from atom.utils.types import (
-    Backend, Bool, Engine, IndexSelector, IntLargerEqualZero, NJobs, Predictor,
-    Scalar, Verbose, Warnings, YSelector,
+    Backend, Bool, ColumnSelector, Engine, IndexSelector, IntLargerEqualZero,
+    NJobs, Predictor, Scalar, Verbose, Warnings, YSelector,
 )
 from atom.utils.utils import Goal
 
@@ -187,6 +187,10 @@ class ATOMClassifier(ATOM):
         - If str: Name of the column to use as index.
         - If sequence: Array with shape=(n_samples,) to use as index.
 
+    ignore: int, str, sequence or None, default=None
+        Features in X to ignore during data transformations and model
+        training. The features are still used in the remaining methods.
+
     test_size: int or float, default=0.2
         - If <=1: Fraction of the dataset to include in the test set.
         - If >1: Number of rows to include in the test set.
@@ -218,8 +222,8 @@ class ATOMClassifier(ATOM):
         This parameter is ignored if `shuffle=False` or if the test
         set is provided through `arrays`.
 
-        For [multioutput tasks][], stratification is applied to the
-        joint target columns.
+        For [multioutput tasks][], stratification applies to the joint
+        target columns.
 
     n_rows: int or float, default=1
         Random subsample of the dataset to use. The default value selects
@@ -348,6 +352,7 @@ def __init__(
         *arrays,
         y: YSelector = -1,
         index: IndexSelector = False,
+        ignore: ColumnSelector | None = None,
         shuffle: Bool = True,
         stratify: IndexSelector = True,
         n_rows: Scalar = 1,
@@ -368,6 +373,7 @@ def __init__(
             arrays=arrays,
             y=y,
             index=index,
+            ignore=ignore,
             test_size=test_size,
             holdout_size=holdout_size,
             shuffle=shuffle,
@@ -403,7 +409,7 @@ class ATOMForecaster(ATOM):
     Parameters
     ----------
     *arrays: sequence of indexables
-        Dataset containing exogeneous features and time series. Allowed
+        Dataset containing exogenous features and time series. Allowed
         formats are:
 
         - X
@@ -417,7 +423,7 @@ class ATOMForecaster(ATOM):
         - (X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)
 
         **X, train, test: dataframe-like**<br>
-        Exogeneous feature set corresponding to y, with shape=(n_samples,
+        Exogenous feature set corresponding to y, with shape=(n_samples,
         n_features).
 
         **y: int, str or sequence**<br>
@@ -442,6 +448,11 @@ class ATOMForecaster(ATOM):
         This parameter is ignored if the time series is provided
         through `arrays`.
 
+    ignore: int, str, sequence or None, default=None
+        Exogenous features in X to ignore during data transformations
+        and model training. The features are still used in the remaining
+        methods.
+
     test_size: int or float, default=0.2
         - If <=1: Fraction of the dataset to include in the test set.
         - If >1: Number of rows to include in the test set.
@@ -580,6 +591,7 @@ def __init__(
         self,
         *arrays,
         y: YSelector = -1,
+        ignore: ColumnSelector | None = None,
         n_rows: Scalar = 1,
         test_size: Scalar = 0.2,
         holdout_size: Scalar | None = None,
@@ -598,6 +610,7 @@ def __init__(
             arrays=arrays,
             y=y,
             index=True,
+            ignore=ignore,
             test_size=test_size,
             holdout_size=holdout_size,
             shuffle=False,
@@ -678,6 +691,10 @@ class ATOMRegressor(ATOM):
         - If str: Name of the column to use as index.
         - If sequence: Array with shape=(n_samples,) to use as index.
 
+    ignore: int, str, sequence or None, default=None
+        Features in X to ignore during data transformations and model
+        training. The features are still used in the remaining methods.
+
     test_size: int or float, default=0.2
         - If <=1: Fraction of the dataset to include in the test set.
         - If >1: Number of rows to include in the test set.
@@ -825,6 +842,7 @@ def __init__(
         *arrays,
         y: YSelector = -1,
         index: IndexSelector = False,
+        ignore: ColumnSelector | None = None,
         shuffle: Bool = True,
         n_rows: Scalar = 1,
         test_size: Scalar = 0.2,
@@ -844,6 +862,7 @@ def __init__(
             arrays=arrays,
             y=y,
             index=index,
+            ignore=ignore,
             test_size=test_size,
             holdout_size=holdout_size,
             shuffle=shuffle,

diff --git a/atom/atom.py b/atom/atom.py
@@ -86,15 +86,15 @@ class ATOM(BaseRunner, ATOMPlot, metaclass=ABCMeta):
 
     @property
     @abstractmethod
-    def _goal(self) -> Goal:
-        ...
+    def _goal(self) -> Goal: ...
 
     def __init__(
         self,
         arrays,
         *,
         y: YSelector = -1,
         index: IndexSelector = False,
+        ignore: ColumnSelector | None = None,
         shuffle: Bool = True,
         stratify: IndexSelector = True,
         n_rows: Scalar = 1,
@@ -133,17 +133,18 @@ def __init__(
             holdout_size=holdout_size,
         )
 
-        self._missing = DEFAULT_MISSING
-
-        self._models = ClassMap()
-        self._metric = ClassMap()
-
         self._log("<< ================== ATOM ================== >>", 1)
 
         # Initialize the branch system and fill with data
         self._branches = BranchManager(memory=self.memory)
         self._branches.fill(*self._get_data(arrays, y=y))
 
+        self.ignore = ignore  # type: ignore[assignment]
+        self.missing = DEFAULT_MISSING
+
+        self._models = ClassMap()
+        self._metric = ClassMap()
+
         self._log("\nConfiguration ==================== >>", 1)
         self._log(f"Algorithm task: {self.task}.", 1)
         if self.n_jobs > 1:
@@ -266,10 +267,27 @@ def branch(self):
         self._branches.branches.remove(current)
         self._branches.current = self._branches[0].name
         self._log(
-            f"Branch {current} successfully deleted. Switched to branch {self.branch.name}.",
-            1,
+            f"Branch {current} successfully deleted. "
+            f"Switched to branch {self.branch.name}.", 1,
         )
 
+    @property
+    def ignore(self) -> tuple[str, ...]:
+        """Names of the ignored columns.
+
+        These columns are used neither in the transformer pipeline nor
+        for model training.
+
+        """
+        return self._config.ignore
+
+    @ignore.setter
+    def ignore(self, value: ColumnSelector | None):
+        if value is not None:  # resolve the selector via the branch; target columns excluded
+            self._config.ignore = tuple(self.branch._get_columns(value, include_target=False))
+        else:  # None resets to an empty tuple (nothing ignored)
+            self._config.ignore = ()
+
     @property
     def missing(self) -> list[Any]:
         """Values that are considered "missing".
@@ -1138,23 +1156,27 @@ def _add_transformer(
         # Add BaseTransformer params to the estimator if left to default
         transformer_c = self._inherit(transformer_c)
 
-        # Transformers remember the train_only and cols parameters
         if not hasattr(transformer_c, "_train_only"):
             transformer_c._train_only = train_only
+
         if columns is not None:
-            inc = self.branch._get_columns(columns)
-            fxs_in_inc = any(c in self.features for c in inc)
-            target_in_inc = any(c in lst(self.target) for c in inc)
-            if fxs_in_inc and target_in_inc:
+            cols = self.branch._get_columns(columns)
+        else:
+            cols = list(self.branch.features)
+
+        # Columns in self.ignore are not transformed
+        if self.ignore:
+            cols = [c for c in cols if c not in self.ignore]
+
+        if cols != list(self.branch.features):
+            if any(c in self.features for c in cols) and any(c in lst(self.target) for c in cols):
                 self._log(
                     "Features and target columns passed to transformer "
                     f"{transformer_c.__class__.__name__}. Either select features or "
                     "the target column, not both at the same time. The transformation "
-                    "of the target column will be ignored.",
-                    1,
-                    severity="warning",
+                    "of the target column is ignored.", 1, severity="warning",
                 )
-            transformer_c._cols = inc
+            transformer_c._cols = cols
 
         # Add custom cloning method to keep internal attrs
         transformer_c.__class__.__sklearn_clone__ = TransformerMixin.__sklearn_clone__
@@ -2052,8 +2074,7 @@ def _run(self, trainer: BaseRunner):
                 self._delete_models(model.name)
                 self._log(
                     f"Consecutive runs of model {model.name}. "
-                    "The former model has been overwritten.",
-                    1,
+                    "The former model has been overwritten.", 1,
                 )
 
         self._models.extend(trainer._models)