diff --git a/atom/atom.py b/atom/atom.py index b8bff81b7..21897812f 100644 --- a/atom/atom.py +++ b/atom/atom.py @@ -49,7 +49,7 @@ Transformer, __version__, check_dependency, check_is_fitted, check_scaling, composed, crash, custom_transform, fit_one, flt, get_cols, get_custom_scorer, has_task, infer_task, is_multioutput, is_sparse, lst, - method_to_log, sign, variable_return, + method_to_log, sign, variable_return, to_pyarrow ) @@ -843,8 +843,13 @@ def get_data(new_t: str) -> SERIES: "float": [(x.name, np.finfo(x.type).min, np.finfo(x.type).max) for x in t3], } - # Convert selected columns to the best nullable dtype - data = self.dataset[self.branch._get_columns(columns)] # TODO: .convert_dtypes() + data = self.dataset[self.branch._get_columns(columns)] + + # Convert back since convert_dtypes doesn't work properly for pyarrow dtypes + data = data.astype({n: to_pyarrow(c, inverse=True) for n, c in data.items()}) + + # Convert to the best nullable dtype + data = data.convert_dtypes() for name, column in data.items(): if pd.api.types.is_sparse(column): @@ -852,13 +857,6 @@ def get_data(new_t: str) -> SERIES: else: old_t = column.dtype - # TODO: Finish shrink for pyarrow - if "pyarrow" in old_t.name: - column = column.astype(column.to_numpy().dtype) - - # TODO: Finish shrink for pyarrow - column = column.convert_dtypes() - if old_t.name.startswith("string"): if str2cat and column.nunique() <= int(len(column) * 0.3): self.branch._data[name] = get_data("category") @@ -886,21 +884,16 @@ def get_data(new_t: str) -> SERIES: get_data(r[0]) for r in t if r[1] <= column.min() and r[2] >= column.max() ) - # TODO: Finish shrink for pyarrow - from pandas.core.dtypes.cast import convert_dtypes - print(self.dtypes) - self.branch.dataset = self.branch.dataset.astype( - { - name: convert_dtypes(column, dtype_backend="pyarrow") - for name, column in data.items() - } - ) + if self.engine["data"] == "pyarrow": + self.branch.dataset = self.branch.dataset.astype( + {name: to_pyarrow(col) for name, col in self.branch._data.items()} + ) self.log("The column dtypes are successfully converted.", 1) @composed(crash, method_to_log) def stats(self, _vb: INT = -2, /): - """Print basic information about the dataset. + """Display basic information about the dataset. Parameters ---------- diff --git a/atom/basemodel.py b/atom/basemodel.py index c2b720cf5..c1f7632a8 100644 --- a/atom/basemodel.py +++ b/atom/basemodel.py @@ -285,7 +285,7 @@ def _gpu(self) -> bool: def _est_class(self) -> Predictor: """Return the estimator's class (not instance).""" try: - module = import_module(f"{self.engine['models']}.{self._module}") + module = import_module(f"{self.engine['estimator']}.{self._module}") cls = self._estimators.get(self.goal, self._estimators.get("reg")) except (ModuleNotFoundError, AttributeError): if "sklearn" in self.supports_engines: diff --git a/atom/basetransformer.py b/atom/basetransformer.py index a2acf9186..72a14725e 100644 --- a/atom/basetransformer.py +++ b/atom/basetransformer.py @@ -124,7 +124,7 @@ def engine(self, value: dict | None): elif "data" not in value and "estimator" not in value: raise ValueError( f"Invalid value for the engine parameter, got {value}. " - "The value should be a dict with keys 'data' and/or 'models'." + "The value should be a dict with keys 'data' and/or 'estimator'." ) if data := value.get("data"): @@ -397,7 +397,7 @@ def _get_est_class(self, name: str, module: str) -> Predictor: """ try: - return getattr(import_module(f"{self.engine['models']}.{module}"), name) + return getattr(import_module(f"{self.engine['estimator']}.{module}"), name) except (ModuleNotFoundError, AttributeError): return getattr(import_module(f"sklearn.{module}"), name) diff --git a/atom/models.py b/atom/models.py index 2647e6d99..3fb5746fd 100644 --- a/atom/models.py +++ b/atom/models.py @@ -2983,10 +2983,11 @@ def _get_distributions(self) -> CustomDict: solver=Cat(["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]), ) - if self.engine["estimator"] == "sklearnex": - dist.pop("solver") # Only supports 'auto' - elif self.engine["estimator"] == "cuml": - dist["solver"] = Cat(["eig", "svd", "cd"]) + if self.goal == "reg": + if self.engine["estimator"] == "sklearnex": + dist.pop("solver") # Only supports 'auto' + elif self.engine["estimator"] == "cuml": + dist["solver"] = Cat(["eig", "svd", "cd"]) return dist diff --git a/atom/utils.py b/atom/utils.py index 34d83813d..705e12755 100644 --- a/atom/utils.py +++ b/atom/utils.py @@ -41,7 +41,6 @@ from optuna.study import Study from optuna.trial import FrozenTrial from pandas.api.types import is_numeric_dtype -from pandas.core.dtypes.cast import convert_dtypes from shap import Explainer, Explanation from sklearn.metrics import ( confusion_matrix, get_scorer, get_scorer_names, make_scorer, @@ -1744,6 +1743,32 @@ def n_cols(data: FEATURES | TARGET | None) -> int: return array.ndim # Can be 0 when input is a dict +def to_pyarrow(column: SERIES, inverse: bool = False) -> str: + """Get the pyarrow dtype corresponding to a series. + + Parameters + ---------- + column: series + Column to get the dtype from. If it already has a pyarrow + dtype, return original dtype. + + inverse: bool, default=False + Whether to convert to pyarrow or back from pyarrow. + + Returns + ------- + str + Name of the converted dtype. + + """ + if not inverse and not column.dtype.name.endswith("[pyarrow]"): + return f"{column.dtype.name}[pyarrow]" + elif inverse and column.dtype.name.endswith("[pyarrow]"): + return column.dtype.name[:-9] + + return column.dtype.name + + def to_df( data: FEATURES | None, index: SEQUENCE | None = None, @@ -1791,13 +1816,8 @@ def to_df( if dtype is not None: data = data.astype(dtype) - if os.environ.get("ATOM_DATA_ENGINE") == "pyarrow" and not is_sparse(data): - data = data.astype( - { - name: convert_dtypes(column, dtype_backend="pyarrow") - for name, column in data.items() - } - ) + if os.environ.get("ATOM_DATA_ENGINE") == "pyarrow": + data = data.astype({name: to_pyarrow(col) for name, col in data.items()}) return data @@ -1844,8 +1864,8 @@ def to_series( dtype=dtype, ) - if os.environ.get("ATOM_DATA_ENGINE") == "pyarrow" and not is_sparse(data): - data = data.astype(convert_dtypes(data, dtype_backend="pyarrow")) + if os.environ.get("ATOM_DATA_ENGINE") == "pyarrow": + data = data.astype(to_pyarrow(data)) return data @@ -2050,7 +2070,7 @@ def get_custom_scorer(metric: str | Callable | Scorer) -> Scorer: scorer = make_scorer(score_func=metric) # If no name was assigned, use the name of the function - if not hasattr(scorer, name): + if not hasattr(scorer, "name"): scorer.name = scorer._score_func.__name__ return scorer diff --git a/docs_sources/user_guide/accelerating.md b/docs_sources/user_guide/accelerating.md index 59a1b8468..7fe87f305 100644 --- a/docs_sources/user_guide/accelerating.md +++ b/docs_sources/user_guide/accelerating.md @@ -8,7 +8,7 @@ its functionalities [here](https://pandas.pydata.org/docs/user_guide/pyarrow.htm !!! warning The pyarrow backend doesn't work for [sparse datasets][]. If the - dataset has any sparse columns, the type conversion is skipped silently. + dataset has any sparse columns, an exception is raised. [modin](https://modin.readthedocs.io/en/stable/), a multi-threading, drop-in replacement for pandas, that uses Ray as backend. @@ -18,8 +18,20 @@ its functionalities [here](https://pandas.pydata.org/docs/user_guide/pyarrow.htm ## Estimator acceleration -Only transformers and predictors are converted to the -Metrics are not accelerated, to use a metric from cuML, use atom.rtun(metric=cuml_accuracy)... +Only transformers and predictors are converted to the requested engine. Metrics +are not accelerated, to use a metric from cuML, insert it directly in the +[`run`][atomclassifier-run] method: + +```python +from atom import ATOMClassifier +from cuml.metrics import accuracy_score +from sklearn.datasets import make_classification + +X, y = make_classification(n_samples=100, random_state=1) + +atom = ATOMClassifier(X, y, engine={"estimator": "cuml"}, verbose=2) +atom.run("LR", metric=accuracy_score) +``` !!! warning diff --git a/tests/test_atom.py b/tests/test_atom.py index 19a51c7f7..a4936a359 100644 --- a/tests/test_atom.py +++ b/tests/test_atom.py @@ -375,19 +375,22 @@ def test_save_data(): def test_shrink_dtypes_excluded(): """Assert that some dtypes are excluded from changing.""" - atom = ATOMClassifier(X10_str2, y10, random_state=1) - assert atom.dtypes[3].name == "bool" + X = X_bin.copy() + X["date"] = pd.date_range(start="1/1/2018", periods=len(X)) + + atom = ATOMClassifier(X, y_bin, random_state=1) + assert atom.dtypes[-2].name == "datetime64[ns]" atom.shrink() - assert atom.dtypes[3].name == "bool" + assert atom.dtypes[-2].name == "datetime64[ns]" # Unchanged -def test_shrink_obj2cat(): - """Assert that the obj2cat parameter works as intended.""" +def test_shrink_str2cat(): + """Assert that the str2cat parameter works as intended.""" atom = ATOMClassifier(X10_str2, y10, random_state=1) - atom.shrink(obj2cat=False) - assert atom.dtypes[2].name == "object" + atom.shrink(str2cat=False) + assert atom.dtypes[2].name == "string" - atom.shrink() + atom.shrink(str2cat=True) assert atom.dtypes[2].name == "category" @@ -395,12 +398,12 @@ def test_shrink_int2uint(): """Assert that the int2uint parameter works as intended.""" atom = ATOMClassifier(X10_str2, y10, random_state=1) assert atom.dtypes[0].name == "int64" - atom.shrink() - assert atom.dtypes[0].name == "int8" - assert atom.dtypes[0].name == "int8" + atom.shrink(int2uint=False) + assert atom.dtypes[0].name == "Int8" + atom.shrink(int2uint=True) - assert atom.dtypes[0].name == "uint8" + assert atom.dtypes[0].name == "UInt8" def test_shrink_sparse_arrays(): @@ -408,15 +411,15 @@ def test_shrink_sparse_arrays(): atom = ATOMClassifier(X_sparse, y10, random_state=1) assert atom.dtypes[0].name == "Sparse[int64, 0]" atom.shrink() - assert atom.dtypes[0].name == "Sparse[int8, 0]" + assert atom.dtypes[0].name == "Sparse[Int8, 0]" def test_shrink_dtypes_unchanged(): """Assert that optimal dtypes are left unchanged.""" - atom = ATOMClassifier(X_bin.astype("float32"), y_bin, random_state=1) - assert atom.dtypes[3].name == "float32" + atom = ATOMClassifier(X_bin.astype("Float32"), y_bin, random_state=1) + assert atom.dtypes[3].name == "Float32" atom.shrink() - assert atom.dtypes[3].name == "float32" + assert atom.dtypes[3].name == "Float32" def test_shrink_dense2sparse(): @@ -424,17 +427,25 @@ def test_shrink_dense2sparse(): atom = ATOMClassifier(X_bin, y_bin, random_state=1) assert atom.dtypes[0].name == "float64" atom.shrink(dense2sparse=True) - assert atom.dtypes[0].name.startswith("Sparse[float32") + assert atom.dtypes[0].name.startswith("Sparse[Float32") + + +def test_shrink_pyarrow(): + """Assert that it works with the pyarrow data backend.""" + atom = ATOMClassifier(X_bin, y_bin, engine={"data": "pyarrow"}, random_state=1) + assert atom.dtypes[0].name == "double[pyarrow]" + atom.shrink() + assert atom.dtypes[0].name == "float[pyarrow]" def test_shrink_exclude_columns(): """Assert that columns can be excluded.""" atom = ATOMClassifier(X_bin, y_bin, random_state=1) assert atom.dtypes[0].name == "float64" - assert atom.dtypes[-1].name != "int8" + assert atom.dtypes[-1].name != "Int8" atom.shrink(columns=-1) assert atom.dtypes[0].name == "float64" - assert atom.dtypes[-1].name == "int8" + assert atom.dtypes[-1].name == "Int8" def test_stats_mixed_sparse_dense(): diff --git a/tests/test_models.py b/tests/test_models.py index 44d4ebc4b..59689b80b 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -126,7 +126,7 @@ def test_models_sklearnex_regression(): ) -@patch.dict("sys.modules", {"cuml": MagicMock(spec=["__spec__"])}) +@patch.dict("sys.modules", {"cuml": MagicMock(spec=["__spec__", "internals"])}) def test_models_cuml_classification(): """Assert that all classification models can be called with cuml.""" atom = ATOMClassifier(X_bin, y_bin, engine={"estimator": "cuml"}, random_state=1) @@ -149,7 +149,7 @@ def test_models_cuml_classification(): ) -@patch.dict("sys.modules", {"cuml": MagicMock(spec=["__spec__"])}) +@patch.dict("sys.modules", {"cuml": MagicMock(spec=["__spec__", "internals"])}) def test_models_cuml_regression(): """Assert that all regression models can be called with cuml.""" atom = ATOMRegressor(X_reg, y_reg, engine={"estimator": "cuml"}, random_state=1) diff --git a/tests/test_nlp.py b/tests/test_nlp.py index 4e3fec2fa..88bc83e7e 100644 --- a/tests/test_nlp.py +++ b/tests/test_nlp.py @@ -188,7 +188,7 @@ def test_hashing(): assert "hash1" in X -@patch.dict("sys.modules", {"cuml": MagicMock(spec=["__spec__"])}) +@patch.dict("sys.modules", {"cuml": MagicMock(spec=["__spec__", "internals"])}) @patch.dict("sys.modules", {"cuml.feature_extraction.text": MagicMock()}) def test_gpu(): """Assert that the gpu implementation calls the get method of matrix."""