fix cuml 2

tvdboom · Aug 28, 2023 · b38e87b · b38e87b
1 parent 91090b4
commit b38e87b
Show file tree

Hide file tree

Showing 9 changed files with 100 additions and 63 deletions.
diff --git a/atom/atom.py b/atom/atom.py
@@ -49,7 +49,7 @@
     Transformer, __version__, check_dependency, check_is_fitted, check_scaling,
     composed, crash, custom_transform, fit_one, flt, get_cols,
     get_custom_scorer, has_task, infer_task, is_multioutput, is_sparse, lst,
-    method_to_log, sign, variable_return,
+    method_to_log, sign, variable_return, to_pyarrow
 )
 
 
@@ -843,22 +843,20 @@ def get_data(new_t: str) -> SERIES:
             "float": [(x.name, np.finfo(x.type).min, np.finfo(x.type).max) for x in t3],
         }
 
-        # Convert selected columns to the best nullable dtype
-        data = self.dataset[self.branch._get_columns(columns)]  # TODO: .convert_dtypes()
+        data = self.dataset[self.branch._get_columns(columns)]
+
+        # Convert back since convert_dtypes doesn't work properly for pyarrow dtypes
+        data = data.astype({n: to_pyarrow(c, inverse=True) for n, c in data.items()})
+
+        # Convert to the best nullable dtype
+        data = data.convert_dtypes()
 
         for name, column in data.items():
             if pd.api.types.is_sparse(column):
                 old_t = column.dtype.subtype
             else:
                 old_t = column.dtype
 
-            # TODO: Finish shrink for pyarrow
-            if "pyarrow" in old_t.name:
-                column = column.astype(column.to_numpy().dtype)
-
-            # TODO: Finish shrink for pyarrow
-            column = column.convert_dtypes()
-
             if old_t.name.startswith("string"):
                 if str2cat and column.nunique() <= int(len(column) * 0.3):
                     self.branch._data[name] = get_data("category")
@@ -886,21 +884,16 @@ def get_data(new_t: str) -> SERIES:
                 get_data(r[0]) for r in t if r[1] <= column.min() and r[2] >= column.max()
             )
 
-        # TODO: Finish shrink for pyarrow
-        from pandas.core.dtypes.cast import convert_dtypes
-        print(self.dtypes)
-        self.branch.dataset = self.branch.dataset.astype(
-            {
-                name: convert_dtypes(column, dtype_backend="pyarrow")
-                for name, column in data.items()
-            }
-        )
+        if self.engine["data"] == "pyarrow":
+            self.branch.dataset = self.branch.dataset.astype(
+                {name: to_pyarrow(col) for name, col in self.branch._data.items()}
+            )
 
         self.log("The column dtypes are successfully converted.", 1)
 
     @composed(crash, method_to_log)
     def stats(self, _vb: INT = -2, /):
-        """Print basic information about the dataset.
+        """Display basic information about the dataset.
 
         Parameters
         ----------

diff --git a/atom/basemodel.py b/atom/basemodel.py
@@ -285,7 +285,7 @@ def _gpu(self) -> bool:
     def _est_class(self) -> Predictor:
         """Return the estimator's class (not instance)."""
         try:
-            module = import_module(f"{self.engine['models']}.{self._module}")
+            module = import_module(f"{self.engine['estimator']}.{self._module}")
             cls = self._estimators.get(self.goal, self._estimators.get("reg"))
         except (ModuleNotFoundError, AttributeError):
             if "sklearn" in self.supports_engines:

diff --git a/atom/basetransformer.py b/atom/basetransformer.py
@@ -124,7 +124,7 @@ def engine(self, value: dict | None):
         elif "data" not in value and "estimator" not in value:
             raise ValueError(
                 f"Invalid value for the engine parameter, got {value}. "
-                "The value should be a dict with keys 'data' and/or 'models'."
+                "The value should be a dict with keys 'data' and/or 'estimator'."
             )
 
         if data := value.get("data"):
@@ -397,7 +397,7 @@ def _get_est_class(self, name: str, module: str) -> Predictor:
 
         """
         try:
-            return getattr(import_module(f"{self.engine['models']}.{module}"), name)
+            return getattr(import_module(f"{self.engine['estimator']}.{module}"), name)
         except (ModuleNotFoundError, AttributeError):
             return getattr(import_module(f"sklearn.{module}"), name)
 

diff --git a/atom/models.py b/atom/models.py
@@ -2983,10 +2983,11 @@ def _get_distributions(self) -> CustomDict:
             solver=Cat(["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]),
         )
 
-        if self.engine["estimator"] == "sklearnex":
-            dist.pop("solver")  # Only supports 'auto'
-        elif self.engine["estimator"] == "cuml":
-            dist["solver"] = Cat(["eig", "svd", "cd"])
+        if self.goal == "reg":
+            if self.engine["estimator"] == "sklearnex":
+                dist.pop("solver")  # Only supports 'auto'
+            elif self.engine["estimator"] == "cuml":
+                dist["solver"] = Cat(["eig", "svd", "cd"])
 
         return dist
 

diff --git a/atom/utils.py b/atom/utils.py
@@ -41,7 +41,6 @@
 from optuna.study import Study
 from optuna.trial import FrozenTrial
 from pandas.api.types import is_numeric_dtype
-from pandas.core.dtypes.cast import convert_dtypes
 from shap import Explainer, Explanation
 from sklearn.metrics import (
     confusion_matrix, get_scorer, get_scorer_names, make_scorer,
@@ -1744,6 +1743,32 @@ def n_cols(data: FEATURES | TARGET | None) -> int:
             return array.ndim  # Can be 0 when input is a dict
 
 
+def to_pyarrow(column: SERIES, inverse: bool = False) -> str:
+    """Return a column's dtype name with the pyarrow suffix toggled.
+
+    Parameters
+    ----------
+    column: series
+        Column whose dtype name is read. A name that is already in
+        the requested form is returned unchanged.
+
+    inverse: bool, default=False
+        If False, append `[pyarrow]`. If True, strip that suffix.
+
+    Returns
+    -------
+    str
+        Dtype name with or without the `[pyarrow]` suffix.
+
+    """
+    name = column.dtype.name
+    is_arrow = name.endswith("[pyarrow]")
+    if inverse:
+        # Drop the 9-character "[pyarrow]" suffix when present
+        return name[:-9] if is_arrow else name
+    return name if is_arrow else f"{name}[pyarrow]"
+
+
 def to_df(
     data: FEATURES | None,
     index: SEQUENCE | None = None,
@@ -1791,13 +1816,8 @@ def to_df(
             if dtype is not None:
                 data = data.astype(dtype)
 
-        if os.environ.get("ATOM_DATA_ENGINE") == "pyarrow" and not is_sparse(data):
-            data = data.astype(
-                {
-                    name: convert_dtypes(column, dtype_backend="pyarrow")
-                    for name, column in data.items()
-                }
-            )
+        if os.environ.get("ATOM_DATA_ENGINE") == "pyarrow":
+            data = data.astype({name: to_pyarrow(col) for name, col in data.items()})
 
     return data
 
@@ -1844,8 +1864,8 @@ def to_series(
                     dtype=dtype,
                 )
 
-        if os.environ.get("ATOM_DATA_ENGINE") == "pyarrow" and not is_sparse(data):
-            data = data.astype(convert_dtypes(data, dtype_backend="pyarrow"))
+        if os.environ.get("ATOM_DATA_ENGINE") == "pyarrow":
+            data = data.astype(to_pyarrow(data))
 
     return data
 
@@ -2050,7 +2070,7 @@ def get_custom_scorer(metric: str | Callable | Scorer) -> Scorer:
         scorer = make_scorer(score_func=metric)
 
     # If no name was assigned, use the name of the function
-    if not hasattr(scorer, name):
+    if not hasattr(scorer, "name"):
         scorer.name = scorer._score_func.__name__
 
     return scorer

diff --git a/docs_sources/user_guide/accelerating.md b/docs_sources/user_guide/accelerating.md
@@ -8,7 +8,7 @@ its functionalities [here](https://pandas.pydata.org/docs/user_guide/pyarrow.htm
 
 !!! warning
     The pyarrow backend doesn't work for [sparse datasets][]. If the
-    dataset has any sparse columns, the type conversion is skipped silently.
+    dataset has any sparse columns, an exception is raised.
 
 [modin](https://modin.readthedocs.io/en/stable/), a multi-threading, drop-in replacement for pandas, that uses Ray as backend.
 
@@ -18,8 +18,20 @@ its functionalities [here](https://pandas.pydata.org/docs/user_guide/pyarrow.htm
 
 ## Estimator acceleration
 
-Only transformers and predictors are converted to the 
-Metrics are not accelerated, to use a metric from cuML, use atom.rtun(metric=cuml_accuracy)...
+Only transformers and predictors are converted to the requested engine. Metrics
+are not accelerated. To use a metric from cuML, pass it directly to the
+[`run`][atomclassifier-run] method:
+
+```python
+from atom import ATOMClassifier
+from cuml.metrics import accuracy_score
+from sklearn.datasets import make_classification
+
+X, y = make_classification(n_samples=100, random_state=1)
+
+atom = ATOMClassifier(X, y, engine={"estimator": "cuml"}, verbose=2)
+atom.run("LR", metric=accuracy_score)
+```
 
 
 !!! warning

diff --git a/tests/test_atom.py b/tests/test_atom.py
@@ -375,66 +375,77 @@ def test_save_data():
 
 def test_shrink_dtypes_excluded():
     """Assert that some dtypes are excluded from changing."""
-    atom = ATOMClassifier(X10_str2, y10, random_state=1)
-    assert atom.dtypes[3].name == "bool"
+    X = X_bin.copy()
+    X["date"] = pd.date_range(start="1/1/2018", periods=len(X))
+
+    atom = ATOMClassifier(X, y_bin, random_state=1)
+    assert atom.dtypes[-2].name == "datetime64[ns]"
     atom.shrink()
-    assert atom.dtypes[3].name == "bool"
+    assert atom.dtypes[-2].name == "datetime64[ns]"  # Unchanged
 
 
-def test_shrink_obj2cat():
-    """Assert that the obj2cat parameter works as intended."""
+def test_shrink_str2cat():
+    """Assert that the str2cat parameter works as intended."""
     atom = ATOMClassifier(X10_str2, y10, random_state=1)
-    atom.shrink(obj2cat=False)
-    assert atom.dtypes[2].name == "object"
+    atom.shrink(str2cat=False)
+    assert atom.dtypes[2].name == "string"
 
-    atom.shrink()
+    atom.shrink(str2cat=True)
     assert atom.dtypes[2].name == "category"
 
 
 def test_shrink_int2uint():
     """Assert that the int2uint parameter works as intended."""
     atom = ATOMClassifier(X10_str2, y10, random_state=1)
     assert atom.dtypes[0].name == "int64"
-    atom.shrink()
-    assert atom.dtypes[0].name == "int8"
 
-    assert atom.dtypes[0].name == "int8"
+    atom.shrink(int2uint=False)
+    assert atom.dtypes[0].name == "Int8"
+
     atom.shrink(int2uint=True)
-    assert atom.dtypes[0].name == "uint8"
+    assert atom.dtypes[0].name == "UInt8"
 
 
 def test_shrink_sparse_arrays():
     """Assert that sparse arrays are also transformed."""
     atom = ATOMClassifier(X_sparse, y10, random_state=1)
     assert atom.dtypes[0].name == "Sparse[int64, 0]"
     atom.shrink()
-    assert atom.dtypes[0].name == "Sparse[int8, 0]"
+    assert atom.dtypes[0].name == "Sparse[Int8, 0]"
 
 
 def test_shrink_dtypes_unchanged():
     """Assert that optimal dtypes are left unchanged."""
-    atom = ATOMClassifier(X_bin.astype("float32"), y_bin, random_state=1)
-    assert atom.dtypes[3].name == "float32"
+    atom = ATOMClassifier(X_bin.astype("Float32"), y_bin, random_state=1)
+    assert atom.dtypes[3].name == "Float32"
     atom.shrink()
-    assert atom.dtypes[3].name == "float32"
+    assert atom.dtypes[3].name == "Float32"
 
 
 def test_shrink_dense2sparse():
     """Assert that the dataset can be converted to sparse."""
     atom = ATOMClassifier(X_bin, y_bin, random_state=1)
     assert atom.dtypes[0].name == "float64"
     atom.shrink(dense2sparse=True)
-    assert atom.dtypes[0].name.startswith("Sparse[float32")
+    assert atom.dtypes[0].name.startswith("Sparse[Float32")
+
+
+def test_shrink_pyarrow():
+    """Assert that shrink turns double into float with the pyarrow backend."""
+    atom = ATOMClassifier(X_bin, y_bin, engine={"data": "pyarrow"}, random_state=1)
+    assert atom.dtypes[0].name == "double[pyarrow]"  # Before shrinking
+    atom.shrink()
+    assert atom.dtypes[0].name == "float[pyarrow]"  # After shrinking
 
 
 def test_shrink_exclude_columns():
     """Assert that columns can be excluded."""
     atom = ATOMClassifier(X_bin, y_bin, random_state=1)
     assert atom.dtypes[0].name == "float64"
-    assert atom.dtypes[-1].name != "int8"
+    assert atom.dtypes[-1].name != "Int8"
     atom.shrink(columns=-1)
     assert atom.dtypes[0].name == "float64"
-    assert atom.dtypes[-1].name == "int8"
+    assert atom.dtypes[-1].name == "Int8"
 
 
 def test_stats_mixed_sparse_dense():

diff --git a/tests/test_models.py b/tests/test_models.py
@@ -126,7 +126,7 @@ def test_models_sklearnex_regression():
     )
 
 
-@patch.dict("sys.modules", {"cuml": MagicMock(spec=["__spec__"])})
+@patch.dict("sys.modules", {"cuml": MagicMock(spec=["__spec__", "internals"])})
 def test_models_cuml_classification():
     """Assert that all classification models can be called with cuml."""
     atom = ATOMClassifier(X_bin, y_bin, engine={"estimator": "cuml"}, random_state=1)
@@ -149,7 +149,7 @@ def test_models_cuml_classification():
     )
 
 
-@patch.dict("sys.modules", {"cuml": MagicMock(spec=["__spec__"])})
+@patch.dict("sys.modules", {"cuml": MagicMock(spec=["__spec__", "internals"])})
 def test_models_cuml_regression():
     """Assert that all regression models can be called with cuml."""
     atom = ATOMRegressor(X_reg, y_reg, engine={"estimator": "cuml"}, random_state=1)

diff --git a/tests/test_nlp.py b/tests/test_nlp.py
@@ -188,7 +188,7 @@ def test_hashing():
     assert "hash1" in X
 
 
-@patch.dict("sys.modules", {"cuml": MagicMock(spec=["__spec__"])})
+@patch.dict("sys.modules", {"cuml": MagicMock(spec=["__spec__", "internals"])})
 @patch.dict("sys.modules", {"cuml.feature_extraction.text": MagicMock()})
 def test_gpu():
     """Assert that the gpu implementation calls the get method of matrix."""