Skip to content

Commit

Permalink
fix cuml 2
Browse files Browse the repository at this point in the history
  • Loading branch information
tvdboom committed Aug 28, 2023
1 parent 91090b4 commit b38e87b
Show file tree
Hide file tree
Showing 9 changed files with 100 additions and 63 deletions.
33 changes: 13 additions & 20 deletions atom/atom.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
Transformer, __version__, check_dependency, check_is_fitted, check_scaling,
composed, crash, custom_transform, fit_one, flt, get_cols,
get_custom_scorer, has_task, infer_task, is_multioutput, is_sparse, lst,
method_to_log, sign, variable_return,
method_to_log, sign, variable_return, to_pyarrow
)


Expand Down Expand Up @@ -843,22 +843,20 @@ def get_data(new_t: str) -> SERIES:
"float": [(x.name, np.finfo(x.type).min, np.finfo(x.type).max) for x in t3],
}

# Convert selected columns to the best nullable dtype
data = self.dataset[self.branch._get_columns(columns)] # TODO: .convert_dtypes()
data = self.dataset[self.branch._get_columns(columns)]

Check notice on line 846 in atom/atom.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

Accessing a protected member of a class or a module

Access to a protected member _get_columns of a class

# Convert back since convert_dtypes doesn't work properly for pyarrow dtypes
data = data.astype({n: to_pyarrow(c, inverse=True) for n, c in data.items()})

# Convert to the best nullable dtype
data = data.convert_dtypes()

for name, column in data.items():
if pd.api.types.is_sparse(column):
old_t = column.dtype.subtype
else:
old_t = column.dtype

# TODO: Finish shrink for pyarrow
if "pyarrow" in old_t.name:
column = column.astype(column.to_numpy().dtype)

# TODO: Finish shrink for pyarrow
column = column.convert_dtypes()

if old_t.name.startswith("string"):
if str2cat and column.nunique() <= int(len(column) * 0.3):
self.branch._data[name] = get_data("category")

Check notice on line 862 in atom/atom.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

Accessing a protected member of a class or a module

Access to a protected member _data of a class
Expand Down Expand Up @@ -886,21 +884,16 @@ def get_data(new_t: str) -> SERIES:
get_data(r[0]) for r in t if r[1] <= column.min() and r[2] >= column.max()
)

# TODO: Finish shrink for pyarrow
from pandas.core.dtypes.cast import convert_dtypes
print(self.dtypes)
self.branch.dataset = self.branch.dataset.astype(
{
name: convert_dtypes(column, dtype_backend="pyarrow")
for name, column in data.items()
}
)
if self.engine["data"] == "pyarrow":
self.branch.dataset = self.branch.dataset.astype(
{name: to_pyarrow(col) for name, col in self.branch._data.items()}

Check notice on line 889 in atom/atom.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

Accessing a protected member of a class or a module

Access to a protected member _data of a class
)

self.log("The column dtypes are successfully converted.", 1)

@composed(crash, method_to_log)
def stats(self, _vb: INT = -2, /):
"""Print basic information about the dataset.
"""Display basic information about the dataset.
Parameters
----------
Expand Down
2 changes: 1 addition & 1 deletion atom/basemodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ def _gpu(self) -> bool:
def _est_class(self) -> Predictor:
"""Return the estimator's class (not instance)."""
try:
module = import_module(f"{self.engine['models']}.{self._module}")
module = import_module(f"{self.engine['estimator']}.{self._module}")
cls = self._estimators.get(self.goal, self._estimators.get("reg"))
except (ModuleNotFoundError, AttributeError):
if "sklearn" in self.supports_engines:
Expand Down
4 changes: 2 additions & 2 deletions atom/basetransformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def engine(self, value: dict | None):
elif "data" not in value and "estimator" not in value:
raise ValueError(
f"Invalid value for the engine parameter, got {value}. "
"The value should be a dict with keys 'data' and/or 'models'."
"The value should be a dict with keys 'data' and/or 'estimator'."
)

if data := value.get("data"):
Expand Down Expand Up @@ -397,7 +397,7 @@ def _get_est_class(self, name: str, module: str) -> Predictor:
"""
try:
return getattr(import_module(f"{self.engine['models']}.{module}"), name)
return getattr(import_module(f"{self.engine['estimator']}.{module}"), name)
except (ModuleNotFoundError, AttributeError):
return getattr(import_module(f"sklearn.{module}"), name)

Expand Down
9 changes: 5 additions & 4 deletions atom/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2983,10 +2983,11 @@ def _get_distributions(self) -> CustomDict:
solver=Cat(["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]),
)

if self.engine["estimator"] == "sklearnex":
dist.pop("solver") # Only supports 'auto'
elif self.engine["estimator"] == "cuml":
dist["solver"] = Cat(["eig", "svd", "cd"])
if self.goal == "reg":
if self.engine["estimator"] == "sklearnex":
dist.pop("solver") # Only supports 'auto'
elif self.engine["estimator"] == "cuml":
dist["solver"] = Cat(["eig", "svd", "cd"])

return dist

Expand Down
42 changes: 31 additions & 11 deletions atom/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@
from optuna.study import Study
from optuna.trial import FrozenTrial
from pandas.api.types import is_numeric_dtype
from pandas.core.dtypes.cast import convert_dtypes
from shap import Explainer, Explanation
from sklearn.metrics import (
confusion_matrix, get_scorer, get_scorer_names, make_scorer,
Expand Down Expand Up @@ -1744,6 +1743,32 @@ def n_cols(data: FEATURES | TARGET | None) -> int:
return array.ndim # Can be 0 when input is a dict


def to_pyarrow(column: SERIES, inverse: bool = False) -> str:
    """Return the dtype name of a column with or without the pyarrow marker.

    Parameters
    ----------
    column: series
        Column whose dtype name is inspected. Only `column.dtype.name`
        is read.

    inverse: bool, default=False
        If False, return the pyarrow-backed dtype name (the name is
        left untouched when it already ends with `[pyarrow]`). If True,
        return the name with a trailing `[pyarrow]` removed (the name is
        left untouched when it carries no such suffix).

    Returns
    -------
    str
        Name of the converted dtype.

    """
    suffix = "[pyarrow]"
    name = column.dtype.name
    is_arrow = name.endswith(suffix)

    if inverse:
        # Drop the backend marker only when it's actually present
        return name[: -len(suffix)] if is_arrow else name

    return name if is_arrow else f"{name}{suffix}"


def to_df(
data: FEATURES | None,
index: SEQUENCE | None = None,
Expand Down Expand Up @@ -1791,13 +1816,8 @@ def to_df(
if dtype is not None:
data = data.astype(dtype)

if os.environ.get("ATOM_DATA_ENGINE") == "pyarrow" and not is_sparse(data):
data = data.astype(
{
name: convert_dtypes(column, dtype_backend="pyarrow")
for name, column in data.items()
}
)
if os.environ.get("ATOM_DATA_ENGINE") == "pyarrow":
data = data.astype({name: to_pyarrow(col) for name, col in data.items()})

return data

Expand Down Expand Up @@ -1844,8 +1864,8 @@ def to_series(
dtype=dtype,
)

if os.environ.get("ATOM_DATA_ENGINE") == "pyarrow" and not is_sparse(data):
data = data.astype(convert_dtypes(data, dtype_backend="pyarrow"))
if os.environ.get("ATOM_DATA_ENGINE") == "pyarrow":
data = data.astype(to_pyarrow(data))

return data

Expand Down Expand Up @@ -2050,7 +2070,7 @@ def get_custom_scorer(metric: str | Callable | Scorer) -> Scorer:
scorer = make_scorer(score_func=metric)

# If no name was assigned, use the name of the function
if not hasattr(scorer, name):
if not hasattr(scorer, "name"):
scorer.name = scorer._score_func.__name__

Check notice on line 2074 in atom/utils.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

Accessing a protected member of a class or a module

Access to a protected member _score_func of a class

return scorer
Expand Down
18 changes: 15 additions & 3 deletions docs_sources/user_guide/accelerating.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ its functionalities [here](https://pandas.pydata.org/docs/user_guide/pyarrow.htm

!!! warning
The pyarrow backend doesn't work for [sparse datasets][]. If the
dataset has any sparse columns, the type conversion is skipped silently.
dataset has any sparse columns, an exception is raised.

[modin](https://modin.readthedocs.io/en/stable/), a multi-threading, drop-in replacement for pandas, that uses Ray as backend.

Expand All @@ -18,8 +18,20 @@ its functionalities [here](https://pandas.pydata.org/docs/user_guide/pyarrow.htm

## Estimator acceleration

Only transformers and predictors are converted to the
Metrics are not accelerated, to use a metric from cuML, use atom.rtun(metric=cuml_accuracy)...
Only transformers and predictors are converted to the requested engine. Metrics
are not accelerated. To use a metric from cuML, pass it directly to the
[`run`][atomclassifier-run] method:

```python
from atom import ATOMClassifier
from cuml.metrics import accuracy_score
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=100, random_state=1)

atom = ATOMClassifier(X, y, engine={"estimator": "cuml"}, verbose=2)
atom.run("LR", metric=accuracy_score)
```


!!! warning
Expand Down
49 changes: 30 additions & 19 deletions tests/test_atom.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,66 +375,77 @@ def test_save_data():

def test_shrink_dtypes_excluded():
"""Assert that some dtypes are excluded from changing."""
atom = ATOMClassifier(X10_str2, y10, random_state=1)
assert atom.dtypes[3].name == "bool"
X = X_bin.copy()
X["date"] = pd.date_range(start="1/1/2018", periods=len(X))

atom = ATOMClassifier(X, y_bin, random_state=1)
assert atom.dtypes[-2].name == "datetime64[ns]"
atom.shrink()
assert atom.dtypes[3].name == "bool"
assert atom.dtypes[-2].name == "datetime64[ns]" # Unchanged


def test_shrink_obj2cat():
"""Assert that the obj2cat parameter works as intended."""
def test_shrink_str2cat():
"""Assert that the str2cat parameter works as intended."""
atom = ATOMClassifier(X10_str2, y10, random_state=1)
atom.shrink(obj2cat=False)
assert atom.dtypes[2].name == "object"
atom.shrink(str2cat=False)
assert atom.dtypes[2].name == "string"

atom.shrink()
atom.shrink(str2cat=True)
assert atom.dtypes[2].name == "category"


def test_shrink_int2uint():
"""Assert that the int2uint parameter works as intended."""
atom = ATOMClassifier(X10_str2, y10, random_state=1)
assert atom.dtypes[0].name == "int64"
atom.shrink()
assert atom.dtypes[0].name == "int8"

assert atom.dtypes[0].name == "int8"
atom.shrink(int2uint=False)
assert atom.dtypes[0].name == "Int8"

atom.shrink(int2uint=True)
assert atom.dtypes[0].name == "uint8"
assert atom.dtypes[0].name == "UInt8"


def test_shrink_sparse_arrays():
"""Assert that sparse arrays are also transformed."""
atom = ATOMClassifier(X_sparse, y10, random_state=1)
assert atom.dtypes[0].name == "Sparse[int64, 0]"
atom.shrink()
assert atom.dtypes[0].name == "Sparse[int8, 0]"
assert atom.dtypes[0].name == "Sparse[Int8, 0]"


def test_shrink_dtypes_unchanged():
"""Assert that optimal dtypes are left unchanged."""
atom = ATOMClassifier(X_bin.astype("float32"), y_bin, random_state=1)
assert atom.dtypes[3].name == "float32"
atom = ATOMClassifier(X_bin.astype("Float32"), y_bin, random_state=1)
assert atom.dtypes[3].name == "Float32"
atom.shrink()
assert atom.dtypes[3].name == "float32"
assert atom.dtypes[3].name == "Float32"


def test_shrink_dense2sparse():
"""Assert that the dataset can be converted to sparse."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
assert atom.dtypes[0].name == "float64"
atom.shrink(dense2sparse=True)
assert atom.dtypes[0].name.startswith("Sparse[float32")
assert atom.dtypes[0].name.startswith("Sparse[Float32")


def test_shrink_pyarrow():
    """Assert that shrink downcasts columns under the pyarrow data backend."""
    atom = ATOMClassifier(
        X_bin,
        y_bin,
        engine={"data": "pyarrow"},
        random_state=1,
    )
    dtype_before = atom.dtypes[0].name
    assert dtype_before == "double[pyarrow]"

    atom.shrink()
    dtype_after = atom.dtypes[0].name
    assert dtype_after == "float[pyarrow]"


def test_shrink_exclude_columns():
"""Assert that columns can be excluded."""
atom = ATOMClassifier(X_bin, y_bin, random_state=1)
assert atom.dtypes[0].name == "float64"
assert atom.dtypes[-1].name != "int8"
assert atom.dtypes[-1].name != "Int8"
atom.shrink(columns=-1)
assert atom.dtypes[0].name == "float64"
assert atom.dtypes[-1].name == "int8"
assert atom.dtypes[-1].name == "Int8"


def test_stats_mixed_sparse_dense():
Expand Down
4 changes: 2 additions & 2 deletions tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def test_models_sklearnex_regression():
)


@patch.dict("sys.modules", {"cuml": MagicMock(spec=["__spec__"])})
@patch.dict("sys.modules", {"cuml": MagicMock(spec=["__spec__", "internals"])})
def test_models_cuml_classification():
"""Assert that all classification models can be called with cuml."""
atom = ATOMClassifier(X_bin, y_bin, engine={"estimator": "cuml"}, random_state=1)
Expand All @@ -149,7 +149,7 @@ def test_models_cuml_classification():
)


@patch.dict("sys.modules", {"cuml": MagicMock(spec=["__spec__"])})
@patch.dict("sys.modules", {"cuml": MagicMock(spec=["__spec__", "internals"])})
def test_models_cuml_regression():
"""Assert that all regression models can be called with cuml."""
atom = ATOMRegressor(X_reg, y_reg, engine={"estimator": "cuml"}, random_state=1)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ def test_hashing():
assert "hash1" in X


@patch.dict("sys.modules", {"cuml": MagicMock(spec=["__spec__"])})
@patch.dict("sys.modules", {"cuml": MagicMock(spec=["__spec__", "internals"])})
@patch.dict("sys.modules", {"cuml.feature_extraction.text": MagicMock()})
def test_gpu():
"""Assert that the gpu implementation calls the get method of matrix."""
Expand Down

0 comments on commit b38e87b

Please sign in to comment.