From 65a02f6ef223390f4424767376d902f18617c1be Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 3 Oct 2024 15:47:24 -0500 Subject: [PATCH] Fix train_test_split for string columns (#6088) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #5834 Before the fix, this was an issue: ```python import cudf from cuml.model_selection import train_test_split SEED = 1 df_a = cudf.DataFrame({'a': [0, 1, 2, 3, 4], 'b': [5, 6, 7, 8, 9], 'c': ['High', 'Low', 'High', 'High', 'Low'] }) target = cudf.Series([1, 1, 1, 0, 0]) # breakpoint() all_numeric = all(cudf.api.types.is_numeric_dtype(df_a[col]) for col in df_a.columns) print(all_numeric) tr, te, ytr, yte = train_test_split(X=df_a, y=target, test_size=0.3, random_state=SEED, stratify=target) print(tr) ``` would result in multiple errors of the type ```python File "/home/coder/.conda/envs/rapids/lib/python3.12/site-packages/cudf/utils/performance_tracking.py", line 51, in wrapper return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/home/coder/.conda/envs/rapids/lib/python3.12/site-packages/cudf/core/frame.py", line 358, in _get_columns_by_label return self._from_data_like_self(self._data.select_by_label(labels)) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/coder/.conda/envs/rapids/lib/python3.12/site-packages/cudf/core/column_accessor.py", line 401, in select_by_label return self._select_by_label_grouped(key) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/coder/.conda/envs/rapids/lib/python3.12/site-packages/cudf/core/column_accessor.py", line 563, in _select_by_label_grouped result = self._grouped_data[key] ~~~~~~~~~~~~~~~~~~^^^^^ KeyError: '__cuda_array_interface__' ``` After the fix, train_test_split works for cuDF string columns: ```python (rapids) coder ➜ ~ $ python cudfstr.py a b c 3 3 8 High 4 4 9 Low 2 2 7 High 1 1 6 Low ``` Need to add a test and probably do a small fix for cudf.pandas. 
There is some redundancy in the code, which can be cleaned up as a follow-up for a later release, in order to get this in for 24.10. --- python/cuml/cuml/cluster/kmeans.pyx | 11 ++- python/cuml/cuml/internals/array.py | 15 +-- python/cuml/cuml/model_selection/_split.py | 106 +++++++++++---------- 3 files changed, 74 insertions(+), 58 deletions(-) diff --git a/python/cuml/cuml/cluster/kmeans.pyx b/python/cuml/cuml/cluster/kmeans.pyx index e8ab51e4dd..3d6be3abf2 100644 --- a/python/cuml/cuml/cluster/kmeans.pyx +++ b/python/cuml/cuml/cluster/kmeans.pyx @@ -20,6 +20,7 @@ from cuml.internals.safe_imports import cpu_only_import np = cpu_only_import('numpy') from cuml.internals.safe_imports import gpu_only_import rmm = gpu_only_import('rmm') +from cuml.internals.safe_imports import safe_import_from, return_false import typing IF GPUBUILD == 1: @@ -46,7 +47,10 @@ from cuml.common import input_to_cuml_array from cuml.internals.api_decorators import device_interop_preparation from cuml.internals.api_decorators import enable_device_interop -from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +# from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +_openmp_effective_n_threads = safe_import_from( + "sklearn.utils._openmp_helpers", "_openmp_effective_n_threads", alt=return_false +) class KMeans(UniversalBase, @@ -235,7 +239,10 @@ class KMeans(UniversalBase, self.cluster_centers_ = None # For sklearn interoperability - self._n_threads = _openmp_effective_n_threads() + if _openmp_effective_n_threads(): + self._n_threads = _openmp_effective_n_threads() + else: + self._n_threads = 1 # cuPy does not allow comparing with string. 
See issue #2372 init_str = init if isinstance(init, str) else None diff --git a/python/cuml/cuml/internals/array.py b/python/cuml/cuml/internals/array.py index e61d84ab83..c30d609563 100644 --- a/python/cuml/cuml/internals/array.py +++ b/python/cuml/cuml/internals/array.py @@ -1251,13 +1251,14 @@ def array_to_memory_order(arr, default="C"): return arr.order except AttributeError: pass - try: - array_interface = arr.__cuda_array_interface__ - except AttributeError: - try: - array_interface = arr.__array_interface__ - except AttributeError: - return array_to_memory_order(CumlArray.from_input(arr, order="K")) + array_interface = getattr( + arr, + "__cuda_array_interface__", + getattr(arr, "__array_interface__", False), + ) + if not array_interface: + return array_to_memory_order(CumlArray.from_input(arr, order="K")) + strides = array_interface.get("strides", None) if strides is None: try: diff --git a/python/cuml/cuml/model_selection/_split.py b/python/cuml/cuml/model_selection/_split.py index 0727f82c82..227f0eb297 100644 --- a/python/cuml/cuml/model_selection/_split.py +++ b/python/cuml/cuml/model_selection/_split.py @@ -265,8 +265,18 @@ def train_test_split( string" ) - x_order = array_to_memory_order(X) - X_arr, X_row, *_ = input_to_cuml_array(X, order=x_order) + all_numeric = True + if isinstance(X, cudf.DataFrame): + all_numeric = all( + cudf.api.types.is_numeric_dtype(X[col]) for col in X.columns + ) + + if all_numeric: + x_order = array_to_memory_order(X) + X_arr, X_row, *_ = input_to_cuml_array(X, order=x_order) + else: + x_order = "F" + X_arr, X_row = X, X.shape[0] if y is not None: y_order = array_to_memory_order(y) y_arr, y_row, *_ = input_to_cuml_array(y, order=y_order) @@ -363,55 +373,53 @@ def train_test_split( train_indices = range(0, train_size) test_indices = range(-1 * test_size, 0) - # Gather from indices - X_train = X_arr[train_indices] - X_test = X_arr[test_indices] - if y is not None: - y_train = y_arr[train_indices] - y_test = 
y_arr[test_indices] - - # Coerce output to original input type - if ty := determine_df_obj_type(X): - x_type = ty - else: - x_type = determine_array_type(X) - - if ty := determine_df_obj_type(y): - y_type = ty - else: - y_type = determine_array_type(y) - - if x_type in ("series", "dataframe"): - X_train = output_to_df_obj_like(X_train, X, x_type) - X_test = output_to_df_obj_like(X_test, X, x_type) - - if determine_array_type(X.index) == "pandas": - if isinstance(train_indices, cp.ndarray): - train_indices = train_indices.get() - if isinstance(test_indices, cp.ndarray): - test_indices = test_indices.get() + if all_numeric: + # Gather from indices + X_train = X_arr[train_indices] + X_test = X_arr[test_indices] + if y is not None: + y_train = y_arr[train_indices] + y_test = y_arr[test_indices] + + # Coerce output to original input type + x_type = determine_df_obj_type(X) or determine_array_type(X) + if y is not None: + y_type = determine_df_obj_type(y) or determine_array_type(y) + + def _process_df_objs( + df, df_type, df_train, df_test, train_indices, test_indices + ): + if df_type in {"series", "dataframe"}: + df_train = output_to_df_obj_like(df_train, df, df_type) + df_test = output_to_df_obj_like(df_test, df, df_type) + + if determine_array_type(df.index) == "pandas": + if isinstance(train_indices, cp.ndarray): + train_indices = train_indices.get() + if isinstance(test_indices, cp.ndarray): + test_indices = test_indices.get() + + df_train.index = df.index[train_indices] + df_test.index = df.index[test_indices] + else: + df_train = df_train.to_output(df_type) + df_test = df_test.to_output(df_type) + return df_train, df_test + + X_train, X_test = _process_df_objs( + X, x_type, X_train, X_test, train_indices, test_indices + ) + if y is not None: + y_train, y_test = _process_df_objs( + y, y_type, y_train, y_test, train_indices, test_indices + ) - X_train.index = X.index[train_indices] - X_test.index = X.index[test_indices] else: - X_train = X_train.to_output(x_type) - 
X_test = X_test.to_output(x_type) - - if y_type in ("series", "dataframe"): - y_train = output_to_df_obj_like(y_train, y, y_type) - y_test = output_to_df_obj_like(y_test, y, y_type) - - if determine_array_type(y.index) == "pandas": - if isinstance(train_indices, cp.ndarray): - train_indices = train_indices.get() - if isinstance(test_indices, cp.ndarray): - test_indices = test_indices.get() - - y_train.index = y.index[train_indices] - y_test.index = y.index[test_indices] - elif y_type is not None: - y_train = y_train.to_output(y_type) - y_test = y_test.to_output(y_type) + X_train = X_arr.iloc[train_indices] + X_test = X_arr.iloc[test_indices] + if y is not None: + y_train = y_arr[train_indices] + y_test = y_arr[test_indices] if y is not None: return X_train, X_test, y_train, y_test