From 65a02f6ef223390f4424767376d902f18617c1be Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <danteg@nvidia.com>
Date: Thu, 3 Oct 2024 15:47:24 -0500
Subject: [PATCH] Fix train_test_split for string columns (#6088)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes #5834

Before the fix, this was an issue:

```python
import cudf
from cuml.model_selection import train_test_split

SEED = 1
df_a = cudf.DataFrame({'a': [0, 1, 2, 3, 4],
                    'b': [5, 6, 7, 8, 9],
                    'c': ['High', 'Low', 'High', 'High', 'Low']
                   })
target = cudf.Series([1, 1, 1, 0, 0])

# breakpoint()
all_numeric = all(cudf.api.types.is_numeric_dtype(df_a[col]) for col in df_a.columns)
print(all_numeric)
tr, te, ytr, yte = train_test_split(X=df_a, y=target, test_size=0.3, random_state=SEED, stratify=target)

print(tr)
```

would result in multiple errors of the type

```python
File
"/home/coder/.conda/envs/rapids/lib/python3.12/site-packages/cudf/utils/performance_tracking.py",
line 51, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
File
"/home/coder/.conda/envs/rapids/lib/python3.12/site-packages/cudf/core/frame.py",
line 358, in _get_columns_by_label
    return self._from_data_like_self(self._data.select_by_label(labels))
                                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File
"/home/coder/.conda/envs/rapids/lib/python3.12/site-packages/cudf/core/column_accessor.py",
line 401, in select_by_label
    return self._select_by_label_grouped(key)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File
"/home/coder/.conda/envs/rapids/lib/python3.12/site-packages/cudf/core/column_accessor.py",
line 563, in _select_by_label_grouped
    result = self._grouped_data[key]
             ~~~~~~~~~~~~~~~~~~^^^^^
KeyError: '__cuda_array_interface__'
```

After the fix, train_test_split works for cuDF string columns:

```python
(rapids) coder ➜ ~ $ python cudfstr.py
   a  b     c
3  3  8  High
4  4  9   Low
2  2  7  High
1  1  6   Low
```

Need to add a test and probably do a small fix for cudf.pandas.  There is some redundancy in the code, which can be cleaned up as a follow-up for a later release, in order to get this in for 24.10.
---
 python/cuml/cuml/cluster/kmeans.pyx        |  11 ++-
 python/cuml/cuml/internals/array.py        |  15 +--
 python/cuml/cuml/model_selection/_split.py | 106 +++++++++++----------
 3 files changed, 74 insertions(+), 58 deletions(-)

diff --git a/python/cuml/cuml/cluster/kmeans.pyx b/python/cuml/cuml/cluster/kmeans.pyx
index e8ab51e4dd..3d6be3abf2 100644
--- a/python/cuml/cuml/cluster/kmeans.pyx
+++ b/python/cuml/cuml/cluster/kmeans.pyx
@@ -20,6 +20,7 @@ from cuml.internals.safe_imports import cpu_only_import
 np = cpu_only_import('numpy')
 from cuml.internals.safe_imports import gpu_only_import
 rmm = gpu_only_import('rmm')
+from cuml.internals.safe_imports import safe_import_from, return_false
 import typing
 
 IF GPUBUILD == 1:
@@ -46,7 +47,10 @@ from cuml.common import input_to_cuml_array
 from cuml.internals.api_decorators import device_interop_preparation
 from cuml.internals.api_decorators import enable_device_interop
 
-from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
+# from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
+_openmp_effective_n_threads = safe_import_from(
+    "sklearn.utils._openmp_helpers", "_openmp_effective_n_threads", alt=return_false
+)
 
 
 class KMeans(UniversalBase,
@@ -235,7 +239,10 @@ class KMeans(UniversalBase,
         self.cluster_centers_ = None
 
         # For sklearn interoperability
-        self._n_threads = _openmp_effective_n_threads()
+        if _openmp_effective_n_threads():
+            self._n_threads = _openmp_effective_n_threads()
+        else:
+            self._n_threads = 1
 
         # cuPy does not allow comparing with string. See issue #2372
         init_str = init if isinstance(init, str) else None
diff --git a/python/cuml/cuml/internals/array.py b/python/cuml/cuml/internals/array.py
index e61d84ab83..c30d609563 100644
--- a/python/cuml/cuml/internals/array.py
+++ b/python/cuml/cuml/internals/array.py
@@ -1251,13 +1251,14 @@ def array_to_memory_order(arr, default="C"):
         return arr.order
     except AttributeError:
         pass
-    try:
-        array_interface = arr.__cuda_array_interface__
-    except AttributeError:
-        try:
-            array_interface = arr.__array_interface__
-        except AttributeError:
-            return array_to_memory_order(CumlArray.from_input(arr, order="K"))
+    array_interface = getattr(
+        arr,
+        "__cuda_array_interface__",
+        getattr(arr, "__array_interface__", False),
+    )
+    if not array_interface:
+        return array_to_memory_order(CumlArray.from_input(arr, order="K"))
+
     strides = array_interface.get("strides", None)
     if strides is None:
         try:
diff --git a/python/cuml/cuml/model_selection/_split.py b/python/cuml/cuml/model_selection/_split.py
index 0727f82c82..227f0eb297 100644
--- a/python/cuml/cuml/model_selection/_split.py
+++ b/python/cuml/cuml/model_selection/_split.py
@@ -265,8 +265,18 @@ def train_test_split(
                              string"
             )
 
-    x_order = array_to_memory_order(X)
-    X_arr, X_row, *_ = input_to_cuml_array(X, order=x_order)
+    all_numeric = True
+    if isinstance(X, cudf.DataFrame):
+        all_numeric = all(
+            cudf.api.types.is_numeric_dtype(X[col]) for col in X.columns
+        )
+
+    if all_numeric:
+        x_order = array_to_memory_order(X)
+        X_arr, X_row, *_ = input_to_cuml_array(X, order=x_order)
+    else:
+        x_order = "F"
+        X_arr, X_row = X, X.shape[0]
     if y is not None:
         y_order = array_to_memory_order(y)
         y_arr, y_row, *_ = input_to_cuml_array(y, order=y_order)
@@ -363,55 +373,53 @@ def train_test_split(
         train_indices = range(0, train_size)
         test_indices = range(-1 * test_size, 0)
 
-    # Gather from indices
-    X_train = X_arr[train_indices]
-    X_test = X_arr[test_indices]
-    if y is not None:
-        y_train = y_arr[train_indices]
-        y_test = y_arr[test_indices]
-
-    # Coerce output to original input type
-    if ty := determine_df_obj_type(X):
-        x_type = ty
-    else:
-        x_type = determine_array_type(X)
-
-    if ty := determine_df_obj_type(y):
-        y_type = ty
-    else:
-        y_type = determine_array_type(y)
-
-    if x_type in ("series", "dataframe"):
-        X_train = output_to_df_obj_like(X_train, X, x_type)
-        X_test = output_to_df_obj_like(X_test, X, x_type)
-
-        if determine_array_type(X.index) == "pandas":
-            if isinstance(train_indices, cp.ndarray):
-                train_indices = train_indices.get()
-            if isinstance(test_indices, cp.ndarray):
-                test_indices = test_indices.get()
+    if all_numeric:
+        # Gather from indices
+        X_train = X_arr[train_indices]
+        X_test = X_arr[test_indices]
+        if y is not None:
+            y_train = y_arr[train_indices]
+            y_test = y_arr[test_indices]
+
+        # Coerce output to original input type
+        x_type = determine_df_obj_type(X) or determine_array_type(X)
+        if y is not None:
+            y_type = determine_df_obj_type(y) or determine_array_type(y)
+
+        def _process_df_objs(
+            df, df_type, df_train, df_test, train_indices, test_indices
+        ):
+            if df_type in {"series", "dataframe"}:
+                df_train = output_to_df_obj_like(df_train, df, df_type)
+                df_test = output_to_df_obj_like(df_test, df, df_type)
+
+                if determine_array_type(df.index) == "pandas":
+                    if isinstance(train_indices, cp.ndarray):
+                        train_indices = train_indices.get()
+                    if isinstance(test_indices, cp.ndarray):
+                        test_indices = test_indices.get()
+
+                df_train.index = df.index[train_indices]
+                df_test.index = df.index[test_indices]
+            else:
+                df_train = df_train.to_output(df_type)
+                df_test = df_test.to_output(df_type)
+            return df_train, df_test
+
+        X_train, X_test = _process_df_objs(
+            X, x_type, X_train, X_test, train_indices, test_indices
+        )
+        if y is not None:
+            y_train, y_test = _process_df_objs(
+                y, y_type, y_train, y_test, train_indices, test_indices
+            )
 
-        X_train.index = X.index[train_indices]
-        X_test.index = X.index[test_indices]
     else:
-        X_train = X_train.to_output(x_type)
-        X_test = X_test.to_output(x_type)
-
-    if y_type in ("series", "dataframe"):
-        y_train = output_to_df_obj_like(y_train, y, y_type)
-        y_test = output_to_df_obj_like(y_test, y, y_type)
-
-        if determine_array_type(y.index) == "pandas":
-            if isinstance(train_indices, cp.ndarray):
-                train_indices = train_indices.get()
-            if isinstance(test_indices, cp.ndarray):
-                test_indices = test_indices.get()
-
-        y_train.index = y.index[train_indices]
-        y_test.index = y.index[test_indices]
-    elif y_type is not None:
-        y_train = y_train.to_output(y_type)
-        y_test = y_test.to_output(y_type)
+        X_train = X_arr.iloc[train_indices]
+        X_test = X_arr.iloc[test_indices]
+        if y is not None:
+            y_train = y_arr[train_indices]
+            y_test = y_arr[test_indices]
 
     if y is not None:
         return X_train, X_test, y_train, y_test