Skip to content

Commit

Permalink
FIX train_test_split for string columns
Browse files: browse the repository at this point in the history
  • Loading branch information
dantegd committed Sep 30, 2024
1 parent 21f9882 commit 322018b
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 52 deletions.
15 changes: 8 additions & 7 deletions python/cuml/cuml/internals/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -1251,13 +1251,14 @@ def array_to_memory_order(arr, default="C"):
return arr.order
except AttributeError:
pass
try:
array_interface = arr.__cuda_array_interface__
except AttributeError:
try:
array_interface = arr.__array_interface__
except AttributeError:
return array_to_memory_order(CumlArray.from_input(arr, order="K"))
array_interface = getattr(
arr,
"__cuda_array_interface__",
getattr(arr, "__array_interface__", False),
)
if not array_interface:
return array_to_memory_order(CumlArray.from_input(arr, order="K"))

strides = array_interface.get("strides", None)
if strides is None:
try:
Expand Down
108 changes: 63 additions & 45 deletions python/cuml/cuml/model_selection/_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,8 +265,18 @@ def train_test_split(
string"
)

x_order = array_to_memory_order(X)
X_arr, X_row, *_ = input_to_cuml_array(X, order=x_order)
all_numeric = True
if isinstance(X, cudf.DataFrame):
all_numeric = all(
cudf.api.types.is_numeric_dtype(X[col]) for col in X.columns
)

if all_numeric:
x_order = array_to_memory_order(X)
X_arr, X_row, *_ = input_to_cuml_array(X, order=x_order)
else:
x_order = "F"
X_arr, X_row = X, X.shape[0]
if y is not None:
y_order = array_to_memory_order(y)
y_arr, y_row, *_ = input_to_cuml_array(y, order=y_order)
Expand Down Expand Up @@ -363,55 +373,63 @@ def train_test_split(
train_indices = range(0, train_size)
test_indices = range(-1 * test_size, 0)

# Gather from indices
X_train = X_arr[train_indices]
X_test = X_arr[test_indices]
if y is not None:
y_train = y_arr[train_indices]
y_test = y_arr[test_indices]
if all_numeric:
# Gather from indices
X_train = X_arr[train_indices]
X_test = X_arr[test_indices]
if y is not None:
y_train = y_arr[train_indices]
y_test = y_arr[test_indices]

# Coerce output to original input type
if ty := determine_df_obj_type(X):
x_type = ty
else:
x_type = determine_array_type(X)

# Coerce output to original input type
if ty := determine_df_obj_type(X):
x_type = ty
else:
x_type = determine_array_type(X)
if ty := determine_df_obj_type(y):
y_type = ty
else:
y_type = determine_array_type(y)

if ty := determine_df_obj_type(y):
y_type = ty
else:
y_type = determine_array_type(y)
if x_type in ("series", "dataframe"):
X_train = output_to_df_obj_like(X_train, X, x_type)
X_test = output_to_df_obj_like(X_test, X, x_type)

if determine_array_type(X.index) == "pandas":
if isinstance(train_indices, cp.ndarray):
train_indices = train_indices.get()
if isinstance(test_indices, cp.ndarray):
test_indices = test_indices.get()

X_train.index = X.index[train_indices]
X_test.index = X.index[test_indices]
else:
X_train = X_train.to_output(x_type)
X_test = X_test.to_output(x_type)

if y_type in ("series", "dataframe"):
y_train = output_to_df_obj_like(y_train, y, y_type)
y_test = output_to_df_obj_like(y_test, y, y_type)

if x_type in ("series", "dataframe"):
X_train = output_to_df_obj_like(X_train, X, x_type)
X_test = output_to_df_obj_like(X_test, X, x_type)
if determine_array_type(y.index) == "pandas":
if isinstance(train_indices, cp.ndarray):
train_indices = train_indices.get()
if isinstance(test_indices, cp.ndarray):
test_indices = test_indices.get()

if determine_array_type(X.index) == "pandas":
if isinstance(train_indices, cp.ndarray):
train_indices = train_indices.get()
if isinstance(test_indices, cp.ndarray):
test_indices = test_indices.get()
y_train.index = y.index[train_indices]
y_test.index = y.index[test_indices]
elif y_type is not None:
y_train = y_train.to_output(y_type)
y_test = y_test.to_output(y_type)

X_train.index = X.index[train_indices]
X_test.index = X.index[test_indices]
else:
X_train = X_train.to_output(x_type)
X_test = X_test.to_output(x_type)

if y_type in ("series", "dataframe"):
y_train = output_to_df_obj_like(y_train, y, y_type)
y_test = output_to_df_obj_like(y_test, y, y_type)

if determine_array_type(y.index) == "pandas":
if isinstance(train_indices, cp.ndarray):
train_indices = train_indices.get()
if isinstance(test_indices, cp.ndarray):
test_indices = test_indices.get()

y_train.index = y.index[train_indices]
y_test.index = y.index[test_indices]
elif y_type is not None:
y_train = y_train.to_output(y_type)
y_test = y_test.to_output(y_type)
X_train = X_arr.iloc[train_indices]
X_test = X_arr.iloc[test_indices]
if y is not None:
y_train = y_arr[train_indices]
y_test = y_arr[test_indices]

if y is not None:
return X_train, X_test, y_train, y_test
Expand Down

0 comments on commit 322018b

Please sign in to comment.