From d2f509eeea06c0b6dd344495cd9d4065ae60ae2d Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <danteg@nvidia.com>
Date: Thu, 8 Feb 2024 17:59:13 -0600
Subject: [PATCH] Pandas 2.x support (#5758)

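Updates cuML to work with cuDF's support for pandas 2.x. The changes are:
squeezing `y_arr` before building a `cudf.Series` in the benchmark data
generator, renaming the `count` column produced by
`value_counts().reset_index()` instead of the old `index`/`doc_id` columns,
wrapping `cats[0]` in a list when building an `Index` in the encoder's
`inverse_transform`, and replacing `Series.append` with `cudf.concat` in the
train/test split tests.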

Fixes #5759.

Authors:
  - Dante Gama Dessavre (https://github.com/dantegd)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Divye Gala (https://github.com/divyegala)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cuml/pull/5758
---
 python/cuml/benchmark/datagen.py           | 2 +-
 python/cuml/common/sparsefuncs.py          | 3 ++-
 python/cuml/preprocessing/encoders.py      | 2 +-
 python/cuml/tests/test_train_test_split.py | 4 ++--
 4 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/python/cuml/benchmark/datagen.py b/python/cuml/benchmark/datagen.py
index 2f49ca292e..94acbb5c17 100644
--- a/python/cuml/benchmark/datagen.py
+++ b/python/cuml/benchmark/datagen.py
@@ -73,7 +73,7 @@ def _gen_data_regression(
     )
 
     X_df = cudf.DataFrame(X_arr)
-    y_df = cudf.Series(y_arr)
+    y_df = cudf.Series(np.squeeze(y_arr))
 
     return X_df, y_df
 
diff --git a/python/cuml/common/sparsefuncs.py b/python/cuml/common/sparsefuncs.py
index 4648163dc6..f50f70b550 100644
--- a/python/cuml/common/sparsefuncs.py
+++ b/python/cuml/common/sparsefuncs.py
@@ -160,8 +160,9 @@ def create_csr_matrix_from_count_df(
 
     doc_token_counts = count_df["doc_id"].value_counts().reset_index()
     del count_df
+
     doc_token_counts = doc_token_counts.rename(
-        {"doc_id": "token_counts", "index": "doc_id"}, axis=1
+        {"count": "token_counts"}, axis=1
     ).sort_values(by="doc_id")
 
     token_counts = _insert_zeros(
diff --git a/python/cuml/preprocessing/encoders.py b/python/cuml/preprocessing/encoders.py
index cc27320490..46500b766a 100644
--- a/python/cuml/preprocessing/encoders.py
+++ b/python/cuml/preprocessing/encoders.py
@@ -498,7 +498,7 @@ def inverse_transform(self, X):
                 dropped_class_idx = Series(self.drop_idx_[feature])
                 dropped_class_mask = Series(cats).isin(cats[dropped_class_idx])
                 if len(cats) == 1:
-                    inv = Series(Index(cats[0]).repeat(X.shape[0]))
+                    inv = Series(Index([cats[0]]).repeat(X.shape[0]))
                     result[feature] = inv
                     continue
                 cats = cats[~dropped_class_mask]
diff --git a/python/cuml/tests/test_train_test_split.py b/python/cuml/tests/test_train_test_split.py
index b6dd4d7847..e0f450176b 100644
--- a/python/cuml/tests/test_train_test_split.py
+++ b/python/cuml/tests/test_train_test_split.py
@@ -48,7 +48,7 @@ def test_split_dataframe(train_size, shuffle):
     assert all(X_test.index.to_pandas() == y_test.index.to_pandas())
 
     X_reconstructed = cudf.concat([X_train, X_test]).sort_values(by=["x"])
-    y_reconstructed = y_train.append(y_test).sort_values()
+    y_reconstructed = cudf.concat([y_train, y_test]).sort_values()
 
     assert all(X_reconstructed.reset_index(drop=True) == X)
     out = y_reconstructed.reset_index(drop=True).values_host == y.values_host
@@ -96,7 +96,7 @@ def test_split_column():
     )
 
     X_reconstructed = cudf.concat([X_train, X_test]).sort_values(by=["x"])
-    y_reconstructed = y_train.append(y_test).sort_values()
+    y_reconstructed = cudf.concat([y_train, y_test]).sort_values()
 
     assert all(
         data