From 8f0d7ed76335ec5ae9531be340354fbc3d199b21 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastian=20P=C3=B6lsterl?=
Date: Sat, 10 Jun 2023 15:15:44 +0200
Subject: [PATCH 1/2] Add support for pandas 2.0

Fixes #372
---
 .github/workflows/tests-workflow.yaml         |  6 ++---
 ci/appveyor/py310.ps1                         |  2 +-
 ci/appveyor/py311.ps1                         |  2 +-
 ...h => py310_pandas15_numpy121_sklearn12.sh} |  2 +-
 ...h => py311_pandas20_numpy123_sklearn12.sh} |  2 +-
 pyproject.toml                                |  6 +----
 sksurv/column.py                              |  4 ++--
 sksurv/datasets/base.py                       | 22 ++++++++++++-------
 sksurv/io/arffwrite.py                        |  2 +-
 sksurv/util.py                                |  2 +-
 tests/conftest.py                             |  2 +-
 tests/test_datasets.py                        | 15 ++++---------
 tests/test_io.py                              |  2 +-
 tests/test_preprocessing.py                   |  2 +-
 14 files changed, 33 insertions(+), 38 deletions(-)
 rename ci/deps/{py310_pandas14_numpy121_sklearn12.sh => py310_pandas15_numpy121_sklearn12.sh} (81%)
 rename ci/deps/{py311_pandas15_numpy123_sklearn12.sh => py311_pandas20_numpy123_sklearn12.sh} (81%)

diff --git a/.github/workflows/tests-workflow.yaml b/.github/workflows/tests-workflow.yaml
index 552aa756..2b0fc8a2 100644
--- a/.github/workflows/tests-workflow.yaml
+++ b/.github/workflows/tests-workflow.yaml
@@ -14,8 +14,8 @@ jobs:
         config:
           - py38_pandas10_numpy119_sklearn12
           - py39_pandas13_numpy121_sklearn12
-          - py310_pandas14_numpy121_sklearn12
-          - py311_pandas15_numpy123_sklearn12
+          - py310_pandas15_numpy121_sklearn12
+          - py311_pandas20_numpy123_sklearn12
         runner:
           - ubuntu-latest
           - macos-latest
@@ -101,7 +101,7 @@ jobs:
       - name: Set dependencies
        id: dependencies
        env:
-          DEPS_CONFIG: py310_pandas14_numpy121_sklearn12
+          DEPS_CONFIG: py310_pandas15_numpy121_sklearn12
        run: |
          source ci/deps/${DEPS_CONFIG}.sh
          echo "CI_PYTHON_VERSION=${CI_PYTHON_VERSION}" >> $GITHUB_ENV
diff --git a/ci/appveyor/py310.ps1 b/ci/appveyor/py310.ps1
index b3d87f28..ac407a6f 100644
--- a/ci/appveyor/py310.ps1
+++ b/ci/appveyor/py310.ps1
@@ -1,4 +1,4 @@
 $env:CI_PYTHON_VERSION="3.10.*"
-$env:CI_PANDAS_VERSION="1.4.*"
+$env:CI_PANDAS_VERSION="1.5.*"
 $env:CI_NUMPY_VERSION="1.21.*"
 $env:CI_SKLEARN_VERSION="1.2.*"
diff --git a/ci/appveyor/py311.ps1 b/ci/appveyor/py311.ps1
index b8e36bf5..d5afd224 100644
--- a/ci/appveyor/py311.ps1
+++ b/ci/appveyor/py311.ps1
@@ -1,4 +1,4 @@
 $env:CI_PYTHON_VERSION="3.11.*"
-$env:CI_PANDAS_VERSION="1.5.*"
+$env:CI_PANDAS_VERSION="2.0.*"
 $env:CI_NUMPY_VERSION="1.23.*"
 $env:CI_SKLEARN_VERSION="1.2.*"
diff --git a/ci/deps/py310_pandas14_numpy121_sklearn12.sh b/ci/deps/py310_pandas15_numpy121_sklearn12.sh
similarity index 81%
rename from ci/deps/py310_pandas14_numpy121_sklearn12.sh
rename to ci/deps/py310_pandas15_numpy121_sklearn12.sh
index 8ca2259a..0ef1d509 100644
--- a/ci/deps/py310_pandas14_numpy121_sklearn12.sh
+++ b/ci/deps/py310_pandas15_numpy121_sklearn12.sh
@@ -1,6 +1,6 @@
 # shellcheck shell=sh
 export CI_PYTHON_VERSION='3.10.*'
-export CI_PANDAS_VERSION='1.4.*'
+export CI_PANDAS_VERSION='1.5.*'
 export CI_NUMPY_VERSION='1.21.*'
 export CI_SKLEARN_VERSION='1.2.*'
 export CI_NO_SLOW=true
diff --git a/ci/deps/py311_pandas15_numpy123_sklearn12.sh b/ci/deps/py311_pandas20_numpy123_sklearn12.sh
similarity index 81%
rename from ci/deps/py311_pandas15_numpy123_sklearn12.sh
rename to ci/deps/py311_pandas20_numpy123_sklearn12.sh
index 673c48e9..9faf8edc 100644
--- a/ci/deps/py311_pandas15_numpy123_sklearn12.sh
+++ b/ci/deps/py311_pandas20_numpy123_sklearn12.sh
@@ -1,6 +1,6 @@
 # shellcheck shell=sh
 export CI_PYTHON_VERSION='3.11.*'
-export CI_PANDAS_VERSION='1.5.*'
+export CI_PANDAS_VERSION='2.0.*'
 export CI_NUMPY_VERSION='1.23.*'
 export CI_SKLEARN_VERSION='1.2.*'
 export CI_NO_SLOW=false
diff --git a/pyproject.toml b/pyproject.toml
index 50d3ab49..e0fc9ddd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -61,7 +61,7 @@ dependencies = [
     "numexpr",
     "numpy",
     "osqp !=0.6.0,!=0.6.1",
-    "pandas >=1.0.5,<2",
+    "pandas >=1.0.5",
     "scipy >=1.3.2",
     "scikit-learn >=1.2.0,<1.3",
 ]
@@ -136,10 +136,6 @@ filterwarnings = [
     "ignore:The distutils package is deprecated:DeprecationWarning",
     "ignore:Setuptools is replacing distutils",
     "ignore:distutils Version classes are deprecated.*:DeprecationWarning",
-    # check_less_precise has been deprecated in pandas 1.1.0
-    "ignore:The 'check_less_precise' keyword in testing\\.assert_\\*_equal is deprecated and will be removed in a future version.*:FutureWarning",
-    # iteritems has been deprecated in pandas 1.5.0
-    "ignore:iteritems is deprecated and will be removed in a future version. Use \\.items instead:FutureWarning",
     "ignore:Jupyter is migrating its paths to use standard platformdirs:DeprecationWarning",
 ]

diff --git a/sksurv/column.py b/sksurv/column.py
index caa09c22..1e298472 100644
--- a/sksurv/column.py
+++ b/sksurv/column.py
@@ -134,12 +134,12 @@ def _is_categorical_or_object(series):

     if columns is None:
         # for columns containing categories
-        columns_to_encode = {nam for nam, s in table.iteritems() if _is_categorical_or_object(s)}
+        columns_to_encode = {nam for nam, s in table.items() if _is_categorical_or_object(s)}
     else:
         columns_to_encode = set(columns)

     items = []
-    for name, series in table.iteritems():
+    for name, series in table.items():
         if name in columns_to_encode:
             series = _encode_categorical_series(series, **kwargs)
             if series is None:
diff --git a/sksurv/datasets/base.py b/sksurv/datasets/base.py
index d5bc7432..76e93061 100644
--- a/sksurv/datasets/base.py
+++ b/sksurv/datasets/base.py
@@ -3,6 +3,7 @@

 import numpy as np
 import pandas as pd
+from pandas.api.types import is_categorical_dtype

 from ..column import categorical_to_numeric, standardize
 from ..io import loadarff
@@ -98,6 +99,17 @@ def get_x_y(data_frame, attr_labels, pos_label=None, survival=True):
     return _get_x_y_other(data_frame, attr_labels)


+def _loadarff_with_index(filename):
+    dataset = loadarff(filename)
+    if "index" in dataset.columns:
+        if is_categorical_dtype(dataset["index"].dtype):
+            # concatenating categorical index may raise TypeError
+            # see https://github.com/pandas-dev/pandas/issues/14586
+            dataset["index"] = dataset["index"].astype(object)
+        dataset.set_index("index", inplace=True)
+    return dataset
+
+
 def load_arff_files_standardized(
     path_training,
     attr_labels,
@@ -154,10 +166,7 @@ def load_arff_files_standardized(
     y_test : None or pandas.DataFrame, shape = (n_train, n_labels)
         Dependent variables of testing data if `path_testing` was provided.
     """
-    dataset = loadarff(path_training)
-    if "index" in dataset.columns:
-        dataset.index = dataset["index"].astype(object)
-        dataset.drop("index", axis=1, inplace=True)
+    dataset = _loadarff_with_index(path_training)

     x_train, y_train = get_x_y(dataset, attr_labels, pos_label, survival)

@@ -196,10 +205,7 @@ def load_arff_files_standardized(


 def _load_arff_testing(path_testing, attr_labels, pos_label, survival):
-    test_dataset = loadarff(path_testing)
-    if "index" in test_dataset.columns:
-        test_dataset.index = test_dataset["index"].astype(object)
-        test_dataset.drop("index", axis=1, inplace=True)
+    test_dataset = _loadarff_with_index(path_testing)

     has_labels = pd.Index(attr_labels).isin(test_dataset.columns).all()
     if not has_labels:
diff --git a/sksurv/io/arffwrite.py b/sksurv/io/arffwrite.py
index 781b800f..a24f357c 100644
--- a/sksurv/io/arffwrite.py
+++ b/sksurv/io/arffwrite.py
@@ -66,7 +66,7 @@ def _write_header(data, fp, relation_name, index):

     attribute_names = _sanitize_column_names(data)

-    for column, series in data.iteritems():
+    for column, series in data.items():
         name = attribute_names[column]
         fp.write(f"@attribute {name}\t")

diff --git a/sksurv/util.py b/sksurv/util.py
index b8778143..f192e828 100644
--- a/sksurv/util.py
+++ b/sksurv/util.py
@@ -243,7 +243,7 @@ def safe_concat(objs, *args, **kwargs):
             categories[df.name] = {"categories": df.cat.categories, "ordered": df.cat.ordered}
         else:
             dfc = df.select_dtypes(include=["category"])
-            for name, s in dfc.iteritems():
+            for name, s in dfc.items():
                 if name in categories:
                     if axis == 1:
                         raise ValueError(f"duplicate columns {name}")
diff --git a/tests/conftest.py b/tests/conftest.py
index bd2ea4f2..524c5985 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -58,7 +58,7 @@ def whas500_sparse_data():
     data = []
     index_i = []
     index_j = []
-    for j, (_, col) in enumerate(x_dense.iteritems()):
+    for j, (_, col) in enumerate(x_dense.items()):
         idx = np.flatnonzero(col.values)
         data.extend([1] * len(idx))
         index_i.extend(idx)
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 08cc769c..774272fe 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -311,7 +311,6 @@ def assert_x_equal(x_true, x_train):
         check_index_type=False,
         check_column_type=True,
         check_names=False,
-        check_less_precise=True,
     )


@@ -335,11 +334,8 @@ def arff_2(self):
         return StringIO(ARFF_CATEGORICAL_INDEX_2)

     def data_with_categorical_index_1(self):
-        index = pd.Index(
-            ["SampleOne", "SampleTwo", "SampleThree", "SampleFour"],
-            name="index",
-            dtype=object,
-        )
+        values = ["SampleOne", "SampleTwo", "SampleThree", "SampleFour"]
+        index = pd.Index(values, name="index", dtype=object)
         x = pd.DataFrame.from_dict(
             {
                 "size": pd.Series(
@@ -375,11 +371,8 @@ def data_with_categorical_index_1(self):

         return args, kwargs, x, y, None, None

     def data_with_categorical_index_2(self):
-        index = pd.Index(
-            ["ASampleOne", "ASampleTwo", "ASampleThree", "ASampleFour", "ASampleFive"],
-            name="index",
-            dtype=object,
-        )
+        values = ["ASampleOne", "ASampleTwo", "ASampleThree", "ASampleFour", "ASampleFive"]
+        index = pd.Index(values, name="index", dtype=object)
         y = pd.DataFrame.from_dict(
             {
diff --git a/tests/test_io.py b/tests/test_io.py
index acc8c44e..90f32077 100644
--- a/tests/test_io.py
+++ b/tests/test_io.py
@@ -70,7 +70,7 @@ def data_nominal_with_quotes(self):

     def data_nominal_as_category(self):
         data, rel_name, expected = self.data_nominal_with_quotes()
-        for name, series in data.iteritems():
+        for name, series in data.items():
             data[name] = pd.Categorical(series, ordered=False)
         expected[3] = '@attribute attr_nominal_spaces\t{"hard liquor","red wine",mate}\n'

diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py
index 16b3ea1b..a66ac322 100644
--- a/tests/test_preprocessing.py
+++ b/tests/test_preprocessing.py
@@ -11,7 +11,7 @@

 def _encoded_data(data):
     expected = []
-    for nam, col in data.iteritems():
+    for nam, col in data.items():
         if hasattr(col, "cat"):
             for cat in col.cat.categories[1:]:
                 name = f"{nam}={cat}"

From fcba890faf8907fc60ea2c9c17a43f72574516ec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastian=20P=C3=B6lsterl?=
Date: Sat, 10 Jun 2023 19:02:09 +0200
Subject: [PATCH 2/2] CI: Use at most pandas 1.5

pandas 2.0 is not available in conda's main channel yet
---
 .github/workflows/tests-workflow.yaml                       | 6 +++---
 ci/appveyor/py310.ps1                                       | 2 +-
 ci/appveyor/py311.ps1                                       | 2 +-
 ...21_sklearn12.sh => py310_pandas14_numpy121_sklearn12.sh} | 2 +-
 ...23_sklearn12.sh => py311_pandas15_numpy123_sklearn12.sh} | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)
 rename ci/deps/{py310_pandas15_numpy121_sklearn12.sh => py310_pandas14_numpy121_sklearn12.sh} (81%)
 rename ci/deps/{py311_pandas20_numpy123_sklearn12.sh => py311_pandas15_numpy123_sklearn12.sh} (81%)

diff --git a/.github/workflows/tests-workflow.yaml b/.github/workflows/tests-workflow.yaml
index 2b0fc8a2..552aa756 100644
--- a/.github/workflows/tests-workflow.yaml
+++ b/.github/workflows/tests-workflow.yaml
@@ -14,8 +14,8 @@ jobs:
         config:
           - py38_pandas10_numpy119_sklearn12
          - py39_pandas13_numpy121_sklearn12
-          - py310_pandas15_numpy121_sklearn12
-          - py311_pandas20_numpy123_sklearn12
+          - py310_pandas14_numpy121_sklearn12
+          - py311_pandas15_numpy123_sklearn12
         runner:
           - ubuntu-latest
           - macos-latest
@@ -101,7 +101,7 @@ jobs:
       - name: Set dependencies
        id: dependencies
        env:
-          DEPS_CONFIG: py310_pandas15_numpy121_sklearn12
+          DEPS_CONFIG: py310_pandas14_numpy121_sklearn12
        run: |
          source ci/deps/${DEPS_CONFIG}.sh
          echo "CI_PYTHON_VERSION=${CI_PYTHON_VERSION}" >> $GITHUB_ENV
diff --git a/ci/appveyor/py310.ps1 b/ci/appveyor/py310.ps1
index ac407a6f..b3d87f28 100644
--- a/ci/appveyor/py310.ps1
+++ b/ci/appveyor/py310.ps1
@@ -1,4 +1,4 @@
 $env:CI_PYTHON_VERSION="3.10.*"
-$env:CI_PANDAS_VERSION="1.5.*"
+$env:CI_PANDAS_VERSION="1.4.*"
 $env:CI_NUMPY_VERSION="1.21.*"
 $env:CI_SKLEARN_VERSION="1.2.*"
diff --git a/ci/appveyor/py311.ps1 b/ci/appveyor/py311.ps1
index d5afd224..b8e36bf5 100644
--- a/ci/appveyor/py311.ps1
+++ b/ci/appveyor/py311.ps1
@@ -1,4 +1,4 @@
 $env:CI_PYTHON_VERSION="3.11.*"
-$env:CI_PANDAS_VERSION="2.0.*"
+$env:CI_PANDAS_VERSION="1.5.*"
 $env:CI_NUMPY_VERSION="1.23.*"
 $env:CI_SKLEARN_VERSION="1.2.*"
diff --git a/ci/deps/py310_pandas15_numpy121_sklearn12.sh b/ci/deps/py310_pandas14_numpy121_sklearn12.sh
similarity index 81%
rename from ci/deps/py310_pandas15_numpy121_sklearn12.sh
rename to ci/deps/py310_pandas14_numpy121_sklearn12.sh
index 0ef1d509..8ca2259a 100644
--- a/ci/deps/py310_pandas15_numpy121_sklearn12.sh
+++ b/ci/deps/py310_pandas14_numpy121_sklearn12.sh
@@ -1,6 +1,6 @@
 # shellcheck shell=sh
 export CI_PYTHON_VERSION='3.10.*'
-export CI_PANDAS_VERSION='1.5.*'
+export CI_PANDAS_VERSION='1.4.*'
 export CI_NUMPY_VERSION='1.21.*'
 export CI_SKLEARN_VERSION='1.2.*'
 export CI_NO_SLOW=true
diff --git a/ci/deps/py311_pandas20_numpy123_sklearn12.sh b/ci/deps/py311_pandas15_numpy123_sklearn12.sh
similarity index 81%
rename from ci/deps/py311_pandas20_numpy123_sklearn12.sh
rename to ci/deps/py311_pandas15_numpy123_sklearn12.sh
index 9faf8edc..673c48e9 100644
--- a/ci/deps/py311_pandas20_numpy123_sklearn12.sh
+++ b/ci/deps/py311_pandas15_numpy123_sklearn12.sh
@@ -1,6 +1,6 @@
 # shellcheck shell=sh
 export CI_PYTHON_VERSION='3.11.*'
-export CI_PANDAS_VERSION='2.0.*'
+export CI_PANDAS_VERSION='1.5.*'
 export CI_NUMPY_VERSION='1.23.*'
 export CI_SKLEARN_VERSION='1.2.*'
 export CI_NO_SLOW=false