Skip to content

Commit

Permalink
Merge pull request #373 from sebp/pandas-v2
Browse files Browse the repository at this point in the history
Add support for pandas 2.0
  • Loading branch information
sebp authored Jun 10, 2023
2 parents c65ab0b + fcba890 commit 4021b27
Show file tree
Hide file tree
Showing 9 changed files with 26 additions and 31 deletions.
6 changes: 1 addition & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ dependencies = [
"numexpr",
"numpy",
"osqp !=0.6.0,!=0.6.1",
"pandas >=1.0.5,<2",
"pandas >=1.0.5",
"scipy >=1.3.2",
"scikit-learn >=1.2.0,<1.3",
]
Expand Down Expand Up @@ -136,10 +136,6 @@ filterwarnings = [
"ignore:The distutils package is deprecated:DeprecationWarning",
"ignore:Setuptools is replacing distutils",
"ignore:distutils Version classes are deprecated.*:DeprecationWarning",
# check_less_precise has been deprecated in pandas 1.1.0
"ignore:The 'check_less_precise' keyword in testing\\.assert_\\*_equal is deprecated and will be removed in a future version.*:FutureWarning",
# iteritems has been deprecated in pandas 1.5.0
"ignore:iteritems is deprecated and will be removed in a future version. Use \\.items instead:FutureWarning",
"ignore:Jupyter is migrating its paths to use standard platformdirs:DeprecationWarning",
]

Expand Down
4 changes: 2 additions & 2 deletions sksurv/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,12 +134,12 @@ def _is_categorical_or_object(series):

if columns is None:
# for columns containing categories
columns_to_encode = {nam for nam, s in table.iteritems() if _is_categorical_or_object(s)}
columns_to_encode = {nam for nam, s in table.items() if _is_categorical_or_object(s)}
else:
columns_to_encode = set(columns)

items = []
for name, series in table.iteritems():
for name, series in table.items():
if name in columns_to_encode:
series = _encode_categorical_series(series, **kwargs)
if series is None:
Expand Down
22 changes: 14 additions & 8 deletions sksurv/datasets/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import numpy as np
import pandas as pd
from pandas.api.types import is_categorical_dtype

from ..column import categorical_to_numeric, standardize
from ..io import loadarff
Expand Down Expand Up @@ -98,6 +99,17 @@ def get_x_y(data_frame, attr_labels, pos_label=None, survival=True):
return _get_x_y_other(data_frame, attr_labels)


def _loadarff_with_index(filename):
    """Load an ARFF file and promote its "index" column, if any, to the DataFrame index.

    Parameters
    ----------
    filename : str or file-like
        Path (or buffer) passed straight through to :func:`loadarff`.

    Returns
    -------
    pandas.DataFrame
        The loaded data. If the file contains an "index" column, it is
        removed from the columns and used as the row index instead.
    """
    dataset = loadarff(filename)
    if "index" in dataset.columns:
        # pandas.api.types.is_categorical_dtype is deprecated since pandas 2.1;
        # the isinstance check on CategoricalDtype is the supported replacement.
        if isinstance(dataset["index"].dtype, pd.CategoricalDtype):
            # concatenating categorical index may raise TypeError
            # see https://github.com/pandas-dev/pandas/issues/14586
            dataset["index"] = dataset["index"].astype(object)
        dataset.set_index("index", inplace=True)
    return dataset


def load_arff_files_standardized(
path_training,
attr_labels,
Expand Down Expand Up @@ -154,10 +166,7 @@ def load_arff_files_standardized(
y_test : None or pandas.DataFrame, shape = (n_train, n_labels)
Dependent variables of testing data if `path_testing` was provided.
"""
dataset = loadarff(path_training)
if "index" in dataset.columns:
dataset.index = dataset["index"].astype(object)
dataset.drop("index", axis=1, inplace=True)
dataset = _loadarff_with_index(path_training)

x_train, y_train = get_x_y(dataset, attr_labels, pos_label, survival)

Expand Down Expand Up @@ -196,10 +205,7 @@ def load_arff_files_standardized(


def _load_arff_testing(path_testing, attr_labels, pos_label, survival):
test_dataset = loadarff(path_testing)
if "index" in test_dataset.columns:
test_dataset.index = test_dataset["index"].astype(object)
test_dataset.drop("index", axis=1, inplace=True)
test_dataset = _loadarff_with_index(path_testing)

has_labels = pd.Index(attr_labels).isin(test_dataset.columns).all()
if not has_labels:
Expand Down
2 changes: 1 addition & 1 deletion sksurv/io/arffwrite.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def _write_header(data, fp, relation_name, index):

attribute_names = _sanitize_column_names(data)

for column, series in data.iteritems():
for column, series in data.items():
name = attribute_names[column]
fp.write(f"@attribute {name}\t")

Expand Down
2 changes: 1 addition & 1 deletion sksurv/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ def safe_concat(objs, *args, **kwargs):
categories[df.name] = {"categories": df.cat.categories, "ordered": df.cat.ordered}
else:
dfc = df.select_dtypes(include=["category"])
for name, s in dfc.iteritems():
for name, s in dfc.items():
if name in categories:
if axis == 1:
raise ValueError(f"duplicate columns {name}")
Expand Down
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def whas500_sparse_data():
data = []
index_i = []
index_j = []
for j, (_, col) in enumerate(x_dense.iteritems()):
for j, (_, col) in enumerate(x_dense.items()):
idx = np.flatnonzero(col.values)
data.extend([1] * len(idx))
index_i.extend(idx)
Expand Down
15 changes: 4 additions & 11 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,6 @@ def assert_x_equal(x_true, x_train):
check_index_type=False,
check_column_type=True,
check_names=False,
check_less_precise=True,
)


Expand All @@ -335,11 +334,8 @@ def arff_2(self):
return StringIO(ARFF_CATEGORICAL_INDEX_2)

def data_with_categorical_index_1(self):
index = pd.Index(
["SampleOne", "SampleTwo", "SampleThree", "SampleFour"],
name="index",
dtype=object,
)
values = ["SampleOne", "SampleTwo", "SampleThree", "SampleFour"]
index = pd.Index(values, name="index", dtype=object)
x = pd.DataFrame.from_dict(
{
"size": pd.Series(
Expand Down Expand Up @@ -375,11 +371,8 @@ def data_with_categorical_index_1(self):
return args, kwargs, x, y, None, None

def data_with_categorical_index_2(self):
index = pd.Index(
["ASampleOne", "ASampleTwo", "ASampleThree", "ASampleFour", "ASampleFive"],
name="index",
dtype=object,
)
values = ["ASampleOne", "ASampleTwo", "ASampleThree", "ASampleFour", "ASampleFive"]
index = pd.Index(values, name="index", dtype=object)

y = pd.DataFrame.from_dict(
{
Expand Down
2 changes: 1 addition & 1 deletion tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def data_nominal_with_quotes(self):

def data_nominal_as_category(self):
data, rel_name, expected = self.data_nominal_with_quotes()
for name, series in data.iteritems():
for name, series in data.items():
data[name] = pd.Categorical(series, ordered=False)

expected[3] = '@attribute attr_nominal_spaces\t{"hard liquor","red wine",mate}\n'
Expand Down
2 changes: 1 addition & 1 deletion tests/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

def _encoded_data(data):
expected = []
for nam, col in data.iteritems():
for nam, col in data.items():
if hasattr(col, "cat"):
for cat in col.cat.categories[1:]:
name = f"{nam}={cat}"
Expand Down

0 comments on commit 4021b27

Please sign in to comment.