diff --git a/pyproject.toml b/pyproject.toml index 50d3ab49..e0fc9ddd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,7 +61,7 @@ dependencies = [ "numexpr", "numpy", "osqp !=0.6.0,!=0.6.1", - "pandas >=1.0.5,<2", + "pandas >=1.0.5", "scipy >=1.3.2", "scikit-learn >=1.2.0,<1.3", ] @@ -136,10 +136,6 @@ filterwarnings = [ "ignore:The distutils package is deprecated:DeprecationWarning", "ignore:Setuptools is replacing distutils", "ignore:distutils Version classes are deprecated.*:DeprecationWarning", - # check_less_precise has been deprecated in pandas 1.1.0 - "ignore:The 'check_less_precise' keyword in testing\\.assert_\\*_equal is deprecated and will be removed in a future version.*:FutureWarning", - # iteritems has been deprecated in pandas 1.5.0 - "ignore:iteritems is deprecated and will be removed in a future version. Use \\.items instead:FutureWarning", "ignore:Jupyter is migrating its paths to use standard platformdirs:DeprecationWarning", ] diff --git a/sksurv/column.py b/sksurv/column.py index caa09c22..1e298472 100644 --- a/sksurv/column.py +++ b/sksurv/column.py @@ -134,12 +134,12 @@ def _is_categorical_or_object(series): if columns is None: # for columns containing categories - columns_to_encode = {nam for nam, s in table.iteritems() if _is_categorical_or_object(s)} + columns_to_encode = {nam for nam, s in table.items() if _is_categorical_or_object(s)} else: columns_to_encode = set(columns) items = [] - for name, series in table.iteritems(): + for name, series in table.items(): if name in columns_to_encode: series = _encode_categorical_series(series, **kwargs) if series is None: diff --git a/sksurv/datasets/base.py b/sksurv/datasets/base.py index d5bc7432..76e93061 100644 --- a/sksurv/datasets/base.py +++ b/sksurv/datasets/base.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd +from pandas.api.types import CategoricalDtype from ..column import categorical_to_numeric, standardize from ..io import loadarff @@ -98,6 +99,17 @@ def 
get_x_y(data_frame, attr_labels, pos_label=None, survival=True): return _get_x_y_other(data_frame, attr_labels) +def _loadarff_with_index(filename): + dataset = loadarff(filename) + if "index" in dataset.columns: + if isinstance(dataset["index"].dtype, CategoricalDtype): + # concatenating categorical index may raise TypeError + # see https://github.com/pandas-dev/pandas/issues/14586 + dataset["index"] = dataset["index"].astype(object) + dataset.set_index("index", inplace=True) + return dataset + + def load_arff_files_standardized( path_training, attr_labels, @@ -154,10 +166,7 @@ def load_arff_files_standardized( y_test : None or pandas.DataFrame, shape = (n_train, n_labels) Dependent variables of testing data if `path_testing` was provided. """ - dataset = loadarff(path_training) - if "index" in dataset.columns: - dataset.index = dataset["index"].astype(object) - dataset.drop("index", axis=1, inplace=True) + dataset = _loadarff_with_index(path_training) x_train, y_train = get_x_y(dataset, attr_labels, pos_label, survival) @@ -196,10 +205,7 @@ def load_arff_files_standardized( def _load_arff_testing(path_testing, attr_labels, pos_label, survival): - test_dataset = loadarff(path_testing) - if "index" in test_dataset.columns: - test_dataset.index = test_dataset["index"].astype(object) - test_dataset.drop("index", axis=1, inplace=True) + test_dataset = _loadarff_with_index(path_testing) has_labels = pd.Index(attr_labels).isin(test_dataset.columns).all() if not has_labels: diff --git a/sksurv/io/arffwrite.py b/sksurv/io/arffwrite.py index 781b800f..a24f357c 100644 --- a/sksurv/io/arffwrite.py +++ b/sksurv/io/arffwrite.py @@ -66,7 +66,7 @@ def _write_header(data, fp, relation_name, index): attribute_names = _sanitize_column_names(data) - for column, series in data.iteritems(): + for column, series in data.items(): name = attribute_names[column] fp.write(f"@attribute {name}\t") diff --git a/sksurv/util.py b/sksurv/util.py index 92f32852..ba764aff 100644 --- a/sksurv/util.py +++ 
b/sksurv/util.py @@ -243,7 +243,7 @@ def safe_concat(objs, *args, **kwargs): categories[df.name] = {"categories": df.cat.categories, "ordered": df.cat.ordered} else: dfc = df.select_dtypes(include=["category"]) - for name, s in dfc.iteritems(): + for name, s in dfc.items(): if name in categories: if axis == 1: raise ValueError(f"duplicate columns {name}") diff --git a/tests/conftest.py b/tests/conftest.py index bd2ea4f2..524c5985 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -58,7 +58,7 @@ def whas500_sparse_data(): data = [] index_i = [] index_j = [] - for j, (_, col) in enumerate(x_dense.iteritems()): + for j, (_, col) in enumerate(x_dense.items()): idx = np.flatnonzero(col.values) data.extend([1] * len(idx)) index_i.extend(idx) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 08cc769c..774272fe 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -311,7 +311,6 @@ def assert_x_equal(x_true, x_train): check_index_type=False, check_column_type=True, check_names=False, - check_less_precise=True, ) @@ -335,11 +334,8 @@ def arff_2(self): return StringIO(ARFF_CATEGORICAL_INDEX_2) def data_with_categorical_index_1(self): - index = pd.Index( - ["SampleOne", "SampleTwo", "SampleThree", "SampleFour"], - name="index", - dtype=object, - ) + values = ["SampleOne", "SampleTwo", "SampleThree", "SampleFour"] + index = pd.Index(values, name="index", dtype=object) x = pd.DataFrame.from_dict( { "size": pd.Series( @@ -375,11 +371,8 @@ def data_with_categorical_index_1(self): return args, kwargs, x, y, None, None def data_with_categorical_index_2(self): - index = pd.Index( - ["ASampleOne", "ASampleTwo", "ASampleThree", "ASampleFour", "ASampleFive"], - name="index", - dtype=object, - ) + values = ["ASampleOne", "ASampleTwo", "ASampleThree", "ASampleFour", "ASampleFive"] + index = pd.Index(values, name="index", dtype=object) y = pd.DataFrame.from_dict( { diff --git a/tests/test_io.py b/tests/test_io.py index acc8c44e..90f32077 100644 --- 
a/tests/test_io.py +++ b/tests/test_io.py @@ -70,7 +70,7 @@ def data_nominal_with_quotes(self): def data_nominal_as_category(self): data, rel_name, expected = self.data_nominal_with_quotes() - for name, series in data.iteritems(): + for name, series in data.items(): data[name] = pd.Categorical(series, ordered=False) expected[3] = '@attribute attr_nominal_spaces\t{"hard liquor","red wine",mate}\n' diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index 16b3ea1b..a66ac322 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -11,7 +11,7 @@ def _encoded_data(data): expected = [] - for nam, col in data.iteritems(): + for nam, col in data.items(): if hasattr(col, "cat"): for cat in col.cat.categories[1:]: name = f"{nam}={cat}"