From 8f0d7ed76335ec5ae9531be340354fbc3d199b21 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastian=20P=C3=B6lsterl?=
Date: Sat, 10 Jun 2023 15:15:44 +0200
Subject: [PATCH 1/2] Add support for pandas 2.0

Fixes #372
---
 .github/workflows/tests-workflow.yaml         |  6 ++---
 ci/appveyor/py310.ps1                         |  2 +-
 ci/appveyor/py311.ps1                         |  2 +-
 ...h => py310_pandas15_numpy121_sklearn12.sh} |  2 +-
 ...h => py311_pandas20_numpy123_sklearn12.sh} |  2 +-
 pyproject.toml                                |  6 +----
 sksurv/column.py                              |  4 ++--
 sksurv/datasets/base.py                       | 22 ++++++++++++-------
 sksurv/io/arffwrite.py                        |  2 +-
 sksurv/util.py                                |  2 +-
 tests/conftest.py                             |  2 +-
 tests/test_datasets.py                        | 15 ++++---------
 tests/test_io.py                              |  2 +-
 tests/test_preprocessing.py                   |  2 +-
 14 files changed, 33 insertions(+), 38 deletions(-)
 rename ci/deps/{py310_pandas14_numpy121_sklearn12.sh => py310_pandas15_numpy121_sklearn12.sh} (81%)
 rename ci/deps/{py311_pandas15_numpy123_sklearn12.sh => py311_pandas20_numpy123_sklearn12.sh} (81%)

diff --git a/.github/workflows/tests-workflow.yaml b/.github/workflows/tests-workflow.yaml
index 552aa756..2b0fc8a2 100644
--- a/.github/workflows/tests-workflow.yaml
+++ b/.github/workflows/tests-workflow.yaml
@@ -14,8 +14,8 @@ jobs:
         config:
           - py38_pandas10_numpy119_sklearn12
           - py39_pandas13_numpy121_sklearn12
-          - py310_pandas14_numpy121_sklearn12
-          - py311_pandas15_numpy123_sklearn12
+          - py310_pandas15_numpy121_sklearn12
+          - py311_pandas20_numpy123_sklearn12
         runner:
           - ubuntu-latest
           - macos-latest
@@ -101,7 +101,7 @@ jobs:
       - name: Set dependencies
        id: dependencies
        env:
-          DEPS_CONFIG: py310_pandas14_numpy121_sklearn12
+          DEPS_CONFIG: py310_pandas15_numpy121_sklearn12
        run: |
          source ci/deps/${DEPS_CONFIG}.sh
          echo "CI_PYTHON_VERSION=${CI_PYTHON_VERSION}" >> $GITHUB_ENV
diff --git a/ci/appveyor/py310.ps1 b/ci/appveyor/py310.ps1
index b3d87f28..ac407a6f 100644
--- a/ci/appveyor/py310.ps1
+++ b/ci/appveyor/py310.ps1
@@ -1,4 +1,4 @@
 $env:CI_PYTHON_VERSION="3.10.*"
-$env:CI_PANDAS_VERSION="1.4.*"
+$env:CI_PANDAS_VERSION="1.5.*"
 $env:CI_NUMPY_VERSION="1.21.*"
 $env:CI_SKLEARN_VERSION="1.2.*"
diff --git a/ci/appveyor/py311.ps1 b/ci/appveyor/py311.ps1
index b8e36bf5..d5afd224 100644
--- a/ci/appveyor/py311.ps1
+++ b/ci/appveyor/py311.ps1
@@ -1,4 +1,4 @@
 $env:CI_PYTHON_VERSION="3.11.*"
-$env:CI_PANDAS_VERSION="1.5.*"
+$env:CI_PANDAS_VERSION="2.0.*"
 $env:CI_NUMPY_VERSION="1.23.*"
 $env:CI_SKLEARN_VERSION="1.2.*"
diff --git a/ci/deps/py310_pandas14_numpy121_sklearn12.sh b/ci/deps/py310_pandas15_numpy121_sklearn12.sh
similarity index 81%
rename from ci/deps/py310_pandas14_numpy121_sklearn12.sh
rename to ci/deps/py310_pandas15_numpy121_sklearn12.sh
index 8ca2259a..0ef1d509 100644
--- a/ci/deps/py310_pandas14_numpy121_sklearn12.sh
+++ b/ci/deps/py310_pandas15_numpy121_sklearn12.sh
@@ -1,6 +1,6 @@
 # shellcheck shell=sh
 export CI_PYTHON_VERSION='3.10.*'
-export CI_PANDAS_VERSION='1.4.*'
+export CI_PANDAS_VERSION='1.5.*'
 export CI_NUMPY_VERSION='1.21.*'
 export CI_SKLEARN_VERSION='1.2.*'
 export CI_NO_SLOW=true
diff --git a/ci/deps/py311_pandas15_numpy123_sklearn12.sh b/ci/deps/py311_pandas20_numpy123_sklearn12.sh
similarity index 81%
rename from ci/deps/py311_pandas15_numpy123_sklearn12.sh
rename to ci/deps/py311_pandas20_numpy123_sklearn12.sh
index 673c48e9..9faf8edc 100644
--- a/ci/deps/py311_pandas15_numpy123_sklearn12.sh
+++ b/ci/deps/py311_pandas20_numpy123_sklearn12.sh
@@ -1,6 +1,6 @@
 # shellcheck shell=sh
 export CI_PYTHON_VERSION='3.11.*'
-export CI_PANDAS_VERSION='1.5.*'
+export CI_PANDAS_VERSION='2.0.*'
 export CI_NUMPY_VERSION='1.23.*'
 export CI_SKLEARN_VERSION='1.2.*'
 export CI_NO_SLOW=false
diff --git a/pyproject.toml b/pyproject.toml
index 50d3ab49..e0fc9ddd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -61,7 +61,7 @@ dependencies = [
     "numexpr",
     "numpy",
     "osqp !=0.6.0,!=0.6.1",
-    "pandas >=1.0.5,<2",
+    "pandas >=1.0.5",
     "scipy >=1.3.2",
     "scikit-learn >=1.2.0,<1.3",
 ]
@@ -136,10 +136,6 @@ filterwarnings = [
     "ignore:The distutils package is deprecated:DeprecationWarning",
     "ignore:Setuptools is replacing distutils",
     "ignore:distutils Version classes are deprecated.*:DeprecationWarning",
-    # check_less_precise has been deprecated in pandas 1.1.0
-    "ignore:The 'check_less_precise' keyword in testing\\.assert_\\*_equal is deprecated and will be removed in a future version.*:FutureWarning",
-    # iteritems has been deprecated in pandas 1.5.0
-    "ignore:iteritems is deprecated and will be removed in a future version. Use \\.items instead:FutureWarning",
     "ignore:Jupyter is migrating its paths to use standard platformdirs:DeprecationWarning",
 ]

diff --git a/sksurv/column.py b/sksurv/column.py
index caa09c22..1e298472 100644
--- a/sksurv/column.py
+++ b/sksurv/column.py
@@ -134,12 +134,12 @@ def _is_categorical_or_object(series):

     if columns is None:
         # for columns containing categories
-        columns_to_encode = {nam for nam, s in table.iteritems() if _is_categorical_or_object(s)}
+        columns_to_encode = {nam for nam, s in table.items() if _is_categorical_or_object(s)}
     else:
         columns_to_encode = set(columns)

     items = []
-    for name, series in table.iteritems():
+    for name, series in table.items():
         if name in columns_to_encode:
             series = _encode_categorical_series(series, **kwargs)
             if series is None:
diff --git a/sksurv/datasets/base.py b/sksurv/datasets/base.py
index d5bc7432..76e93061 100644
--- a/sksurv/datasets/base.py
+++ b/sksurv/datasets/base.py
@@ -3,6 +3,7 @@

 import numpy as np
 import pandas as pd
+from pandas.api.types import is_categorical_dtype

 from ..column import categorical_to_numeric, standardize
 from ..io import loadarff
@@ -98,6 +99,17 @@ def get_x_y(data_frame, attr_labels, pos_label=None, survival=True):
     return _get_x_y_other(data_frame, attr_labels)


+def _loadarff_with_index(filename):
+    dataset = loadarff(filename)
+    if "index" in dataset.columns:
+        if is_categorical_dtype(dataset["index"].dtype):
+            # concatenating categorical index may raise TypeError
+            # see https://github.com/pandas-dev/pandas/issues/14586
+            dataset["index"] = dataset["index"].astype(object)
+        dataset.set_index("index", inplace=True)
+    return dataset
+
+
 def load_arff_files_standardized(
     path_training,
     attr_labels,
@@ -154,10 +166,7 @@ def load_arff_files_standardized(
     y_test : None or pandas.DataFrame, shape = (n_train, n_labels)
         Dependent variables of testing data if `path_testing` was provided.
     """
-    dataset = loadarff(path_training)
-    if "index" in dataset.columns:
-        dataset.index = dataset["index"].astype(object)
-        dataset.drop("index", axis=1, inplace=True)
+    dataset = _loadarff_with_index(path_training)

     x_train, y_train = get_x_y(dataset, attr_labels, pos_label, survival)

@@ -196,10 +205,7 @@ def load_arff_files_standardized(


 def _load_arff_testing(path_testing, attr_labels, pos_label, survival):
-    test_dataset = loadarff(path_testing)
-    if "index" in test_dataset.columns:
-        test_dataset.index = test_dataset["index"].astype(object)
-        test_dataset.drop("index", axis=1, inplace=True)
+    test_dataset = _loadarff_with_index(path_testing)

     has_labels = pd.Index(attr_labels).isin(test_dataset.columns).all()
     if not has_labels:
diff --git a/sksurv/io/arffwrite.py b/sksurv/io/arffwrite.py
index 781b800f..a24f357c 100644
--- a/sksurv/io/arffwrite.py
+++ b/sksurv/io/arffwrite.py
@@ -66,7 +66,7 @@ def _write_header(data, fp, relation_name, index):

     attribute_names = _sanitize_column_names(data)

-    for column, series in data.iteritems():
+    for column, series in data.items():
         name = attribute_names[column]
         fp.write(f"@attribute {name}\t")

diff --git a/sksurv/util.py b/sksurv/util.py
index b8778143..f192e828 100644
--- a/sksurv/util.py
+++ b/sksurv/util.py
@@ -243,7 +243,7 @@ def safe_concat(objs, *args, **kwargs):
             categories[df.name] = {"categories": df.cat.categories, "ordered": df.cat.ordered}
         else:
             dfc = df.select_dtypes(include=["category"])
-            for name, s in dfc.iteritems():
+            for name, s in dfc.items():
                 if name in categories:
                     if axis == 1:
                         raise ValueError(f"duplicate columns {name}")
diff --git a/tests/conftest.py b/tests/conftest.py
index bd2ea4f2..524c5985 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -58,7 +58,7 @@ def whas500_sparse_data():
     data = []
     index_i = []
     index_j = []
-    for j, (_, col) in enumerate(x_dense.iteritems()):
+    for j, (_, col) in enumerate(x_dense.items()):
         idx = np.flatnonzero(col.values)
         data.extend([1] * len(idx))
         index_i.extend(idx)
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 08cc769c..774272fe 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -311,7 +311,6 @@ def assert_x_equal(x_true, x_train):
         check_index_type=False,
         check_column_type=True,
         check_names=False,
-        check_less_precise=True,
     )


@@ -335,11 +334,8 @@ def arff_2(self):
         return StringIO(ARFF_CATEGORICAL_INDEX_2)

     def data_with_categorical_index_1(self):
-        index = pd.Index(
-            ["SampleOne", "SampleTwo", "SampleThree", "SampleFour"],
-            name="index",
-            dtype=object,
-        )
+        values = ["SampleOne", "SampleTwo", "SampleThree", "SampleFour"]
+        index = pd.Index(values, name="index", dtype=object)
         x = pd.DataFrame.from_dict(
             {
                 "size": pd.Series(
@@ -375,11 +371,8 @@ def data_with_categorical_index_1(self):

         return args, kwargs, x, y, None, None

     def data_with_categorical_index_2(self):
-        index = pd.Index(
-            ["ASampleOne", "ASampleTwo", "ASampleThree", "ASampleFour", "ASampleFive"],
-            name="index",
-            dtype=object,
-        )
+        values = ["ASampleOne", "ASampleTwo", "ASampleThree", "ASampleFour", "ASampleFive"]
+        index = pd.Index(values, name="index", dtype=object)
         y = pd.DataFrame.from_dict(
             {
diff --git a/tests/test_io.py b/tests/test_io.py
index acc8c44e..90f32077 100644
--- a/tests/test_io.py
+++ b/tests/test_io.py
@@ -70,7 +70,7 @@ def data_nominal_with_quotes(self):

     def data_nominal_as_category(self):
         data, rel_name, expected = self.data_nominal_with_quotes()
-        for name, series in data.iteritems():
+        for name, series in data.items():
             data[name] = pd.Categorical(series, ordered=False)
         expected[3] = '@attribute attr_nominal_spaces\t{"hard liquor","red wine",mate}\n'

diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py
index 16b3ea1b..a66ac322 100644
--- a/tests/test_preprocessing.py
+++ b/tests/test_preprocessing.py
@@ -11,7 +11,7 @@

 def _encoded_data(data):
     expected = []
-    for nam, col in data.iteritems():
+    for nam, col in data.items():
         if hasattr(col, "cat"):
             for cat in col.cat.categories[1:]:
                 name = f"{nam}={cat}"

From fcba890faf8907fc60ea2c9c17a43f72574516ec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastian=20P=C3=B6lsterl?=
Date: Sat, 10 Jun 2023 19:02:09 +0200
Subject: [PATCH 2/2] CI: Use at most pandas 1.5

pandas 2.0 is not available in conda's main channel yet
---
 .github/workflows/tests-workflow.yaml                       | 6 +++---
 ci/appveyor/py310.ps1                                       | 2 +-
 ci/appveyor/py311.ps1                                       | 2 +-
 ...21_sklearn12.sh => py310_pandas14_numpy121_sklearn12.sh} | 2 +-
 ...23_sklearn12.sh => py311_pandas15_numpy123_sklearn12.sh} | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)
 rename ci/deps/{py310_pandas15_numpy121_sklearn12.sh => py310_pandas14_numpy121_sklearn12.sh} (81%)
 rename ci/deps/{py311_pandas20_numpy123_sklearn12.sh => py311_pandas15_numpy123_sklearn12.sh} (81%)

diff --git a/.github/workflows/tests-workflow.yaml b/.github/workflows/tests-workflow.yaml
index 2b0fc8a2..552aa756 100644
--- a/.github/workflows/tests-workflow.yaml
+++ b/.github/workflows/tests-workflow.yaml
@@ -14,8 +14,8 @@ jobs:
         config:
           - py38_pandas10_numpy119_sklearn12
          - py39_pandas13_numpy121_sklearn12
-          - py310_pandas15_numpy121_sklearn12
-          - py311_pandas20_numpy123_sklearn12
+          - py310_pandas14_numpy121_sklearn12
+          - py311_pandas15_numpy123_sklearn12
         runner:
           - ubuntu-latest
           - macos-latest
@@ -101,7 +101,7 @@ jobs:
       - name: Set dependencies
        id: dependencies
        env:
-          DEPS_CONFIG: py310_pandas15_numpy121_sklearn12
+          DEPS_CONFIG: py310_pandas14_numpy121_sklearn12
        run: |
          source ci/deps/${DEPS_CONFIG}.sh
          echo "CI_PYTHON_VERSION=${CI_PYTHON_VERSION}" >> $GITHUB_ENV
diff --git a/ci/appveyor/py310.ps1 b/ci/appveyor/py310.ps1
index ac407a6f..b3d87f28 100644
--- a/ci/appveyor/py310.ps1
+++ b/ci/appveyor/py310.ps1
@@ -1,4 +1,4 @@
 $env:CI_PYTHON_VERSION="3.10.*"
-$env:CI_PANDAS_VERSION="1.5.*"
+$env:CI_PANDAS_VERSION="1.4.*"
 $env:CI_NUMPY_VERSION="1.21.*"
 $env:CI_SKLEARN_VERSION="1.2.*"
diff --git a/ci/appveyor/py311.ps1 b/ci/appveyor/py311.ps1
index d5afd224..b8e36bf5 100644
--- a/ci/appveyor/py311.ps1
+++ b/ci/appveyor/py311.ps1
@@ -1,4 +1,4 @@
 $env:CI_PYTHON_VERSION="3.11.*"
-$env:CI_PANDAS_VERSION="2.0.*"
+$env:CI_PANDAS_VERSION="1.5.*"
 $env:CI_NUMPY_VERSION="1.23.*"
 $env:CI_SKLEARN_VERSION="1.2.*"
diff --git a/ci/deps/py310_pandas15_numpy121_sklearn12.sh b/ci/deps/py310_pandas14_numpy121_sklearn12.sh
similarity index 81%
rename from ci/deps/py310_pandas15_numpy121_sklearn12.sh
rename to ci/deps/py310_pandas14_numpy121_sklearn12.sh
index 0ef1d509..8ca2259a 100644
--- a/ci/deps/py310_pandas15_numpy121_sklearn12.sh
+++ b/ci/deps/py310_pandas14_numpy121_sklearn12.sh
@@ -1,6 +1,6 @@
 # shellcheck shell=sh
 export CI_PYTHON_VERSION='3.10.*'
-export CI_PANDAS_VERSION='1.5.*'
+export CI_PANDAS_VERSION='1.4.*'
 export CI_NUMPY_VERSION='1.21.*'
 export CI_SKLEARN_VERSION='1.2.*'
 export CI_NO_SLOW=true
diff --git a/ci/deps/py311_pandas20_numpy123_sklearn12.sh b/ci/deps/py311_pandas15_numpy123_sklearn12.sh
similarity index 81%
rename from ci/deps/py311_pandas20_numpy123_sklearn12.sh
rename to ci/deps/py311_pandas15_numpy123_sklearn12.sh
index 9faf8edc..673c48e9 100644
--- a/ci/deps/py311_pandas20_numpy123_sklearn12.sh
+++ b/ci/deps/py311_pandas15_numpy123_sklearn12.sh
@@ -1,6 +1,6 @@
 # shellcheck shell=sh
 export CI_PYTHON_VERSION='3.11.*'
-export CI_PANDAS_VERSION='2.0.*'
+export CI_PANDAS_VERSION='1.5.*'
 export CI_NUMPY_VERSION='1.23.*'
 export CI_SKLEARN_VERSION='1.2.*'
 export CI_NO_SLOW=false