Skip to content

Commit

Permalink
Merge pull request #373 from sebp/pandas-v2
Browse files Browse the repository at this point in the history
Add support for pandas 2.0
  • Loading branch information
sebp authored Jun 10, 2023
2 parents c65ab0b + fcba890 commit 4021b27
Show file tree
Hide file tree
Showing 9 changed files with 26 additions and 31 deletions.
6 changes: 1 addition & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ dependencies = [
"numexpr",
"numpy",
"osqp !=0.6.0,!=0.6.1",
"pandas >=1.0.5,<2",
"pandas >=1.0.5",
"scipy >=1.3.2",
"scikit-learn >=1.2.0,<1.3",
]
Expand Down Expand Up @@ -136,10 +136,6 @@ filterwarnings = [
"ignore:The distutils package is deprecated:DeprecationWarning",
"ignore:Setuptools is replacing distutils",
"ignore:distutils Version classes are deprecated.*:DeprecationWarning",
# check_less_precise has been deprecated in pandas 1.1.0
"ignore:The 'check_less_precise' keyword in testing\\.assert_\\*_equal is deprecated and will be removed in a future version.*:FutureWarning",
# iteritems has been deprecated in pandas 1.5.0
"ignore:iteritems is deprecated and will be removed in a future version. Use \\.items instead:FutureWarning",
"ignore:Jupyter is migrating its paths to use standard platformdirs:DeprecationWarning",
]

Expand Down
4 changes: 2 additions & 2 deletions sksurv/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,12 +134,12 @@ def _is_categorical_or_object(series):

if columns is None:
# for columns containing categories
columns_to_encode = {nam for nam, s in table.iteritems() if _is_categorical_or_object(s)}
columns_to_encode = {nam for nam, s in table.items() if _is_categorical_or_object(s)}
else:
columns_to_encode = set(columns)

items = []
for name, series in table.iteritems():
for name, series in table.items():
if name in columns_to_encode:
series = _encode_categorical_series(series, **kwargs)
if series is None:
Expand Down
22 changes: 14 additions & 8 deletions sksurv/datasets/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import numpy as np
import pandas as pd
from pandas.api.types import is_categorical_dtype

from ..column import categorical_to_numeric, standardize
from ..io import loadarff
Expand Down Expand Up @@ -98,6 +99,17 @@ def get_x_y(data_frame, attr_labels, pos_label=None, survival=True):
return _get_x_y_other(data_frame, attr_labels)


def _loadarff_with_index(filename):
    """Load an ARFF file and promote its "index" column, if any, to the DataFrame index.

    Parameters
    ----------
    filename : str or file-like
        Path (or buffer) passed straight through to :func:`loadarff`.

    Returns
    -------
    pandas.DataFrame
        The loaded data. If the file contains an "index" column, it is
        removed from the columns and used as the row index instead.
    """
    dataset = loadarff(filename)
    if "index" in dataset.columns:
        # pandas.api.types.is_categorical_dtype is deprecated since pandas 2.1;
        # the isinstance check on CategoricalDtype is the supported replacement.
        if isinstance(dataset["index"].dtype, pd.CategoricalDtype):
            # concatenating categorical index may raise TypeError
            # see https://github.com/pandas-dev/pandas/issues/14586
            dataset["index"] = dataset["index"].astype(object)
        dataset.set_index("index", inplace=True)
    return dataset


def load_arff_files_standardized(
path_training,
attr_labels,
Expand Down Expand Up @@ -154,10 +166,7 @@ def load_arff_files_standardized(
y_test : None or pandas.DataFrame, shape = (n_train, n_labels)
Dependent variables of testing data if `path_testing` was provided.
"""
dataset = loadarff(path_training)
if "index" in dataset.columns:
dataset.index = dataset["index"].astype(object)
dataset.drop("index", axis=1, inplace=True)
dataset = _loadarff_with_index(path_training)

x_train, y_train = get_x_y(dataset, attr_labels, pos_label, survival)

Expand Down Expand Up @@ -196,10 +205,7 @@ def load_arff_files_standardized(


def _load_arff_testing(path_testing, attr_labels, pos_label, survival):
test_dataset = loadarff(path_testing)
if "index" in test_dataset.columns:
test_dataset.index = test_dataset["index"].astype(object)
test_dataset.drop("index", axis=1, inplace=True)
test_dataset = _loadarff_with_index(path_testing)

has_labels = pd.Index(attr_labels).isin(test_dataset.columns).all()
if not has_labels:
Expand Down
2 changes: 1 addition & 1 deletion sksurv/io/arffwrite.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def _write_header(data, fp, relation_name, index):

attribute_names = _sanitize_column_names(data)

for column, series in data.iteritems():
for column, series in data.items():
name = attribute_names[column]
fp.write(f"@attribute {name}\t")

Expand Down
2 changes: 1 addition & 1 deletion sksurv/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ def safe_concat(objs, *args, **kwargs):
categories[df.name] = {"categories": df.cat.categories, "ordered": df.cat.ordered}
else:
dfc = df.select_dtypes(include=["category"])
for name, s in dfc.iteritems():
for name, s in dfc.items():
if name in categories:
if axis == 1:
raise ValueError(f"duplicate columns {name}")
Expand Down
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def whas500_sparse_data():
data = []
index_i = []
index_j = []
for j, (_, col) in enumerate(x_dense.iteritems()):
for j, (_, col) in enumerate(x_dense.items()):
idx = np.flatnonzero(col.values)
data.extend([1] * len(idx))
index_i.extend(idx)
Expand Down
15 changes: 4 additions & 11 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,6 @@ def assert_x_equal(x_true, x_train):
check_index_type=False,
check_column_type=True,
check_names=False,
check_less_precise=True,
)


Expand All @@ -335,11 +334,8 @@ def arff_2(self):
return StringIO(ARFF_CATEGORICAL_INDEX_2)

def data_with_categorical_index_1(self):
index = pd.Index(
["SampleOne", "SampleTwo", "SampleThree", "SampleFour"],
name="index",
dtype=object,
)
values = ["SampleOne", "SampleTwo", "SampleThree", "SampleFour"]
index = pd.Index(values, name="index", dtype=object)
x = pd.DataFrame.from_dict(
{
"size": pd.Series(
Expand Down Expand Up @@ -375,11 +371,8 @@ def data_with_categorical_index_1(self):
return args, kwargs, x, y, None, None

def data_with_categorical_index_2(self):
index = pd.Index(
["ASampleOne", "ASampleTwo", "ASampleThree", "ASampleFour", "ASampleFive"],
name="index",
dtype=object,
)
values = ["ASampleOne", "ASampleTwo", "ASampleThree", "ASampleFour", "ASampleFive"]
index = pd.Index(values, name="index", dtype=object)

y = pd.DataFrame.from_dict(
{
Expand Down
2 changes: 1 addition & 1 deletion tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def data_nominal_with_quotes(self):

def data_nominal_as_category(self):
data, rel_name, expected = self.data_nominal_with_quotes()
for name, series in data.iteritems():
for name, series in data.items():
data[name] = pd.Categorical(series, ordered=False)

expected[3] = '@attribute attr_nominal_spaces\t{"hard liquor","red wine",mate}\n'
Expand Down
2 changes: 1 addition & 1 deletion tests/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

def _encoded_data(data):
expected = []
for nam, col in data.iteritems():
for nam, col in data.items():
if hasattr(col, "cat"):
for cat in col.cat.categories[1:]:
name = f"{nam}={cat}"
Expand Down

0 comments on commit 4021b27

Please sign in to comment.