diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 0f7be8cfbcb68..de5d332658e93 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -210,6 +210,7 @@ Other enhancements - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support Python's new-style format strings (e.g., ``"{:.6f}"``) for the ``float_format`` parameter, in addition to old-style ``%`` format strings and callables. This allows for more flexible and modern formatting of floating point numbers when exporting to CSV. (:issue:`49580`) - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`) - :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`) +- :meth:`Series.corr`, :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith` with ``method="kendall"`` and ``method="spearman"`` now work with ordered categorical data types (:issue:`60306`) - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`) - :meth:`Series.map` now accepts an ``engine`` parameter to allow execution with a third-party execution engine (:issue:`61125`) - :meth:`Series.rank` and :meth:`DataFrame.rank` with numpy-nullable dtypes preserve ``NA`` values and return ``UInt64`` dtype where appropriate instead of casting ``NA`` to ``NaN`` with ``float64`` dtype (:issue:`62043`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ab667d92b7e04..4acf15aa92744 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -115,6 +115,7 @@ from pandas.core.dtypes.dtypes import ( ArrowDtype, BaseMaskedDtype, + CategoricalDtype, ExtensionDtype, ) from 
def _transform_ord_cat_cols_to_coded_cols(self) -> DataFrame:
    """
    Replace every ordered categorical column with its integer codes.

    Rank-based correlations (``"spearman"``, ``"kendall"``) only need the
    order of the values, which for an ordered categorical is exactly the
    order of its category codes. Missing values (code ``-1``) are mapped to
    ``NaN`` so they are treated as missing rather than as the smallest rank.

    Columns are addressed by position, so duplicated and MultiIndex column
    labels need no special handling.

    Returns
    -------
    DataFrame
        ``self`` unchanged when there is no ordered categorical column;
        otherwise a shallow copy in which the ordered categorical columns
        hold their codes and all other columns are left untouched.
    """
    ordered_positions = [
        pos
        for pos, dtype in enumerate(self.dtypes)
        if isinstance(dtype, CategoricalDtype) and dtype.ordered
    ]
    if not ordered_positions:
        # Nothing to convert: avoid the copy altogether.
        return self

    data = self.copy(deep=False)
    for pos in ordered_positions:
        codes = data.iloc[:, pos].cat.codes
        # isetitem swaps in a new column, so the caller's frame keeps its
        # categorical data even though ``data`` is only a shallow copy.
        data.isetitem(pos, codes.replace(-1, np.nan))
    return data
@pytest.mark.parametrize("method", ["kendall", "spearman"])
def test_corr_rank_ordered_categorical(self, method):
    # GH#60306: DataFrame.corr with a rank-based method must agree with the
    # pairwise Series.corr result when ordered categoricals are involved.
    # Skip the same way the sibling corrwith/Series tests do.
    pytest.importorskip("scipy")
    df = DataFrame(
        {
            "ord_cat": Series(
                pd.Categorical(
                    ["low", "m", "h", "vh"],
                    categories=["low", "m", "h", "vh"],
                    ordered=True,
                )
            ),
            # Missing category value -> code -1 -> must be treated as NaN.
            "ord_cat_none": Series(
                pd.Categorical(
                    ["low", "m", "h", None],
                    categories=["low", "m", "h"],
                    ordered=True,
                )
            ),
            "ord_int": Series([0, 1, 2, 3]),
            "ord_float": Series([2.0, 3.0, 4.5, 6.5]),
            "ord_float_nan": Series([2.0, 3.0, 4.5, np.nan]),
            "ord_cat_shuff": Series(
                pd.Categorical(
                    ["m", "h", "vh", "low"],
                    categories=["low", "m", "h", "vh"],
                    ordered=True,
                )
            ),
            "ord_int_shuff": Series([2, 3, 0, 1]),
        }
    )
    result = df.corr(method=method)
    for col1, col2 in combinations(df.columns, r=2):
        expected = df[col1].corr(df[col2], method=method)
        # Single label-based lookup (row col2, column col1) instead of
        # chained ``result[col1][col2]`` indexing.
        tm.assert_almost_equal(result.loc[col2, col1], expected)
@pytest.mark.parametrize("method", ["kendall", "spearman"])
def test_corr_rank_ordered_categorical(self, method):
    # GH#60306: Series.corr with a rank-based method treats an ordered
    # categorical as its category codes (missing values become NaN), so the
    # result must match scipy applied to those codes.
    stats = pytest.importorskip("scipy.stats")
    scipy_corr = {"kendall": stats.kendalltau, "spearman": stats.spearmanr}[method]

    def as_codes(ser):
        # Reference transformation: ordered categorical -> float codes/NaN.
        if isinstance(ser.dtype, pd.CategoricalDtype):
            return ser.cat.codes.replace(-1, np.nan)
        return ser

    ser_ord_cat = Series(
        pd.Categorical(
            ["low", "med", "high", "very_high"],
            categories=["low", "med", "high", "very_high"],
            ordered=True,
        )
    )
    ser_ord_cat_shuff = Series(
        pd.Categorical(
            ["high", "low", "very_high", "med"],
            categories=["low", "med", "high", "very_high"],
            ordered=True,
        )
    )
    ser_ord_cat_with_nan = Series(
        pd.Categorical(
            ["h", "low", "vh", None, "m"],
            categories=["low", "m", "h", "vh"],
            ordered=True,
        )
    )
    ser_ord_int = Series([0, 1, 2, 3])
    ser_ord_float = Series([2.0, 3.0, 4.5, 6.5])
    # The None makes this a float Series holding NaN, not an integer one.
    ser_num_with_nan = Series([2, 0, 1, 3, None])

    pairs = [
        (ser_ord_cat, ser_ord_int),
        (ser_ord_cat, ser_ord_float),
        (ser_ord_cat, ser_ord_cat),
        (ser_ord_cat_shuff, ser_ord_cat),
        (ser_ord_cat_shuff, ser_ord_cat_shuff),
        (ser_ord_cat_with_nan, ser_num_with_nan),
    ]
    for left, right in pairs:
        result = left.corr(right, method=method)
        expected = scipy_corr(as_codes(left), as_codes(right), nan_policy="omit")[0]
        tm.assert_almost_equal(result, expected)