From 7a226638b937b39ecdf497eb8d04ee07bf06c1a3 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 11 Jan 2025 07:32:52 -0500 Subject: [PATCH 1/3] TST(string dtype): Resolve xfails in test_from_dummies --- pandas/core/reshape/encoding.py | 17 +++++++++-- pandas/tests/reshape/test_from_dummies.py | 35 +++++++++++++++++++---- 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 33ff182f5baee..d7d6ada27ba0f 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -17,12 +17,14 @@ is_integer_dtype, is_list_like, is_object_dtype, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import ( ArrowDtype, CategoricalDtype, ) +from pandas.core.dtypes.missing import isna from pandas.core.arrays import SparseArray from pandas.core.arrays.categorical import factorize_from_iterable @@ -554,9 +556,20 @@ def from_dummies( "Dummy DataFrame contains multi-assignment(s); " f"First instance in row: {assigned.idxmax()}" ) + dtype = data.columns.dtype if any(assigned == 0): if isinstance(default_category, dict): - cats.append(default_category[prefix]) + value = default_category[prefix] + if ( + is_string_dtype(data.columns.dtype) + and not isinstance(value, str) + and (is_list_like(value) or not isna(value)) + ): + # GH#??? + # `value` is not a string or NA. + # Using data.columns.dtype would coerce `value` into a string. + dtype = "object" + cats.append(value) else: raise ValueError( "Dummy DataFrame contains unassigned value(s); " @@ -567,7 +580,7 @@ def from_dummies( ) else: data_slice = data_to_decode.loc[:, prefix_slice] - cats_array = data._constructor_sliced(cats, dtype=data.columns.dtype) + cats_array = data._constructor_sliced(cats, dtype=dtype) # get indices of True entries along axis=1 true_values = data_slice.idxmax(axis=1) indexer = data_slice.columns.get_indexer_for(true_values) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index da1930323f464..4fb48cd21d428 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -1,8 +1,7 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - +import pandas as pd from pandas import ( DataFrame, Series, @@ -336,8 +335,6 @@ def test_no_prefix_string_cats_default_category( dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) result = from_dummies(dummies, default_category=default_category) expected = DataFrame(expected) - if using_infer_string: - expected[""] = expected[""].astype("str") tm.assert_frame_equal(result, expected) @@ -364,7 +361,6 @@ def test_with_prefix_contains_get_dummies_NaN_column(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "default_category, expected", [ @@ -450,3 +446,32 @@ def test_maintain_original_index(): result = from_dummies(df) expected = DataFrame({"": list("abca")}, index=list("abcd")) tm.assert_frame_equal(result, expected) + + +def test_int_columns_with_float_default(): + # GH#??? + df = DataFrame( + { + 3: [1, 0, 0], + 4: [0, 1, 0], + }, + ) + with pytest.raises(ValueError, match="Trying to coerce float values to integers"): + from_dummies(df, default_category=0.5) + + +def test_object_dtype_preserved(): + # GH#??? + # When the input has object dtype, the result should as + # well even when infer_string is True. + df = DataFrame( + { + "x": [1, 0, 0], + "y": [0, 1, 0], + }, + ) + df.columns = df.columns.astype("object") + with pd.option_context("future.infer_string", True): + result = from_dummies(df, default_category="z") + expected = DataFrame({"": ["x", "y", "z"]}, dtype="object") + tm.assert_frame_equal(result, expected) From 40448cc9ef42358768364087f41d74d27b4d5a95 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 11 Jan 2025 12:50:48 -0500 Subject: [PATCH 2/3] Add GH references --- pandas/core/reshape/encoding.py | 2 +- pandas/tests/reshape/test_from_dummies.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index d7d6ada27ba0f..b6c7dc9d1136e 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -565,7 +565,7 @@ def from_dummies( and not isinstance(value, str) and (is_list_like(value) or not isna(value)) ): - # GH#??? + # https://github.com/pandas-dev/pandas/pull/60694 # `value` is not a string or NA. # Using data.columns.dtype would coerce `value` into a string. dtype = "object" diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 4fb48cd21d428..ef928db329b48 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -449,7 +449,7 @@ def test_maintain_original_index(): def test_int_columns_with_float_default(): - # GH#??? + # https://github.com/pandas-dev/pandas/pull/60694 df = DataFrame( { 3: [1, 0, 0], @@ -461,7 +461,7 @@ def test_int_columns_with_float_default(): def test_object_dtype_preserved(): - # GH#??? + # https://github.com/pandas-dev/pandas/pull/60694 # When the input has object dtype, the result should as # well even when infer_string is True. df = DataFrame( From 35598ee1db7f21368daf43e944dc321e25614773 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 25 Jan 2025 07:12:28 -0500 Subject: [PATCH 3/3] type-hint --- pandas/core/reshape/encoding.py | 7 +++++-- pandas/tests/io/pytables/test_complex.py | 8 +++----- pandas/tests/io/pytables/test_file_handling.py | 6 ++---- pandas/tests/io/pytables/test_timezones.py | 8 +++----- 4 files changed, 13 insertions(+), 16 deletions(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index b6c7dc9d1136e..2d77549dd0955 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -37,7 +37,10 @@ from pandas.core.series import Series if TYPE_CHECKING: - from pandas._typing import NpDtype + from pandas._typing import ( + DtypeObj, + NpDtype, + ) def get_dummies( @@ -556,7 +559,7 @@ def from_dummies( "Dummy DataFrame contains multi-assignment(s); " f"First instance in row: {assigned.idxmax()}" ) - dtype = data.columns.dtype + dtype: str | DtypeObj = data.columns.dtype if any(assigned == 0): if isinstance(default_category, dict): value = default_category[prefix] diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py index d140cfc941e16..c6eb7670f1e73 100644 --- a/pandas/tests/io/pytables/test_complex.py +++ b/pandas/tests/io/pytables/test_complex.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -13,9 +11,9 @@ from pandas.io.pytables import read_hdf -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) +# pytestmark = pytest.mark.xfail( +# using_string_dtype(), reason="TODO(infer_string)", strict=False +# ) def test_complex_fixed(tmp_path, setup_path): diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py index 16c3c6798ff76..9359a18d162c0 100644 --- a/pandas/tests/io/pytables/test_file_handling.py +++ b/pandas/tests/io/pytables/test_file_handling.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import ( PY311, is_ci_environment, @@ -329,7 +327,7 @@ def test_complibs(tmp_path, lvl, lib, request): assert node.filters.complib == lib -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) +# @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.skipif( not is_platform_little_endian(), reason="reason platform is not little endian" ) @@ -347,7 +345,7 @@ def test_encoding(setup_path): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) +# @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "val", [ diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index 8f179f844e4d0..db99f88f0f7ba 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs.timezones import maybe_get_tz import pandas.util._test_decorators as td @@ -25,9 +23,9 @@ ensure_clean_store, ) -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) +# pytestmark = pytest.mark.xfail( +# using_string_dtype(), reason="TODO(infer_string)", strict=False +# ) def _compare_with_tz(a, b):