Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TST(string dtype): Resolve xfails in test_from_dummies #60694

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 19 additions & 3 deletions pandas/core/reshape/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,14 @@
is_integer_dtype,
is_list_like,
is_object_dtype,
is_string_dtype,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
ArrowDtype,
CategoricalDtype,
)
from pandas.core.dtypes.missing import isna

from pandas.core.arrays import SparseArray
from pandas.core.arrays.categorical import factorize_from_iterable
Expand All @@ -35,7 +37,10 @@
from pandas.core.series import Series

if TYPE_CHECKING:
from pandas._typing import NpDtype
from pandas._typing import (
DtypeObj,
NpDtype,
)


def get_dummies(
Expand Down Expand Up @@ -554,9 +559,20 @@ def from_dummies(
"Dummy DataFrame contains multi-assignment(s); "
f"First instance in row: {assigned.idxmax()}"
)
dtype: str | DtypeObj = data.columns.dtype
if any(assigned == 0):
if isinstance(default_category, dict):
cats.append(default_category[prefix])
value = default_category[prefix]
if (
is_string_dtype(data.columns.dtype)
and not isinstance(value, str)
and (is_list_like(value) or not isna(value))
):
# https://github.com/pandas-dev/pandas/pull/60694
# `value` is not a string or NA.
# Using data.columns.dtype would coerce `value` into a string.
dtype = "object"
cats.append(value)
else:
raise ValueError(
"Dummy DataFrame contains unassigned value(s); "
Expand All @@ -567,7 +583,7 @@ def from_dummies(
)
else:
data_slice = data_to_decode.loc[:, prefix_slice]
cats_array = data._constructor_sliced(cats, dtype=data.columns.dtype)
cats_array = data._constructor_sliced(cats, dtype=dtype)
# get indices of True entries along axis=1
true_values = data_slice.idxmax(axis=1)
indexer = data_slice.columns.get_indexer_for(true_values)
Expand Down
8 changes: 3 additions & 5 deletions pandas/tests/io/pytables/test_complex.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
from pandas import (
DataFrame,
Expand All @@ -13,9 +11,9 @@

from pandas.io.pytables import read_hdf

pytestmark = pytest.mark.xfail(
using_string_dtype(), reason="TODO(infer_string)", strict=False
)
# pytestmark = pytest.mark.xfail(
# using_string_dtype(), reason="TODO(infer_string)", strict=False
# )


def test_complex_fixed(tmp_path, setup_path):
Expand Down
6 changes: 2 additions & 4 deletions pandas/tests/io/pytables/test_file_handling.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat import (
PY311,
is_ci_environment,
Expand Down Expand Up @@ -329,7 +327,7 @@ def test_complibs(tmp_path, lvl, lib, request):
assert node.filters.complib == lib


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
# @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.skipif(
not is_platform_little_endian(), reason="reason platform is not little endian"
)
Expand All @@ -347,7 +345,7 @@ def test_encoding(setup_path):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
# @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize(
"val",
[
Expand Down
8 changes: 3 additions & 5 deletions pandas/tests/io/pytables/test_timezones.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas._libs.tslibs.timezones import maybe_get_tz
import pandas.util._test_decorators as td

Expand All @@ -25,9 +23,9 @@
ensure_clean_store,
)

pytestmark = pytest.mark.xfail(
using_string_dtype(), reason="TODO(infer_string)", strict=False
)
# pytestmark = pytest.mark.xfail(
# using_string_dtype(), reason="TODO(infer_string)", strict=False
# )


def _compare_with_tz(a, b):
Expand Down
35 changes: 30 additions & 5 deletions pandas/tests/reshape/test_from_dummies.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
from pandas import (
DataFrame,
Series,
Expand Down Expand Up @@ -336,8 +335,6 @@ def test_no_prefix_string_cats_default_category(
dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]})
result = from_dummies(dummies, default_category=default_category)
expected = DataFrame(expected)
if using_infer_string:
expected[""] = expected[""].astype("str")
tm.assert_frame_equal(result, expected)


Expand All @@ -364,7 +361,6 @@ def test_with_prefix_contains_get_dummies_NaN_column():
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize(
"default_category, expected",
[
Expand Down Expand Up @@ -450,3 +446,32 @@ def test_maintain_original_index():
result = from_dummies(df)
expected = DataFrame({"": list("abca")}, index=list("abcd"))
tm.assert_frame_equal(result, expected)


def test_int_columns_with_float_default():
    """Check that from_dummies raises for integer column labels with a float default."""
    # https://github.com/pandas-dev/pandas/pull/60694
    dummies = DataFrame({3: [1, 0, 0], 4: [0, 1, 0]})
    # The last row has no column set, so the default category is needed.
    expected_msg = "Trying to coerce float values to integers"
    with pytest.raises(ValueError, match=expected_msg):
        from_dummies(dummies, default_category=0.5)


def test_object_dtype_preserved():
# https://github.com/pandas-dev/pandas/pull/60694
# When the input columns have object dtype, the result should have
# object dtype as well, even when infer_string is True.
# NOTE(review): leading indentation was lost in this view, so it cannot be
# determined here which of the statements following the `with` line belong
# to the option_context block - confirm against the real file.
df = DataFrame(
{
"x": [1, 0, 0],
"y": [0, 1, 0],
},
)
# Explicitly cast the column labels to object dtype before decoding.
df.columns = df.columns.astype("object")
# Decode with the "future.infer_string" option switched on.
with pd.option_context("future.infer_string", True):
# The third row has no column set, so it takes default_category ("z").
result = from_dummies(df, default_category="z")
expected = DataFrame({"": ["x", "y", "z"]}, dtype="object")
tm.assert_frame_equal(result, expected)
Loading