From 7deee5707e437bcde9dd622374834f84565dd8f0 Mon Sep 17 00:00:00 2001 From: DHRUVA KUMAR KAUSHAL Date: Mon, 1 Dec 2025 21:07:44 +0530 Subject: [PATCH 1/5] mypy fixes --- xarray/core/dataarray.py | 12 ++++++++++-- xarray/core/dataset.py | 23 +++++++++++++++++++---- xarray/tests/test_dataarray.py | 26 ++++++++++++++++++++++++++ xarray/tests/test_dataset.py | 32 ++++++++++++++++++++++++++++++++ 4 files changed, 87 insertions(+), 6 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 6c8d0617038..f8a42c53faf 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3954,7 +3954,10 @@ def to_pandas(self) -> Self | pd.Series | pd.DataFrame: return pandas_object def to_dataframe( - self, name: Hashable | None = None, dim_order: Sequence[Hashable] | None = None + self, + name: Hashable | None = None, + dim_order: Sequence[Hashable] | None = None, + create_index: bool = True, ) -> pd.DataFrame: """Convert this array and its coordinates into a tidy pandas.DataFrame. @@ -3979,6 +3982,11 @@ def to_dataframe( If provided, must include all dimensions of this DataArray. By default, dimensions are sorted according to the DataArray dimensions order. + create_index : bool, default: True + If True (default), create a MultiIndex from the Cartesian product + of this DataArray's indices. If False, use a RangeIndex instead. + This can be useful to avoid the potentially expensive MultiIndex + creation. 
Returns ------- @@ -4013,7 +4021,7 @@ def to_dataframe( else: ordered_dims = ds._normalize_dim_order(dim_order=dim_order) - df = ds._to_dataframe(ordered_dims) + df = ds._to_dataframe(ordered_dims, create_index=create_index) df.columns = [name if c == unique_name else c for c in df.columns] return df diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 9c2c2f60db1..84d5a26ba8d 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7200,7 +7200,7 @@ def to_pandas(self) -> pd.Series | pd.DataFrame: "Please use Dataset.to_dataframe() instead." ) - def _to_dataframe(self, ordered_dims: Mapping[Any, int]): + def _to_dataframe(self, ordered_dims: Mapping[Any, int], create_index: bool = True): from xarray.core.extension_array import PandasExtensionArray # All and only non-index arrays (whether data or coordinates) should @@ -7231,7 +7231,13 @@ def _to_dataframe(self, ordered_dims: Mapping[Any, int]): self._variables[k].set_dims(ordered_dims).values.reshape(-1) for k in non_extension_array_columns ] - index = self.coords.to_index([*ordered_dims]) + if create_index: + index = self.coords.to_index([*ordered_dims]) + else: + # Use a simple RangeIndex when create_index=False + # Calculate the total size from ordered_dims + total_size = np.prod(list(ordered_dims.values())) if ordered_dims else 0 + index = pd.RangeIndex(total_size) broadcasted_df = pd.DataFrame( { **dict(zip(non_extension_array_columns, data, strict=True)), @@ -7259,7 +7265,11 @@ def _to_dataframe(self, ordered_dims: Mapping[Any, int]): broadcasted_df = broadcasted_df.join(extension_array_df) return broadcasted_df[columns_in_order] - def to_dataframe(self, dim_order: Sequence[Hashable] | None = None) -> pd.DataFrame: + def to_dataframe( + self, + dim_order: Sequence[Hashable] | None = None, + create_index: bool = True, + ) -> pd.DataFrame: """Convert this dataset into a pandas.DataFrame. 
Non-index variables in this dataset form the columns of the @@ -7278,6 +7288,11 @@ def to_dataframe(self, dim_order: Sequence[Hashable] | None = None) -> pd.DataFr If provided, must include all dimensions of this dataset. By default, dimensions are in the same order as in `Dataset.sizes`. + create_index : bool, default: True + If True (default), create a MultiIndex from the Cartesian product + of this dataset's indices. If False, use a RangeIndex instead. + This can be useful to avoid the potentially expensive MultiIndex + creation. Returns ------- @@ -7288,7 +7303,7 @@ def to_dataframe(self, dim_order: Sequence[Hashable] | None = None) -> pd.DataFr ordered_dims = self._normalize_dim_order(dim_order=dim_order) - return self._to_dataframe(ordered_dims=ordered_dims) + return self._to_dataframe(ordered_dims=ordered_dims, create_index=create_index) def _set_sparse_data_from_dataframe( self, idx: pd.Index, arrays: list[tuple[Hashable, np.ndarray]], dims: tuple diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 5eec7b8a2fd..2cf75fb58e2 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3553,6 +3553,32 @@ def test_to_dataframe_0length(self) -> None: assert len(actual) == 0 assert_array_equal(actual.index.names, list("ABC")) + def test_to_dataframe_create_index(self) -> None: + # Test create_index parameter + arr_np = np.arange(12).reshape(3, 4) + arr = DataArray(arr_np, [("x", [1, 2, 3]), ("y", list("abcd"))], name="foo") + + # Default behavior: create MultiIndex + df_with_index = arr.to_dataframe() + assert isinstance(df_with_index.index, pd.MultiIndex) + assert df_with_index.index.names == ["x", "y"] + assert len(df_with_index) == 12 + + # With create_index=False: use RangeIndex + df_without_index = arr.to_dataframe(create_index=False) + assert isinstance(df_without_index.index, pd.RangeIndex) + assert len(df_without_index) == 12 + + # Data should be the same regardless + 
assert_array_equal(df_with_index["foo"].values, df_without_index["foo"].values) + + # Test with coords that have different dimensions + arr.coords["z"] = ("x", [-1, -2, -3]) + df_with_coords = arr.to_dataframe(create_index=False) + assert isinstance(df_with_coords.index, pd.RangeIndex) + assert "z" in df_with_coords.columns + assert len(df_with_coords) == 12 + @pytest.mark.parametrize( "x_dtype,y_dtype,v_dtype", [ diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index e677430dfbf..e30fa28bbbb 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -2259,6 +2259,38 @@ def test_to_pandas(self) -> None: with pytest.raises(ValueError, match=r"cannot convert Datasets"): Dataset({"a": (["t", "r"], x2d), "b": (["t", "r"], y2d)}).to_pandas() + def test_to_dataframe_create_index(self) -> None: + # Test create_index parameter for Dataset + x = np.random.randn(3, 4) + y = np.random.randn(3, 4) + ds = Dataset( + {"a": (("x", "y"), x), "b": (("x", "y"), y)}, + coords={"x": [1, 2, 3], "y": list("abcd")}, + ) + + # Default behavior: create MultiIndex + df_with_index = ds.to_dataframe() + assert isinstance(df_with_index.index, pd.MultiIndex) + assert df_with_index.index.names == ["x", "y"] + assert len(df_with_index) == 12 + + # With create_index=False: use RangeIndex + df_without_index = ds.to_dataframe(create_index=False) + assert isinstance(df_without_index.index, pd.RangeIndex) + assert len(df_without_index) == 12 + + # Data should be the same regardless + assert_array_equal(df_with_index["a"].values, df_without_index["a"].values) + assert_array_equal(df_with_index["b"].values, df_without_index["b"].values) + + # Test with dim_order and create_index=False + df_reordered = ds.to_dataframe(dim_order=["y", "x"], create_index=False) + assert isinstance(df_reordered.index, pd.RangeIndex) + assert len(df_reordered) == 12 + # Check that dim_order affects the data ordering + df_reordered_with_idx = ds.to_dataframe(dim_order=["y", "x"]) + 
assert_array_equal(df_reordered["a"].values, df_reordered_with_idx["a"].values) + def test_reindex_like(self) -> None: data = create_test_data() data["letters"] = ("dim3", 10 * ["a"]) From 139a19b73dc8720fb75128c577a1989167b99071 Mon Sep 17 00:00:00 2001 From: DHRUVA KUMAR KAUSHAL Date: Thu, 4 Dec 2025 03:29:44 +0530 Subject: [PATCH 2/5] ruff --- xarray/core/dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 5ba895d4bb0..3c02ef293f5 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7236,7 +7236,9 @@ def _to_dataframe(self, ordered_dims: Mapping[Any, int], create_index: bool = Tr else: # Use a simple RangeIndex when create_index=False # Calculate the total size from ordered_dims - total_size = np.prod(list(ordered_dims.values())) if ordered_dims else 0 + total_size = ( + int(np.prod(list(ordered_dims.values()))) if ordered_dims else 0 + ) index = pd.RangeIndex(total_size) broadcasted_df = pd.DataFrame( { From 843cf7d1311ea4928a54cb5df945386a22c447bc Mon Sep 17 00:00:00 2001 From: DHRUVA KUMAR KAUSHAL Date: Sun, 28 Dec 2025 00:25:00 +0530 Subject: [PATCH 3/5] minor fixes --- doc/whats-new.rst | 7 ++++ xarray/core/dataarray.py | 14 ++++++-- xarray/core/dataset.py | 31 +++++++++++++---- xarray/tests/test_dask.py | 63 ++++++++++++++++++++++++++++++++++ xarray/tests/test_dataarray.py | 54 +++++++++++++++++++++++++++++ 5 files changed, 160 insertions(+), 9 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 23af750060c..1f97508e1b2 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -23,6 +23,13 @@ New Features (:pull:`10849`). By `Stephan Hoyer `_. +- Added ``create_index`` parameter to :py:meth:`Dataset.to_dataframe`, :py:meth:`DataArray.to_dataframe`, + :py:meth:`Dataset.to_dask_dataframe`, and :py:meth:`DataArray.to_dask_dataframe` methods. 
+ With ``create_index=False``, the ``to_dataframe`` methods return a DataFrame with a :py:class:`pandas.RangeIndex` + instead of building a :py:class:`pandas.MultiIndex` from the dimension coordinates, while the ``to_dask_dataframe`` + methods omit the dimension coordinate columns; this avoids work when that index or those columns are not needed. + By `Sanjay Kumar `_. + Breaking Changes ~~~~~~~~~~~~~~~~ diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index c26a3cdf176..132394fa9ad 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3983,8 +3983,8 @@ def to_dataframe( If provided, must include all dimensions of this DataArray. By default, dimensions are sorted according to the DataArray dimensions order. create_index : bool, default: True - If True (default), create a MultiIndex from the Cartesian product - of this DataArray's indices. If False, use a RangeIndex instead. + If True (default), create a :py:class:`pandas.MultiIndex` from the Cartesian product + of this DataArray's indices. If False, use a :py:class:`pandas.RangeIndex` instead. This can be useful to avoid the potentially expensive MultiIndex creation. @@ -7587,6 +7587,7 @@ def to_dask_dataframe( self, dim_order: Sequence[Hashable] | None = None, set_index: bool = False, + create_index: bool = True, ) -> DaskDataFrame: """Convert this array into a dask.dataframe.DataFrame. @@ -7602,6 +7603,13 @@ def to_dask_dataframe( If set_index=True, the dask DataFrame is indexed by this dataset's coordinate. Since dask DataFrames do not support multi-indexes, set_index only works if the dataset only contains one dimension. + create_index : bool, default: True + If ``create_index=False``, the resulting DataFrame will use a + :py:class:`pandas.RangeIndex` instead of setting dimensions as index columns. + This can significantly improve performance when the default index is not needed. + ``create_index=False`` is incompatible with ``set_index=True``. + + .. 
versionadded:: 2025.01.1 Returns ------- @@ -7646,7 +7654,7 @@ def to_dask_dataframe( ) name = self.name ds = self._to_dataset_whole(name, shallow_copy=False) - return ds.to_dask_dataframe(dim_order, set_index) + return ds.to_dask_dataframe(dim_order, set_index, create_index) # this needs to be at the end, or mypy will confuse with `str` # https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 3c02ef293f5..9fe6b50fb52 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7291,8 +7291,8 @@ def to_dataframe( If provided, must include all dimensions of this dataset. By default, dimensions are in the same order as in `Dataset.sizes`. create_index : bool, default: True - If True (default), create a MultiIndex from the Cartesian product - of this dataset's indices. If False, use a RangeIndex instead. + If True (default), create a :py:class:`pandas.MultiIndex` from the Cartesian product + of this dataset's indices. If False, use a :py:class:`pandas.RangeIndex` instead. This can be useful to avoid the potentially expensive MultiIndex creation. @@ -7463,7 +7463,10 @@ def from_dataframe(cls, dataframe: pd.DataFrame, sparse: bool = False) -> Self: return obj[dataframe.columns] if len(dataframe.columns) else obj def to_dask_dataframe( - self, dim_order: Sequence[Hashable] | None = None, set_index: bool = False + self, + dim_order: Sequence[Hashable] | None = None, + set_index: bool = False, + create_index: bool = True, ) -> DaskDataFrame: """ Convert this dataset into a dask.dataframe.DataFrame. @@ -7487,6 +7490,13 @@ def to_dask_dataframe( If set_index=True, the dask DataFrame is indexed by this dataset's coordinate. Since dask DataFrames do not support multi-indexes, set_index only works if the dataset only contains one dimension. 
+ create_index : bool, default: True + If ``create_index=False``, the resulting DataFrame will use a + :py:class:`pandas.RangeIndex` instead of setting dimensions as index columns. + This can significantly improve performance when the default index is not needed. + ``create_index=False`` is incompatible with ``set_index=True``. + + .. versionadded:: 2025.01.1 Returns ------- @@ -7496,11 +7506,20 @@ def to_dask_dataframe( import dask.array as da import dask.dataframe as dd + if not create_index and set_index: + raise ValueError("create_index=False is incompatible with set_index=True") + ordered_dims = self._normalize_dim_order(dim_order=dim_order) - columns = list(ordered_dims) - columns.extend(k for k in self.coords if k not in self.dims) - columns.extend(self.data_vars) + if create_index: + columns = list(ordered_dims) + columns.extend(k for k in self.coords if k not in self.dims) + columns.extend(self.data_vars) + else: + # When create_index=False, exclude dimensions from columns + columns = [] + columns.extend(k for k in self.coords if k not in self.dims) + columns.extend(self.data_vars) ds_chunks = self.chunks diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 2d103994410..d55d5fa5143 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -941,6 +941,69 @@ def test_to_dask_dataframe_dim_order(self): with pytest.raises(ValueError, match=r"does not match the set of dimensions"): ds.to_dask_dataframe(dim_order=["x"]) + def test_to_dask_dataframe_create_index_false(self): + # Test that create_index=False uses RangeIndex instead of dimension columns + x = np.random.randn(10) + y = np.arange(10, dtype="uint8") + t = list("abcdefghij") + + ds = Dataset( + {"a": ("t", da.from_array(x, chunks=4)), "b": ("t", y), "t": ("t", t)} + ) + + # With create_index=False, we should get a RangeIndex and no dimension columns + actual = ds.to_dask_dataframe(create_index=False) + assert isinstance(actual, dd.DataFrame) + actual_computed = 
actual.compute() + + # Check that index is RangeIndex + assert isinstance(actual_computed.index, pd.RangeIndex) + + # Check that dimension columns are not present + assert "t" not in actual_computed.columns + + # Check that data columns are present + assert "a" in actual_computed.columns + assert "b" in actual_computed.columns + + # Verify values are correct + assert_array_equal(actual_computed["a"].values, x) + assert_array_equal(actual_computed["b"].values, y) + + def test_to_dask_dataframe_create_index_incompatible_with_set_index(self): + # Test that create_index=False and set_index=True raises an error + ds = Dataset({"a": ("t", da.from_array([1, 2, 3], chunks=2))}) + + with pytest.raises( + ValueError, + match="create_index=False is incompatible with set_index=True", + ): + ds.to_dask_dataframe(create_index=False, set_index=True) + + def test_to_dask_dataframe_create_index_2D(self): + # Test create_index=False with 2D dataset + w = np.random.randn(2, 3) + ds = Dataset({"w": (("x", "y"), da.from_array(w, chunks=(1, 2)))}) + ds["x"] = ("x", np.array([0, 1], np.int64)) + ds["y"] = ("y", list("abc")) + + actual = ds.to_dask_dataframe(create_index=False) + assert isinstance(actual, dd.DataFrame) + actual_computed = actual.compute() + + # Check that index is RangeIndex + assert isinstance(actual_computed.index, pd.RangeIndex) + + # Check that dimension columns are not present + assert "x" not in actual_computed.columns + assert "y" not in actual_computed.columns + + # Check that data column is present + assert "w" in actual_computed.columns + + # Verify values are correct (flattened) + assert_array_equal(actual_computed["w"].values, w.reshape(-1)) + @pytest.mark.parametrize("method", ["load", "compute"]) def test_dask_kwargs_variable(method): diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 872da934423..bb8568a90df 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3746,6 +3746,60 @@ def 
test_to_dask_dataframe(self) -> None: ): arr.to_dask_dataframe() + def test_to_dask_dataframe_create_index(self) -> None: + # Test create_index parameter for to_dask_dataframe + arr_np = np.arange(3 * 4).reshape(3, 4) + arr = DataArray(arr_np, [("B", [1, 2, 3]), ("A", list("cdef"))], name="foo") + + # With create_index=False, should use RangeIndex + actual = arr.to_dask_dataframe(create_index=False) + actual_computed = actual.compute() + + assert isinstance(actual_computed.index, pd.RangeIndex) + assert "B" not in actual_computed.columns + assert "A" not in actual_computed.columns + assert "foo" in actual_computed.columns + assert_array_equal(actual_computed["foo"].values, arr_np.reshape(-1)) + + # Test incompatibility with set_index=True + with pytest.raises( + ValueError, + match="create_index=False is incompatible with set_index=True", + ): + arr.to_dask_dataframe(create_index=False, set_index=True) + arr_np = np.arange(3 * 4).reshape(3, 4) + arr = DataArray(arr_np, [("B", [1, 2, 3]), ("A", list("cdef"))], name="foo") + expected_s = arr.to_series() + actual = arr.to_dask_dataframe()["foo"] + + assert_array_equal(actual.values, np.asarray(expected_s.values)) + + actual = arr.to_dask_dataframe(dim_order=["A", "B"])["foo"] + assert_array_equal(arr_np.transpose().reshape(-1), actual.values) + + # regression test for coords with different dimensions + + arr.coords["C"] = ("B", [-1, -2, -3]) + expected_df = arr.to_series().to_frame() + expected_df["C"] = [-1] * 4 + [-2] * 4 + [-3] * 4 + expected_df = expected_df[["C", "foo"]] + actual = arr.to_dask_dataframe()[["C", "foo"]] + + assert_array_equal(expected_df.values, np.asarray(actual.values)) + assert_array_equal( + expected_df.columns.values, np.asarray(actual.columns.values) + ) + + with pytest.raises(ValueError, match="does not match the set of dimensions"): + arr.to_dask_dataframe(dim_order=["B", "A", "C"]) + + arr.name = None + with pytest.raises( + ValueError, + match="Cannot convert an unnamed DataArray", + ): 
+ arr.to_dask_dataframe() + def test_to_pandas_name_matches_coordinate(self) -> None: # coordinate with same name as array arr = DataArray([1, 2, 3], dims="x", name="x") From 4bde825de62e7feabc0c639c488dc6ff7fd6d6c3 Mon Sep 17 00:00:00 2001 From: DHRUVA KUMAR KAUSHAL Date: Sun, 28 Dec 2025 00:36:05 +0530 Subject: [PATCH 4/5] tests added --- xarray/tests/test_dask.py | 27 +++++++++++++++++ xarray/tests/test_dataarray.py | 54 ---------------------------------- 2 files changed, 27 insertions(+), 54 deletions(-) diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index d55d5fa5143..f3018e0eb66 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -1004,6 +1004,33 @@ def test_to_dask_dataframe_create_index_2D(self): # Verify values are correct (flattened) assert_array_equal(actual_computed["w"].values, w.reshape(-1)) + def test_to_dask_dataframe_create_index_dataarray(self): + # Test create_index parameter for DataArray.to_dask_dataframe + arr_np = np.arange(3 * 4).reshape(3, 4) + arr = DataArray( + da.from_array(arr_np, chunks=(2, 2)), + [("B", [1, 2, 3]), ("A", list("cdef"))], + name="foo", + ) + + # With create_index=False, should use RangeIndex + actual = arr.to_dask_dataframe(create_index=False) + assert isinstance(actual, dd.DataFrame) + actual_computed = actual.compute() + + assert isinstance(actual_computed.index, pd.RangeIndex) + assert "B" not in actual_computed.columns + assert "A" not in actual_computed.columns + assert "foo" in actual_computed.columns + assert_array_equal(actual_computed["foo"].values, arr_np.reshape(-1)) + + # Test incompatibility with set_index=True + with pytest.raises( + ValueError, + match="create_index=False is incompatible with set_index=True", + ): + arr.to_dask_dataframe(create_index=False, set_index=True) + @pytest.mark.parametrize("method", ["load", "compute"]) def test_dask_kwargs_variable(method): diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 
bb8568a90df..872da934423 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3746,60 +3746,6 @@ def test_to_dask_dataframe(self) -> None: ): arr.to_dask_dataframe() - def test_to_dask_dataframe_create_index(self) -> None: - # Test create_index parameter for to_dask_dataframe - arr_np = np.arange(3 * 4).reshape(3, 4) - arr = DataArray(arr_np, [("B", [1, 2, 3]), ("A", list("cdef"))], name="foo") - - # With create_index=False, should use RangeIndex - actual = arr.to_dask_dataframe(create_index=False) - actual_computed = actual.compute() - - assert isinstance(actual_computed.index, pd.RangeIndex) - assert "B" not in actual_computed.columns - assert "A" not in actual_computed.columns - assert "foo" in actual_computed.columns - assert_array_equal(actual_computed["foo"].values, arr_np.reshape(-1)) - - # Test incompatibility with set_index=True - with pytest.raises( - ValueError, - match="create_index=False is incompatible with set_index=True", - ): - arr.to_dask_dataframe(create_index=False, set_index=True) - arr_np = np.arange(3 * 4).reshape(3, 4) - arr = DataArray(arr_np, [("B", [1, 2, 3]), ("A", list("cdef"))], name="foo") - expected_s = arr.to_series() - actual = arr.to_dask_dataframe()["foo"] - - assert_array_equal(actual.values, np.asarray(expected_s.values)) - - actual = arr.to_dask_dataframe(dim_order=["A", "B"])["foo"] - assert_array_equal(arr_np.transpose().reshape(-1), actual.values) - - # regression test for coords with different dimensions - - arr.coords["C"] = ("B", [-1, -2, -3]) - expected_df = arr.to_series().to_frame() - expected_df["C"] = [-1] * 4 + [-2] * 4 + [-3] * 4 - expected_df = expected_df[["C", "foo"]] - actual = arr.to_dask_dataframe()[["C", "foo"]] - - assert_array_equal(expected_df.values, np.asarray(actual.values)) - assert_array_equal( - expected_df.columns.values, np.asarray(actual.columns.values) - ) - - with pytest.raises(ValueError, match="does not match the set of dimensions"): - 
arr.to_dask_dataframe(dim_order=["B", "A", "C"]) - - arr.name = None - with pytest.raises( - ValueError, - match="Cannot convert an unnamed DataArray", - ): - arr.to_dask_dataframe() - def test_to_pandas_name_matches_coordinate(self) -> None: # coordinate with same name as array arr = DataArray([1, 2, 3], dims="x", name="x") From 8bfee0db268ddb85cc70df3084e6418ecda1ded3 Mon Sep 17 00:00:00 2001 From: DHRUVA KUMAR KAUSHAL Date: Mon, 29 Dec 2025 11:00:46 +0530 Subject: [PATCH 5/5] dimensions & index resolved --- xarray/core/dataarray.py | 9 +++++---- xarray/core/dataset.py | 21 ++++++++++++--------- xarray/tests/test_dask.py | 6 +++--- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 132394fa9ad..cfe8e6f6de8 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -7604,10 +7604,11 @@ def to_dask_dataframe( coordinate. Since dask DataFrames do not support multi-indexes, set_index only works if the dataset only contains one dimension. create_index : bool, default: True - If ``create_index=False``, the resulting DataFrame will use a - :py:class:`pandas.RangeIndex` instead of setting dimensions as index columns. - This can significantly improve performance when the default index is not needed. - ``create_index=False`` is incompatible with ``set_index=True``. + If ``create_index=True`` (default), dimension coordinates will be included + as columns in the resulting DataFrame. If ``create_index=False``, dimension + coordinates will be excluded, leaving only data variables and non-dimension + coordinates. This can improve performance and reduce memory usage when dimension + information is not needed. ``create_index=False`` is incompatible with ``set_index=True``. .. 
versionadded:: 2025.01.1 diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 9fe6b50fb52..0a15cf84ee5 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7491,10 +7491,11 @@ def to_dask_dataframe( coordinate. Since dask DataFrames do not support multi-indexes, set_index only works if the dataset only contains one dimension. create_index : bool, default: True - If ``create_index=False``, the resulting DataFrame will use a - :py:class:`pandas.RangeIndex` instead of setting dimensions as index columns. - This can significantly improve performance when the default index is not needed. - ``create_index=False`` is incompatible with ``set_index=True``. + If ``create_index=True`` (default), dimension coordinates will be included + as columns in the resulting DataFrame. If ``create_index=False``, dimension + coordinates will be excluded, leaving only data variables and non-dimension + coordinates. This can improve performance and reduce memory usage when dimension + information is not needed. ``create_index=False`` is incompatible with ``set_index=True``. .. 
versionadded:: 2025.01.1 @@ -7511,15 +7512,17 @@ def to_dask_dataframe( ordered_dims = self._normalize_dim_order(dim_order=dim_order) + # Build column list based on create_index if create_index: + # Include dimension coordinates as columns columns = list(ordered_dims) - columns.extend(k for k in self.coords if k not in self.dims) - columns.extend(self.data_vars) else: - # When create_index=False, exclude dimensions from columns + # Exclude dimension coordinates columns = [] - columns.extend(k for k in self.coords if k not in self.dims) - columns.extend(self.data_vars) + + # Always include non-dimension coordinates and data variables + columns.extend(k for k in self.coords if k not in self.dims) + columns.extend(self.data_vars) ds_chunks = self.chunks diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index f3018e0eb66..21d6f8249aa 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -942,7 +942,7 @@ def test_to_dask_dataframe_dim_order(self): ds.to_dask_dataframe(dim_order=["x"]) def test_to_dask_dataframe_create_index_false(self): - # Test that create_index=False uses RangeIndex instead of dimension columns + # Test that create_index=False excludes dimension columns x = np.random.randn(10) y = np.arange(10, dtype="uint8") t = list("abcdefghij") @@ -951,7 +951,7 @@ def test_to_dask_dataframe_create_index_false(self): {"a": ("t", da.from_array(x, chunks=4)), "b": ("t", y), "t": ("t", t)} ) - # With create_index=False, we should get a RangeIndex and no dimension columns + # With create_index=False, dimension columns should be excluded actual = ds.to_dask_dataframe(create_index=False) assert isinstance(actual, dd.DataFrame) actual_computed = actual.compute() @@ -959,7 +959,7 @@ def test_to_dask_dataframe_create_index_false(self): # Check that index is RangeIndex assert isinstance(actual_computed.index, pd.RangeIndex) - # Check that dimension columns are not present + # Check that dimension columns are NOT present assert "t" not in 
actual_computed.columns # Check that data columns are present