Merged
Commits (30)
68d5c73
Support NETCDF4_CLASSIC in the h5engine backend
huard Sep 2, 2025
120af07
convert bytes attributes to numpy.bytes_ in NETCDF4_CLASSIC format wi…
huard Sep 2, 2025
f8f44f0
added test to confirm string attributes are stored as numpy char arra…
huard Sep 2, 2025
522d37d
Added change to whats-new
huard Sep 3, 2025
783c407
run pre-commit
huard Sep 3, 2025
2f1c781
Added test comparing CDL representation of test data written with net…
huard Sep 3, 2025
b142d38
Added global attribute to test data. Apply CLASSIC conversion to vari…
huard Sep 3, 2025
b14c373
Merge branch 'main' into fix_10676
huard Sep 3, 2025
909a96c
Use h5dump to compare file content instead of CDL. Add _nc3_strict at…
huard Sep 3, 2025
14d22a1
Merge branch 'fix_10676' of github.com:Ouranosinc/xarray into fix_10676
huard Sep 3, 2025
35b50ce
raise error if writing groups to CLASSIC file.
huard Sep 3, 2025
a187c8d
remove h5dump test. Remove _nc3_strict attribute (should go into h5ne…
huard Sep 5, 2025
0c1bc08
fix h5netcdf version check.
huard Sep 8, 2025
10707ee
Merge branch 'main' into fix_10676
huard Sep 8, 2025
03ea2de
try to fix tests
huard Sep 8, 2025
592b98b
Set default format to NETCDF4 instead of None, because passing None t…
huard Sep 8, 2025
13b60d0
Apply suggestions from code review
huard Sep 9, 2025
cf8b4be
Suggestions from review.
huard Sep 9, 2025
d0b0948
Merge branch 'fix_10676' of github.com:Ouranosinc/xarray into fix_10676
huard Sep 9, 2025
6351e66
Merge branch 'main' into fix_10676
huard Sep 22, 2025
b2a21d2
Raise an error within `get_child_store` rather that `__init__` with f…
huard Sep 22, 2025
50dbd36
Merge branch 'main' into fix_10676
huard Sep 23, 2025
6d686d4
Merge branch 'main' into fix_10676
huard Oct 2, 2025
18e4bd0
Merge branch 'main' into fix_10676
kmuehlbauer Oct 14, 2025
d9c4879
Remove casting of bytes to np._bytes_. Require h5netcdf 1.7.0 to save…
huard Oct 14, 2025
99c90a5
Merge branch 'main' into fix_10676
kmuehlbauer Oct 15, 2025
28bd354
Apply suggestions from code review
huard Oct 15, 2025
f2a29ca
Update xarray/backends/h5netcdf_.py
huard Oct 15, 2025
f1c1819
Update xarray/backends/h5netcdf_.py
huard Oct 15, 2025
88d0841
Merge branch 'main' into fix_10676
kmuehlbauer Oct 15, 2025
2 changes: 2 additions & 0 deletions doc/whats-new.rst
@@ -285,6 +285,8 @@ New Features
- ``compute=False`` is now supported by :py:meth:`DataTree.to_netcdf` and
:py:meth:`DataTree.to_zarr`.
By `Stephan Hoyer <https://github.com/shoyer>`_.
- The ``h5netcdf`` engine has support for pseudo ``NETCDF4_CLASSIC`` files, meaning variables and attributes are cast to supported types. Note that the saved files won't be recognized as genuine ``NETCDF4_CLASSIC`` files until ``h5netcdf`` adds support. (:issue:`10676`, :pull:`10686`).
By `David Huard <https://github.com/huard>`_.
- ``open_dataset`` will now correctly infer a path ending in ``.zarr/`` as zarr
By `Ian Hunt-Isaak <https://github.com/ianhi>`_.

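For illustration of the ``NETCDF4_CLASSIC`` entry above, a minimal usage sketch (the file name is made up; this assumes an xarray build containing this change and a sufficiently recent h5netcdf):

    import xarray as xr

    ds = xr.Dataset({"temp": ("x", [1, 2, 3])}, attrs={"title": "example"})
    # Write a pseudo NETCDF4_CLASSIC file through the h5netcdf engine.
    ds.to_netcdf("example.nc", engine="h5netcdf", format="NETCDF4_CLASSIC")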
56 changes: 45 additions & 11 deletions xarray/backends/h5netcdf_.py
@@ -7,6 +7,7 @@
from typing import TYPE_CHECKING, Any, Self

import numpy as np
from packaging.version import Version

from xarray.backends.common import (
BACKEND_ENTRYPOINTS,
@@ -27,6 +28,7 @@
PickleableFileManager,
)
from xarray.backends.locks import HDF5_LOCK, combine_locks, ensure_lock, get_write_lock
from xarray.backends.netcdf3 import encode_nc3_attr_value, encode_nc3_variable
from xarray.backends.netCDF4_ import (
BaseNetCDF4Array,
_build_and_get_enum,
@@ -124,6 +126,7 @@ def __init__(
manager: FileManager | h5netcdf.File | h5netcdf.Group,
group=None,
mode=None,
format="NETCDF4",
lock=HDF5_LOCK,
autoclose=False,
):
@@ -143,7 +146,7 @@
self._manager = manager
self._group = group
self._mode = mode
self.format = None
self.format = format or "NETCDF4"
# todo: utilizing find_root_and_group seems a bit clunky
# making filename available on h5netcdf.Group seems better
self._filename = find_root_and_group(self.ds)[0].filename
@@ -152,6 +155,9 @@
self.autoclose = autoclose

def get_child_store(self, group: str) -> Self:
if self.format == "NETCDF4_CLASSIC":
raise ValueError("Cannot create sub-groups in `NETCDF4_CLASSIC` format.")

if self._group is not None:
group = os.path.join(self._group, group)
return type(self)(
@@ -167,7 +173,7 @@ def open(
cls,
filename,
mode="r",
format=None,
format="NETCDF4",
group=None,
lock=None,
autoclose=False,
@@ -198,8 +204,8 @@
f"{magic_number!r} is not the signature of a valid netCDF4 file"
)

if format not in [None, "NETCDF4"]:
raise ValueError("invalid format for h5netcdf backend")
if format not in [None, "NETCDF4", "NETCDF4_CLASSIC"]:
raise ValueError(f"invalid format for h5netcdf backend: {format}")

kwargs = {
"invalid_netcdf": invalid_netcdf,
@@ -210,6 +216,8 @@
kwargs.update(driver_kwds)
if phony_dims is not None:
kwargs["phony_dims"] = phony_dims
if format is not None and Version(h5netcdf.__version__) > Version("1.6.4"):
kwargs["format"] = format

if lock is None:
if mode == "r":
@@ -223,7 +231,15 @@
else PickleableFileManager
)
manager = manager_cls(h5netcdf.File, filename, mode=mode, kwargs=kwargs)
return cls(manager, group=group, mode=mode, lock=lock, autoclose=autoclose)

return cls(
manager,
group=group,
format=format,
mode=mode,
lock=lock,
autoclose=autoclose,
)

def _acquire(self, needs_lock=True):
with self._manager.acquire_context(needs_lock) as root:
@@ -319,11 +335,27 @@ def set_dimension(self, name, length, is_unlimited=False):
else:
self.ds.dimensions[name] = length

def convert_string(self, value):
"""If format is NETCDF4_CLASSIC, convert strings to fixed width char
arrays to ensure they can be read by legacy software.

CLASSIC attributes are read by third party software as fixed width char arrays
"""
if self.format == "NETCDF4_CLASSIC":
value = encode_nc3_attr_value(value)
if isinstance(value, bytes):
value = np.bytes_(value)
Member: Why this special logic only for converting bytes? This seems unrelated to what we need for NETCDF4_CLASSIC.

Contributor Author: To make sure strings are written as NC_CHAR, and not NC_STRING. See https://engee.com/helpcenter/stable/en/julia/NetCDF/strings.html

This is in fact the detail that our third-party software in C++ choked on. The netCDF C library has both nc_get_att_text and nc_get_att_string functions; calling nc_get_att_text on an NC_STRING attribute raises an error.
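For illustration, a rough sketch of how the stored attribute type could be inspected from Python (the file name "classic.nc" and attribute name "foo" are made up; this assumes h5py 3.x, where fixed-width string attributes come back as NumPy bytes scalars and variable-length ones as Python str):

    import h5py
    import numpy as np

    with h5py.File("classic.nc", "r") as f:
        value = f.attrs["foo"]
        if isinstance(value, np.bytes_):
            # Fixed-width char storage (what NC_CHAR maps to), e.g. dtype("S3")
            print("fixed-width:", value.dtype)
        else:
            # Variable-length string storage (what NC_STRING maps to)
            print("variable-length:", type(value))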

Contributor: @huard I'm just going over this again. I'm on board with @shoyer here.

Even without this addition:

    if isinstance(value, bytes):
        value = np.bytes_(value)
ds = xr.Dataset(
    data_vars=dict(temp=("x", [1, 2, 3])),
    coords=dict(x=[0, 1, 2]),
    attrs=dict(
        plain_bytes=b"hello",
        numpy_bytes=np.bytes_(b"hello"),
    ),
)

the dataset above encodes properly with both engines to fixed-size strings (aka NC_CHAR):

ATTRIBUTE "numpy_bytes" {
      DATATYPE  H5T_STRING {
         STRSIZE 5;
         STRPAD H5T_STR_NULLTERM;
         CSET H5T_CSET_ASCII;
         CTYPE H5T_C_S1;
      }
      DATASPACE  SCALAR
      DATA {
      (0): "hello"
      }
   }
   ATTRIBUTE "plain_bytes" {
      DATATYPE  H5T_STRING {
         STRSIZE 5;
         STRPAD H5T_STR_NULLTERM;
         CSET H5T_CSET_ASCII;
         CTYPE H5T_C_S1;
      }
      DATASPACE  SCALAR
      DATA {
      (0): "hello"
      }
   }

Contributor: I think removing the helper function and just conditionally running encode_nc3_attr_value(value) should be enough.
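A minimal sketch of what that suggestion could look like (names follow the diff above; this is the reviewer's proposed shape, not necessarily the merged code):

    from xarray.backends.netcdf3 import encode_nc3_attr_value

    def set_attribute(self, key, value):
        # Method sketch: coerce attribute values only when targeting the
        # pseudo-CLASSIC format, without a separate convert_string helper.
        if self.format == "NETCDF4_CLASSIC":
            value = encode_nc3_attr_value(value)
        self.ds.attrs[key] = value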

Contributor Author: When I remove the cast and test with h5netcdf v1.6.4, plain_bytes is saved as a variable-length string.

Is the plan to pin h5netcdf >= 1.7 for the next xarray release?

Contributor: @huard, you would need to test against h5netcdf main. Isn't the check for > 1.6.4?

Contributor: No, h5netcdf is not pinned at all. But we have the version check in place. So all good, or do I miss something?
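For reference, the version check being referred to is the one added in the diff above; a condensed sketch (the format value here is just an example):

    import h5netcdf
    from packaging.version import Version

    fmt = "NETCDF4_CLASSIC"  # example value
    kwargs = {}
    # Forward the format keyword to h5netcdf.File only when the installed
    # h5netcdf is newer than 1.6.4 and therefore understands it.
    if fmt is not None and Version(h5netcdf.__version__) > Version("1.6.4"):
        kwargs["format"] = fmt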

Contributor Author: My objective was to have some basic CLASSIC functionality working with older h5netcdf releases. If we keep this line in, xarray is able to save CLASSIC "passing" files even without h5netcdf's main branch.

Contributor Author: xarray (this branch) / h5netcdf 1.6.4 -> CLASSIC files won't be recognized as such by the netCDF library, but there is a fair chance third-party applications won't choke.

xarray (this branch minus the bytes_ cast) / h5netcdf 1.6.4 -> third-party apps are likely to crash when reading attributes.

xarray (this branch minus the bytes_ cast) / h5netcdf 1.7.0 -> fully compliant NETCDF4_CLASSIC format.

return value

def set_attribute(self, key, value):
value = self.convert_string(value)
self.ds.attrs[key] = value

def encode_variable(self, variable, name=None):
return _encode_nc4_variable(variable, name=name)
if self.format == "NETCDF4_CLASSIC":
return encode_nc3_variable(variable, name=name)
else:
return _encode_nc4_variable(variable, name=name)

def prepare_variable(
self, name, variable, check_encoding=False, unlimited_dims=None
@@ -332,7 +364,9 @@

_ensure_no_forward_slash_in_name(name)
attrs = variable.attrs.copy()
dtype = _get_datatype(variable, raise_on_invalid_encoding=check_encoding)
dtype = _get_datatype(
variable, nc_format=self.format, raise_on_invalid_encoding=check_encoding
)

fillvalue = attrs.pop("_FillValue", None)

@@ -394,7 +428,7 @@ def prepare_variable(
nc4_var = self.ds[name]

for k, v in attrs.items():
nc4_var.attrs[k] = v
nc4_var.attrs[k] = self.convert_string(v)

target = H5NetCDFArrayWrapper(name, self)

@@ -484,7 +518,7 @@ def open_dataset(
drop_variables: str | Iterable[str] | None = None,
use_cftime=None,
decode_timedelta=None,
format=None,
format="NETCDF4",
group=None,
lock=None,
invalid_netcdf=None,
@@ -544,7 +578,7 @@ def open_datatree(
drop_variables: str | Iterable[str] | None = None,
use_cftime=None,
decode_timedelta=None,
format=None,
format="NETCDF4",
group: str | None = None,
lock=None,
invalid_netcdf=None,
@@ -587,7 +621,7 @@ def open_groups_as_dict(
drop_variables: str | Iterable[str] | None = None,
use_cftime=None,
decode_timedelta=None,
format=None,
format="NETCDF4",
group: str | None = None,
lock=None,
invalid_netcdf=None,
49 changes: 49 additions & 0 deletions xarray/tests/test_backends.py
@@ -460,6 +460,7 @@ def roundtrip(
save_kwargs = {}
if open_kwargs is None:
open_kwargs = {}

with create_tmp_file(allow_cleanup_failure=allow_cleanup_failure) as path:
self.save(data, path, **save_kwargs)
with self.open(path, **open_kwargs) as ds:
@@ -4735,6 +4736,54 @@ def create_store(self):
) as store:
yield store

@requires_h5netcdf
def test_string_attributes_stored_as_char(self, tmp_path):
import h5netcdf

original = Dataset(attrs={"foo": "bar"})
store_path = tmp_path / "tmp.nc"
original.to_netcdf(store_path, engine=self.engine, format=self.file_format)
with h5netcdf.File(store_path, "r") as ds:
# Check that the attribute is stored as a char array
assert ds._h5file.attrs["foo"].dtype == np.dtype("S3")
Comment on lines +4742 to +4750

Member: NumPy's S dtype actually corresponds to bytes, not str. I don't think we want to use it for storing attributes in general.

Contributor Author: Using fixed-width chars replicates the behavior of the netCDF4 backend for the CLASSIC format. Again, this has to do with NC_CHAR vs NC_STRING.

Sticking as close as possible to the netCDF4 output increases my confidence that the h5netcdf outputs will be compatible with third-party software expecting the CLASSIC format.
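That cross-engine intent is roughly what the test added below exercises; a standalone sketch of the same idea (file names are made up; assumes both netCDF4 and h5netcdf are installed):

    import xarray as xr

    ds = xr.Dataset(attrs={"foo": "bar"})
    # Write the same dataset in NETCDF4_CLASSIC format with both engines
    # and check the round-trips are identical.
    ds.to_netcdf("via_netcdf4.nc", engine="netcdf4", format="NETCDF4_CLASSIC")
    ds.to_netcdf("via_h5netcdf.nc", engine="h5netcdf", format="NETCDF4_CLASSIC")
    with xr.open_dataset("via_netcdf4.nc") as a, xr.open_dataset("via_h5netcdf.nc") as b:
        xr.testing.assert_identical(a, b)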



@requires_h5netcdf
class TestNetCDF4ClassicViaH5NetCDFData(TestNetCDF4ClassicViaNetCDF4Data):
engine: T_NetcdfEngine = "h5netcdf"
file_format: T_NetcdfTypes = "NETCDF4_CLASSIC"

@contextlib.contextmanager
def create_store(self):
with create_tmp_file() as tmp_file:
with backends.H5NetCDFStore.open(
tmp_file, mode="w", format="NETCDF4_CLASSIC"
) as store:
yield store

@requires_netCDF4
def test_cross_engine_read_write_netcdf4(self) -> None:
# Drop dim3, because its labels include strings. These appear to be
# not properly read with python-netCDF4, which converts them into
# unicode instead of leaving them as bytes.
data = create_test_data().drop_vars("dim3")
data.attrs["foo"] = "bar"
valid_engines: list[T_NetcdfEngine] = ["netcdf4", "h5netcdf"]
for write_engine in valid_engines:
with create_tmp_file() as tmp_file:
data.to_netcdf(tmp_file, engine=write_engine, format=self.file_format)
for read_engine in valid_engines:
with open_dataset(tmp_file, engine=read_engine) as actual:
assert_identical(data, actual)

def test_group_fails(self):
# Check writing group data fails with CLASSIC format
original = create_test_data()
with pytest.raises(
ValueError, match=r"Cannot create sub-groups in `NETCDF4_CLASSIC` format."
):
original.to_netcdf(group="sub", format=self.file_format, engine=self.engine)


@requires_scipy_or_netCDF4
class TestGenericNetCDFData(NetCDF3Only, CFEncodedBase):