diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 3f59657c0b6..b981ae9427c 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -285,6 +285,8 @@ New Features - ``compute=False`` is now supported by :py:meth:`DataTree.to_netcdf` and :py:meth:`DataTree.to_zarr`. By `Stephan Hoyer `_. +- The ``h5netcdf`` engine has support for pseudo ``NETCDF4_CLASSIC`` files, meaning variables and attributes are cast to supported types. Note that the saved files won't be recognized as genuine ``NETCDF4_CLASSIC`` files until ``h5netcdf`` adds support. (:issue:`10676`, :pull:`10686`). + By `David Huard `_. - ``open_dataset`` will now correctly infer a path ending in ``.zarr/`` as zarr By `Ian Hunt-Isaak `_. diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 28565f92de9..78719960f63 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Self import numpy as np +from packaging.version import Version from xarray.backends.common import ( BACKEND_ENTRYPOINTS, @@ -27,6 +28,7 @@ PickleableFileManager, ) from xarray.backends.locks import HDF5_LOCK, combine_locks, ensure_lock, get_write_lock +from xarray.backends.netcdf3 import encode_nc3_attr_value, encode_nc3_variable from xarray.backends.netCDF4_ import ( BaseNetCDF4Array, _build_and_get_enum, @@ -124,6 +126,7 @@ def __init__( manager: FileManager | h5netcdf.File | h5netcdf.Group, group=None, mode=None, + format="NETCDF4", lock=HDF5_LOCK, autoclose=False, ): @@ -143,7 +146,7 @@ def __init__( self._manager = manager self._group = group self._mode = mode - self.format = None + self.format = format or "NETCDF4" # todo: utilizing find_root_and_group seems a bit clunky # making filename available on h5netcdf.Group seems better self._filename = find_root_and_group(self.ds)[0].filename @@ -152,6 +155,9 @@ def __init__( self.autoclose = autoclose def get_child_store(self, group: str) -> Self: + if self.format == "NETCDF4_CLASSIC": + raise ValueError("Cannot create sub-groups in `NETCDF4_CLASSIC` format.") + if self._group is not None: group = os.path.join(self._group, group) return type(self)( @@ -167,7 +173,7 @@ def open( cls, filename, mode="r", - format=None, + format="NETCDF4", group=None, lock=None, autoclose=False, @@ -198,8 +204,8 @@ def open( f"{magic_number!r} is not the signature of a valid netCDF4 file" ) - if format not in [None, "NETCDF4"]: - raise ValueError("invalid format for h5netcdf backend") + if format not in [None, "NETCDF4", "NETCDF4_CLASSIC"]: + raise ValueError(f"invalid format for h5netcdf backend: {format}") kwargs = { "invalid_netcdf": invalid_netcdf, @@ -210,6 +216,12 @@ def open( kwargs.update(driver_kwds) if phony_dims is not None: kwargs["phony_dims"] = phony_dims + if Version(h5netcdf.__version__) > Version("1.6.4"): + kwargs["format"] = format + elif format == "NETCDF4_CLASSIC": + raise ValueError( + "h5netcdf >= 1.7.0 is required to save output in NETCDF4_CLASSIC format." + ) if lock is None: if mode == "r": @@ -223,7 +235,15 @@ def open( else PickleableFileManager ) manager = manager_cls(h5netcdf.File, filename, mode=mode, kwargs=kwargs) - return cls(manager, group=group, mode=mode, lock=lock, autoclose=autoclose) + + return cls( + manager, + group=group, + format=format, + mode=mode, + lock=lock, + autoclose=autoclose, + ) def _acquire(self, needs_lock=True): with self._manager.acquire_context(needs_lock) as root: @@ -320,10 +340,15 @@ def set_dimension(self, name, length, is_unlimited=False): self.ds.dimensions[name] = length def set_attribute(self, key, value): + if self.format == "NETCDF4_CLASSIC": + value = encode_nc3_attr_value(value) self.ds.attrs[key] = value def encode_variable(self, variable, name=None): - return _encode_nc4_variable(variable, name=name) + if self.format == "NETCDF4_CLASSIC": + return encode_nc3_variable(variable, name=name) + else: + return _encode_nc4_variable(variable, name=name) def prepare_variable( self, name, variable, check_encoding=False, unlimited_dims=None @@ -332,7 +357,9 @@ def prepare_variable( _ensure_no_forward_slash_in_name(name) attrs = variable.attrs.copy() - dtype = _get_datatype(variable, raise_on_invalid_encoding=check_encoding) + dtype = _get_datatype( + variable, nc_format=self.format, raise_on_invalid_encoding=check_encoding + ) fillvalue = attrs.pop("_FillValue", None) @@ -394,6 +421,8 @@ def prepare_variable( nc4_var = self.ds[name] for k, v in attrs.items(): + if self.format == "NETCDF4_CLASSIC": + v = encode_nc3_attr_value(v) nc4_var.attrs[k] = v target = H5NetCDFArrayWrapper(name, self) @@ -484,7 +513,7 @@ def open_dataset( drop_variables: str | Iterable[str] | None = None, use_cftime=None, decode_timedelta=None, - format=None, + format="NETCDF4", group=None, lock=None, invalid_netcdf=None, @@ -544,7 +573,7 @@ def open_datatree( drop_variables: str | Iterable[str] | None = None, use_cftime=None, decode_timedelta=None, - format=None, + format="NETCDF4", group: str | None = None, lock=None, invalid_netcdf=None, @@ -587,7 +616,7 @@ def open_groups_as_dict( drop_variables: str | Iterable[str] | None = None, use_cftime=None, decode_timedelta=None, - format=None, + format="NETCDF4", group: str | None = None, lock=None, invalid_netcdf=None, diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index ce0d39b6ad0..eb3550ff31a 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -233,6 +233,10 @@ def _importorskip_h5netcdf_ros3(has_h5netcdf: bool): "h5netcdf", "1.4.0.dev" ) +has_h5netcdf_1_7_0_or_above, requires_h5netcdf_1_7_0_or_above = _importorskip( + "h5netcdf", "1.7.0.dev" +) + has_netCDF4_1_7_0_or_above, requires_netCDF4_1_7_0_or_above = _importorskip( "netCDF4", "1.7.0" ) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 3562ffd7017..a65ff222a63 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -90,6 +90,7 @@ requires_fsspec, requires_h5netcdf, requires_h5netcdf_1_4_0_or_above, + requires_h5netcdf_1_7_0_or_above, requires_h5netcdf_or_netCDF4, requires_h5netcdf_ros3, requires_iris, @@ -461,6 +462,7 @@ def roundtrip( save_kwargs = {} if open_kwargs is None: open_kwargs = {} + with create_tmp_file(allow_cleanup_failure=allow_cleanup_failure) as path: self.save(data, path, **save_kwargs) with self.open(path, **open_kwargs) as ds: @@ -4736,6 +4738,54 @@ def create_store(self): ) as store: yield store + @requires_h5netcdf + def test_string_attributes_stored_as_char(self, tmp_path): + import h5netcdf + + original = Dataset(attrs={"foo": "bar"}) + store_path = tmp_path / "tmp.nc" + original.to_netcdf(store_path, engine=self.engine, format=self.file_format) + with h5netcdf.File(store_path, "r") as ds: + # Check that the attribute is stored as a char array + assert ds._h5file.attrs["foo"].dtype == np.dtype("S3") + + +@requires_h5netcdf_1_7_0_or_above +class TestNetCDF4ClassicViaH5NetCDFData(TestNetCDF4ClassicViaNetCDF4Data): + engine: T_NetcdfEngine = "h5netcdf" + file_format: T_NetcdfTypes = "NETCDF4_CLASSIC" + + @contextlib.contextmanager + def create_store(self): + with create_tmp_file() as tmp_file: + with backends.H5NetCDFStore.open( + tmp_file, mode="w", format="NETCDF4_CLASSIC" + ) as store: + yield store + + @requires_netCDF4 + def test_cross_engine_read_write_netcdf4(self) -> None: + # Drop dim3, because its labels include strings. These appear to be + # not properly read with python-netCDF4, which converts them into + # unicode instead of leaving them as bytes. + data = create_test_data().drop_vars("dim3") + data.attrs["foo"] = "bar" + valid_engines: list[T_NetcdfEngine] = ["netcdf4", "h5netcdf"] + for write_engine in valid_engines: + with create_tmp_file() as tmp_file: + data.to_netcdf(tmp_file, engine=write_engine, format=self.file_format) + for read_engine in valid_engines: + with open_dataset(tmp_file, engine=read_engine) as actual: + assert_identical(data, actual) + + def test_group_fails(self): + # Check writing group data fails with CLASSIC format + original = create_test_data() + with pytest.raises( + ValueError, match=r"Cannot create sub-groups in `NETCDF4_CLASSIC` format." + ): + original.to_netcdf(group="sub", format=self.file_format, engine=self.engine) + @requires_scipy_or_netCDF4 class TestGenericNetCDFData(NetCDF3Only, CFEncodedBase):