diff --git a/lib/iris/fileformats/_nc_load_rules/helpers.py b/lib/iris/fileformats/_nc_load_rules/helpers.py index 35c2e96924..fa63002f09 100644 --- a/lib/iris/fileformats/_nc_load_rules/helpers.py +++ b/lib/iris/fileformats/_nc_load_rules/helpers.py @@ -708,13 +708,13 @@ def build_and_add_global_attributes(engine: Engine): ), ) if problem is not None: - stack_notes = problem.stack_trace.__notes__ + stack_notes = problem.stack_trace.__notes__ # type: ignore[attr-defined] if stack_notes is None: stack_notes = [] stack_notes.append( f"Skipping disallowed global attribute '{attr_name}' (see above error)" ) - problem.stack_trace.__notes__ = stack_notes + problem.stack_trace.__notes__ = stack_notes # type: ignore[attr-defined] ################################################################################ @@ -1536,14 +1536,14 @@ def build_and_add_dimension_coordinate( ) if problem is not None: coord_var_name = str(cf_coord_var.cf_name) - stack_notes = problem.stack_trace.__notes__ + stack_notes = problem.stack_trace.__notes__ # type: ignore[attr-defined] if stack_notes is None: stack_notes = [] stack_notes.append( f"Failed to create {coord_var_name} dimension coordinate:\n" f"Gracefully creating {coord_var_name!r} auxiliary coordinate instead." ) - problem.stack_trace.__notes__ = stack_notes + problem.stack_trace.__notes__ = stack_notes # type: ignore[attr-defined] problem.handled = True _ = _add_or_capture( @@ -1643,9 +1643,13 @@ def _add_auxiliary_coordinate( # Determine the name of the dimension/s shared between the CF-netCDF data variable # and the coordinate being built. 
- common_dims = [ - dim for dim in cf_coord_var.dimensions if dim in engine.cf_var.dimensions - ] + coord_dims = cf_coord_var.dimensions + if cf._is_str_dtype(cf_coord_var): + coord_dims = coord_dims[:-1] + datavar_dims = engine.cf_var.dimensions + if cf._is_str_dtype(engine.cf_var): + datavar_dims = datavar_dims[:-1] + common_dims = [dim for dim in coord_dims if dim in datavar_dims] data_dims = None if common_dims: # Calculate the offset of each common dimension. diff --git a/lib/iris/fileformats/cf.py b/lib/iris/fileformats/cf.py index 2b6568c315..6e4b8f99e1 100644 --- a/lib/iris/fileformats/cf.py +++ b/lib/iris/fileformats/cf.py @@ -26,7 +26,7 @@ import iris.exceptions import iris.fileformats._nc_load_rules.helpers as hh -from iris.fileformats.netcdf import _thread_safe_nc +from iris.fileformats.netcdf import _bytecoding_datasets from iris.mesh.components import Connectivity import iris.util import iris.warnings @@ -790,19 +790,31 @@ def cf_label_data(self, cf_data_var): # Determine the name of the label string (or length) dimension by # finding the dimension name that doesn't exist within the data dimensions. - str_dim_name = list(set(self.dimensions) - set(cf_data_var.dimensions)) + str_dim_names = list(set(self.dimensions) - set(cf_data_var.dimensions)) + n_nondata_dims = len(str_dim_names) + + if n_nondata_dims == 0: + # *All* dims are shared with the data-variable. + # This is only ok if the data-var is *also* a string type. 
+ dim_ok = _is_str_dtype(cf_data_var) + # In this case, we must just *assume* that the last dimension is "the" + # string dimension + str_dim_name = self.dimensions[-1] + else: + # If there is exactly one non-data dim, that is the one we want + dim_ok = len(str_dim_names) == 1 + (str_dim_name,) = str_dim_names - if len(str_dim_name) != 1: + if not dim_ok: raise ValueError( "Invalid string dimensions for CF-netCDF label variable %r" % self.cf_name ) - str_dim_name = str_dim_name[0] label_data = self[:] if ma.isMaskedArray(label_data): - label_data = label_data.filled() + label_data = label_data.filled(b"\0") # Determine whether we have a string-valued scalar label # i.e. a character variable that only has one dimension (the length of the string). @@ -1361,7 +1373,9 @@ def __init__(self, file_source, warn=False, monotonic=False): if isinstance(file_source, str): # Create from filepath : open it + own it (=close when we die). self._filename = os.path.expanduser(file_source) - self._dataset = _thread_safe_nc.DatasetWrapper(self._filename, mode="r") + self._dataset = _bytecoding_datasets.EncodedDataset( + self._filename, mode="r" + ) self._own_file = True else: # We have been passed an open dataset. diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py new file mode 100644 index 0000000000..5d9452ae94 --- /dev/null +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -0,0 +1,301 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. +"""Module providing netcdf datasets with automatic character encoding. + +The requirement is to convert numpy fixed-width unicode arrays on writing to a variable +which is declared as a byte (character) array with a fixed-length string dimension. + +Numpy unicode string arrays are ones with dtypes of the form "U". 
+Numpy character variables have the dtype "S1", and map to a fixed-length "string +dimension". + +In principle, netCDF4 already performs these translations, but in practice current +releases are not functional for anything other than "ascii" encoding -- including UTF-8, +which is the most obvious and desirable "general" solution. + +There is also the question of whether we should like to implement UTF-8 as our default. +Current discussions on this are inconclusive and neither CF conventions nor the NetCDF +User Guide are definite on what possible values of "_Encoding" are, or what the effective +default is, even though they do both mention the "_Encoding" attribute as a potential +way to handle the issue. + +Because of this, we interpret as follows: + * when reading bytes : in the absence of an "_Encoding" attribute, we will attempt to + decode bytes as UTF-8 + * when writing strings : in the absence of an "_Encoding" attribute (on the Iris + cube or coord object), we will attempt to encode data with "ascii" : If this fails, + it raises an error prompting the user to supply an "_Encoding" attribute. + +Where an "_Encoding" attribute is provided to Iris, we will honour it where possible, +identifying with "codecs.lookup" : This means we support the encodings in the Python +Standard Library, and the name aliases which it recognises. + +See: + +* known problems https://github.com/Unidata/netcdf4-python/issues/1440 +* suggestions for how this "ought" to work, discussed in the netcdf-c library + * https://github.com/Unidata/netcdf-c/issues/402 + +""" + +import codecs +import contextlib +import threading +import warnings + +import numpy as np + +from iris.fileformats.netcdf._thread_safe_nc import ( + DatasetWrapper, + NetCDFDataProxy, + NetCDFWriteProxy, + VariableWrapper, +) + + +def decode_bytesarray_to_stringarray( + byte_array: np.ndarray, encoding: str, string_width: int +) -> np.ndarray: + """Convert an array of bytes to an array of strings, with one less dimension. 
+ + N.B. for now at least, we assume the string dim is **always the last one**. + If 'string_width' is not given, it is set to the final dimension of 'byte_array'. + """ + if np.ma.isMaskedArray(byte_array): + # netCDF4-python sees zeros as "missing" -- we don't need or want that + byte_array = byte_array.data + bytes_shape = byte_array.shape + var_shape = bytes_shape[:-1] + string_dtype = f"U{string_width}" + result = np.empty(var_shape, dtype=string_dtype) + for ndindex in np.ndindex(var_shape): + element_bytes = byte_array[ndindex] + bytes = b"".join([b if b else b"\0" for b in element_bytes]) + string = bytes.decode(encoding) + result[ndindex] = string + return result + + +# +# TODO: remove? +# this older version is "overly flexible", less efficient and not needed here. +# +def flexi_encode_stringarray_as_bytearray( + data: np.ndarray, encoding=None, string_dimension_length: int | None = None +) -> np.ndarray: + """Encode strings as bytearray. + + Note: if 'string_dimension_length' is not given (None), it is set to the longest + encoded bytes element, **OR** the dtype size, if that is greater. + If 'string_dimension_length' is specified, the last array + dimension is set to this and content strings are truncated or extended as required. + """ + if np.ma.isMaskedArray(data): + # netCDF4-python sees zeros as "missing" -- we don't need or want that + data = data.data + element_shape = data.shape + # Encode all the strings + see which is longest + max_length = 1 # this is a MINIMUM - i.e. not zero! + data_elements = np.zeros(element_shape, dtype=object) + for index in np.ndindex(element_shape): + data_element = data[index].encode(encoding=encoding) + element_length = len(data_element) + data_elements[index] = data_element + if element_length > max_length: + max_length = element_length + + if string_dimension_length is None: + # If the string length was not specified, it is the maximum encoded length + # (n-bytes), **or** the dtype string-length, if greater. 
+ string_dimension_length = max_length + array_string_length = int(str(data.dtype)[2:]) # Yuck. No better public way? + if array_string_length > string_dimension_length: + string_dimension_length = array_string_length + + # We maybe *already* encoded all the strings above, but stored them in an + # object-array as we didn't yet know the fixed byte-length to convert to. + # Now convert to a fixed-width byte array with an extra string-length dimension + result = np.zeros(element_shape + (string_dimension_length,), dtype="S1") + right_pad = b"\0" * string_dimension_length + for index in np.ndindex(element_shape): + bytes = data_elements[index] + bytes = (bytes + right_pad)[:string_dimension_length] + result[index] = [bytes[i : i + 1] for i in range(string_dimension_length)] + + return result + + +def encode_stringarray_as_bytearray( + data: np.typing.ArrayLike, encoding: str, string_dimension_length: int +) -> np.ndarray: + """Encode strings as a bytes array.""" + data = np.asanyarray(data) + element_shape = data.shape + result = np.zeros(element_shape + (string_dimension_length,), dtype="S1") + right_pad = b"\0" * string_dimension_length + for index in np.ndindex(element_shape): + string = data[index] + bytes = string.encode(encoding=encoding) + n_bytes = len(bytes) + # TODO: may want to issue warning or error if we overflow the length? + if n_bytes > string_dimension_length: + from iris.exceptions import TranslationError + + msg = ( + f"Non-ascii string {string!r} written to netcdf exceeds string " + f"dimension : {n_bytes} > {string_dimension_length}." + ) + raise TranslationError(msg) + + # It's all a bit nasty ... 
+ bytes = (bytes + right_pad)[:string_dimension_length] + result[index] = [bytes[i : i + 1] for i in range(string_dimension_length)] + + return result + + +class NetcdfStringDecodeSetting(threading.local): + def __init__(self, perform_encoding: bool = True): + self.set(perform_encoding) + + def set(self, perform_encoding: bool): + self.perform_encoding = perform_encoding + + def __bool__(self): + return self.perform_encoding + + @contextlib.contextmanager + def context(self, perform_encoding: bool): + old_setting = self.perform_encoding + self.perform_encoding = perform_encoding + yield + self.perform_encoding = old_setting + + +DECODE_TO_STRINGS_ON_READ = NetcdfStringDecodeSetting() +DEFAULT_READ_ENCODING = "utf-8" +DEFAULT_WRITE_ENCODING = "ascii" + + +class EncodedVariable(VariableWrapper): + """A variable wrapper that translates variable data according to byte encodings.""" + + def __getitem__(self, keys): + if self._is_chardata(): + # N.B. we never need to UNset this, as we totally control it + self._contained_instance.set_auto_chartostring(False) + + data = super().__getitem__(keys) + + if DECODE_TO_STRINGS_ON_READ and self._is_chardata(): + encoding = self._get_encoding() or DEFAULT_READ_ENCODING + # N.B. typically, read encoding default is UTF-8 --> a "usually safe" choice + strlen = self._get_string_width() + try: + data = decode_bytesarray_to_stringarray(data, encoding, strlen) + except UnicodeDecodeError as err: + msg = ( + f"Character data in variable {self.name!r} could not be decoded " + f"with the {encoding!r} encoding. This can be fixed by setting the " + "variable '_Encoding' attribute to suit the content." + ) + raise ValueError(msg) from err + + return data + + def __setitem__(self, keys, data): + data = np.asanyarray(data) + if self._is_chardata(): + # N.B. we never need to UNset this, as we totally control it + self._contained_instance.set_auto_chartostring(False) + + # N.B. 
typically, write encoding default is "ascii" --> fails bad content + if data.dtype.kind == "U": + try: + encoding = self._get_encoding() or DEFAULT_WRITE_ENCODING + strlen = self._get_byte_width() + data = encode_stringarray_as_bytearray(data, encoding, strlen) + except UnicodeEncodeError as err: + msg = ( + f"String data written to netcdf character variable {self.name!r} " + f"could not be represented in encoding {encoding!r}. This can be " + "fixed by setting a suitable variable '_Encoding' attribute, " + 'e.g. ._Encoding="UTF-8".' + ) + raise ValueError(msg) from err + + super().__setitem__(keys, data) + + def _is_chardata(self): + return np.issubdtype(self.dtype, np.bytes_) + + def _get_encoding(self) -> str | None: + """Get the byte encoding defined for this variable (or None).""" + result = getattr(self, "_Encoding", None) + if result is not None: + try: + # Accept + normalise naming of encodings + result = codecs.lookup(result).name + # NOTE: if encoding does not suit data, errors can occur. + # For example, _Encoding = "ascii", with non-ascii content. + except LookupError: + # Unrecognised encoding name : handle this as just a warning + msg = f"Unknown encoding for variable {self.name!r}: {result!r}" + warnings.warn(msg, UserWarning) + + return result + + def _get_byte_width(self) -> int | None: + if not hasattr(self, "_bytewidth"): + n_bytes = self.group().dimensions[self.dimensions[-1]].size + # Cache this length control on the variable -- but not as a netcdf attribute + self.__dict__["_bytewidth"] = n_bytes + + return self.__dict__["_bytewidth"] + + def _get_string_width(self): + """Return the string-length defined for this variable.""" + if not hasattr(self, "_strlen"): + # Work out the actual byte width from the parent dataset dimensions. + strlen = self._get_byte_width() + # Convert the string dimension length (i.e. bytes) to a sufficiently-long + # string width, depending on the encoding used. 
+ encoding = self._get_encoding() or DEFAULT_READ_ENCODING + # regularise the name for comparison with recognised ones + encoding = codecs.lookup(encoding).name + if "utf-16" in encoding: + # Each char needs at least 2 bytes -- including a terminator char + strlen = (strlen // 2) - 1 + elif "utf-32" in encoding: + # Each char needs exactly 4 bytes -- including a terminator char + strlen = (strlen // 4) - 1 + # "ELSE": assume there can be (at most) as many chars as bytes + + # Cache this length control on the variable -- but not as a netcdf attribute + self.__dict__["_strlen"] = strlen + + return self._strlen + + def set_auto_chartostring(self, onoff: bool): + msg = "auto_chartostring is not supported by Iris 'EncodedVariable' type." + raise TypeError(msg) + + +class EncodedDataset(DatasetWrapper): + """A specialised DatasetWrapper whose variables perform byte encoding.""" + + VAR_WRAPPER_CLS = EncodedVariable + + def set_auto_chartostring(self, onoff: bool): + msg = "auto_chartostring is not supported by Iris 'EncodedDataset' type." + raise TypeError(msg) + + +class EncodedNetCDFDataProxy(NetCDFDataProxy): + DATASET_CLASS = EncodedDataset + + +class EncodedNetCDFWriteProxy(NetCDFWriteProxy): + DATASET_CLASS = EncodedDataset diff --git a/lib/iris/fileformats/netcdf/_thread_safe_nc.py b/lib/iris/fileformats/netcdf/_thread_safe_nc.py index 33183ef0fa..cd97452dac 100644 --- a/lib/iris/fileformats/netcdf/_thread_safe_nc.py +++ b/lib/iris/fileformats/netcdf/_thread_safe_nc.py @@ -159,6 +159,9 @@ class GroupWrapper(_ThreadSafeWrapper): CONTAINED_CLASS = netCDF4.Group # Note: will also accept a whole Dataset object, but that is OK. _DUCKTYPE_CHECK_PROPERTIES = ["createVariable"] + # Class to use when creating variable wrappers (default=VariableWrapper). + # - needed to support _byte_encoded_data.EncodedDataset. + VAR_WRAPPER_CLS = VariableWrapper # All Group API that returns Dimension(s) is wrapped to instead return # DimensionWrapper(s). 
@@ -203,7 +206,7 @@ def variables(self) -> typing.Dict[str, VariableWrapper]: """ with _GLOBAL_NETCDF4_LOCK: variables_ = self._contained_instance.variables - return {k: VariableWrapper.from_existing(v) for k, v in variables_.items()} + return {k: self.VAR_WRAPPER_CLS.from_existing(v) for k, v in variables_.items()} def createVariable(self, *args, **kwargs) -> VariableWrapper: """Call createVariable() from netCDF4.Group/Dataset within _GLOBAL_NETCDF4_LOCK. @@ -216,7 +219,7 @@ def createVariable(self, *args, **kwargs) -> VariableWrapper: """ with _GLOBAL_NETCDF4_LOCK: new_variable = self._contained_instance.createVariable(*args, **kwargs) - return VariableWrapper.from_existing(new_variable) + return self.VAR_WRAPPER_CLS.from_existing(new_variable) def get_variables_by_attributes( self, *args, **kwargs @@ -234,7 +237,7 @@ def get_variables_by_attributes( variables_ = list( self._contained_instance.get_variables_by_attributes(*args, **kwargs) ) - return [VariableWrapper.from_existing(v) for v in variables_] + return [self.VAR_WRAPPER_CLS.from_existing(v) for v in variables_] # All Group API that returns Group(s) is wrapped to instead return # GroupWrapper(s). @@ -252,7 +255,7 @@ def groups(self): """ with _GLOBAL_NETCDF4_LOCK: groups_ = self._contained_instance.groups - return {k: GroupWrapper.from_existing(v) for k, v in groups_.items()} + return {k: self.__class__.from_existing(v) for k, v in groups_.items()} @property def parent(self): @@ -268,7 +271,7 @@ def parent(self): """ with _GLOBAL_NETCDF4_LOCK: parent_ = self._contained_instance.parent - return GroupWrapper.from_existing(parent_) + return self.__class__.from_existing(parent_) def createGroup(self, *args, **kwargs): """Call createGroup() from netCDF4.Group/Dataset. 
@@ -281,7 +284,7 @@ def createGroup(self, *args, **kwargs): """ with _GLOBAL_NETCDF4_LOCK: new_group = self._contained_instance.createGroup(*args, **kwargs) - return GroupWrapper.from_existing(new_group) + return self.__class__.from_existing(new_group) class DatasetWrapper(GroupWrapper): @@ -312,6 +315,7 @@ class NetCDFDataProxy: """A reference to the data payload of a single NetCDF file variable.""" __slots__ = ("shape", "dtype", "path", "variable_name", "fill_value") + DATASET_CLASS = netCDF4.Dataset def __init__(self, shape, dtype, path, variable_name, fill_value): self.shape = shape @@ -334,7 +338,7 @@ def __getitem__(self, keys): # netCDF4 library, presumably because __getitem__ gets called so many # times by Dask. Use _GLOBAL_NETCDF4_LOCK directly instead. with _GLOBAL_NETCDF4_LOCK: - dataset = netCDF4.Dataset(self.path) + dataset = self.DATASET_CLASS(self.path) try: variable = dataset.variables[self.variable_name] # Get the NetCDF variable data and slice. @@ -371,6 +375,8 @@ class NetCDFWriteProxy: TODO: could be improved with a caching scheme, but this just about works. """ + DATASET_CLASS = netCDF4.Dataset + def __init__(self, filepath, cf_var, file_write_lock): self.path = filepath self.varname = cf_var.name @@ -398,7 +404,7 @@ def __setitem__(self, keys, array_data): # investigation needed. 
for attempt in range(5): try: - dataset = netCDF4.Dataset(self.path, "r+") + dataset = self.DATASET_CLASS(self.path, "r+") break except OSError: if attempt < 4: diff --git a/lib/iris/fileformats/netcdf/loader.py b/lib/iris/fileformats/netcdf/loader.py index 219f681e67..d363e29738 100644 --- a/lib/iris/fileformats/netcdf/loader.py +++ b/lib/iris/fileformats/netcdf/loader.py @@ -36,7 +36,7 @@ import iris.coord_systems import iris.coords import iris.fileformats.cf -from iris.fileformats.netcdf import _thread_safe_nc +from iris.fileformats.netcdf import _bytecoding_datasets, _thread_safe_nc from iris.fileformats.netcdf.saver import _CF_ATTRS import iris.io import iris.util @@ -50,7 +50,7 @@ # An expected part of the public loader API, but includes thread safety # concerns so is housed in _thread_safe_nc. -NetCDFDataProxy = _thread_safe_nc.NetCDFDataProxy +NetCDFDataProxy = _bytecoding_datasets.EncodedNetCDFDataProxy class _WarnComboIgnoringBoundsLoad( diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index 5177749c07..d268919de9 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -14,6 +14,7 @@ """ +import codecs import collections from itertools import repeat, zip_longest import os @@ -48,7 +49,8 @@ from iris.coords import AncillaryVariable, AuxCoord, CellMeasure, DimCoord import iris.exceptions import iris.fileformats.cf -from iris.fileformats.netcdf import _dask_locks, _thread_safe_nc +from iris.fileformats.netcdf import _bytecoding_datasets as bytecoding_datasets +from iris.fileformats.netcdf import _dask_locks from iris.fileformats.netcdf._attribute_handlers import ATTRIBUTE_HANDLERS import iris.io import iris.util @@ -300,7 +302,7 @@ class VariableEmulator(typing.Protocol): shape: tuple[int, ...] 
-CFVariable = typing.Union[_thread_safe_nc.VariableWrapper, VariableEmulator] +CFVariable = typing.Union[bytecoding_datasets.VariableWrapper, VariableEmulator] class Saver: @@ -403,7 +405,7 @@ def __init__(self, filename, netcdf_format, compute=True): # Put it inside a _thread_safe_nc wrapper to ensure thread-safety. # Except if it already is one, since they forbid "re-wrapping". if not hasattr(self._dataset, "THREAD_SAFE_FLAG"): - self._dataset = _thread_safe_nc.DatasetWrapper.from_existing( + self._dataset = bytecoding_datasets.DatasetWrapper.from_existing( self._dataset ) @@ -414,7 +416,7 @@ def __init__(self, filename, netcdf_format, compute=True): # Given a filepath string/path : create a dataset from that try: self.filepath = os.path.abspath(filename) - self._dataset = _thread_safe_nc.DatasetWrapper( + self._dataset = bytecoding_datasets.EncodedDataset( self.filepath, mode="w", format=netcdf_format ) except RuntimeError: @@ -759,7 +761,7 @@ def _create_cf_dimensions(self, cube, dimension_names, unlimited_dimensions=None # used for a different one pass else: - dim_name = self._get_coord_variable_name(cube, coord) + dim_name = self._get_element_variable_name(cube, coord) unlimited_dim_names.append(dim_name) for dim_name in dimension_names: @@ -990,12 +992,12 @@ def _add_aux_coords( ] # Include any relevant mesh location coordinates. 
- mesh: MeshXY | None = getattr(cube, "mesh") - mesh_location: str | None = getattr(cube, "location") + mesh: MeshXY | None = getattr(cube, "mesh") # type: ignore[annotation-unchecked] + mesh_location: str | None = getattr(cube, "location") # type: ignore[annotation-unchecked] if mesh and mesh_location: location_coords: MeshNodeCoords | MeshEdgeCoords | MeshFaceCoords = getattr( mesh, f"{mesh_location}_coords" - ) + ) # type: ignore[annotation-unchecked] coords_to_add.extend(list(location_coords)) return self._add_inner_related_vars( @@ -1365,7 +1367,7 @@ def record_dimension(names_list, dim_name, length, matching_coords=None): if dim_name is None: # Not already present : create a unique dimension name # from the coord. - dim_name = self._get_coord_variable_name(cube, coord) + dim_name = self._get_element_variable_name(cube, coord) # Disambiguate if it has the same name as an # existing dimension. # OR if it matches an existing file variable name. @@ -1541,38 +1543,14 @@ def _create_cf_bounds(self, coord, cf_var, cf_name, /, *, compression_kwargs=Non ) self._lazy_stream_data(data=bounds, cf_var=cf_var_bounds) - def _get_cube_variable_name(self, cube): - """Return a CF-netCDF variable name for the given cube. - - Parameters - ---------- - cube : :class:`iris.cube.Cube` - An instance of a cube for which a CF-netCDF variable - name is required. - - Returns - ------- - str - A CF-netCDF variable name as a string. - - """ - if cube.var_name is not None: - cf_name = cube.var_name - else: - # Convert to lower case and replace whitespace by underscores. - cf_name = "_".join(cube.name().lower().split()) - - cf_name = self.cf_valid_var_name(cf_name) - return cf_name - - def _get_coord_variable_name(self, cube_or_mesh, coord): - """Return a CF-netCDF variable name for a given coordinate-like element. + def _get_element_variable_name(self, cube_or_mesh, element): + """Return a CF-netCDF variable name for a given coordinate-like element, or cube. 
Parameters ---------- cube_or_mesh : :class:`iris.cube.Cube` or :class:`iris.mesh.MeshXY` The Cube or Mesh being saved to the netCDF file. - coord : :class:`iris.coords._DimensionalMetadata` + element : :class:`iris.coords._DimensionalMetadata` | :class:`iris.cube.Cube` An instance of a coordinate (or similar), for which a CF-netCDF variable name is required. @@ -1592,17 +1570,21 @@ def _get_coord_variable_name(self, cube_or_mesh, coord): cube = None mesh = cube_or_mesh - if coord.var_name is not None: - cf_name = coord.var_name + if element.var_name is not None: + cf_name = element.var_name + elif isinstance(element, Cube): + # Make name for a Cube without a var_name. + cf_name = "_".join(element.name().lower().split()) else: - name = coord.standard_name or coord.long_name + # Make name for a Coord-like element without a var_name + name = element.standard_name or element.long_name if not name or set(name).intersection(string.whitespace): # We need to invent a name, based on its associated dimensions. - if cube is not None and cube.coords(coord): + if cube is not None and cube.coords(element): # It is a regular cube coordinate. # Auto-generate a name based on the dims. name = "" - for dim in cube.coord_dims(coord): + for dim in cube.coord_dims(element): name += f"dim{dim}" # Handle scalar coordinate (dims == ()). if not name: @@ -1616,8 +1598,8 @@ # At present, a location-coord cannot be nameless, as the # MeshXY code relies on guess_coord_axis. - assert isinstance(coord, Connectivity) - location = coord.cf_role.split("_")[0] + assert isinstance(element, Connectivity) + location = element.cf_role.split("_")[0] location_dim_attr = f"{location}_dimension" name = getattr(mesh, location_dim_attr) @@ -1693,6 +1675,8 @@ def _create_mesh(self, mesh): return cf_mesh_name def _set_cf_var_attributes(self, cf_var, element): + from iris.cube import Cube + # Deal with CF-netCDF units, and add the name+units properties. 
if isinstance(element, iris.coords.Coord): # Fix "degree" units if needed. @@ -1700,34 +1684,50 @@ def _set_cf_var_attributes(self, cf_var, element): else: units_str = str(element.units) - if cf_units.as_unit(units_str).is_udunits(): - _setncattr(cf_var, "units", units_str) - - standard_name = element.standard_name - if standard_name is not None: - _setncattr(cf_var, "standard_name", standard_name) - - long_name = element.long_name - if long_name is not None: - _setncattr(cf_var, "long_name", long_name) + # NB this bit is a nasty hack to preserve existing behaviour through a refactor: + # The attributes for Coords are created in the order units, standard_name, + # whereas for data-variables (aka Cubes) it is the other way around. + # Needed now that this routine is also called from _create_cf_data_variable. + # TODO: when we can break things, rationalise these to be the same. + def add_units_attr(): + if cf_units.as_unit(units_str).is_udunits(): + _setncattr(cf_var, "units", units_str) + + def add_names_attrs(): + standard_name = element.standard_name + if standard_name is not None: + _setncattr(cf_var, "standard_name", standard_name) + + long_name = element.long_name + if long_name is not None: + _setncattr(cf_var, "long_name", long_name) + + if isinstance(element, Cube): + add_names_attrs() + add_units_attr() + else: + add_units_attr() + add_names_attrs() # Add the CF-netCDF calendar attribute. if element.units.calendar: _setncattr(cf_var, "calendar", str(element.units.calendar)) - # Add any other custom coordinate attributes. - for name in sorted(element.attributes): - value = element.attributes[name] + if not isinstance(element, Cube): + # Add any other custom coordinate attributes. + # N.B. 
not Cube, which has specific handling in _create_cf_data_variable + for name in sorted(element.attributes): + value = element.attributes[name] - if name == "STASH": - # Adopting provisional Metadata Conventions for representing MO - # Scientific Data encoded in NetCDF Format. - name = "um_stash_source" - value = str(value) + if name == "STASH": + # Adopting provisional Metadata Conventions for representing MO + # Scientific Data encoded in NetCDF Format. + name = "um_stash_source" + value = str(value) - # Don't clobber existing attributes. - if not hasattr(cf_var, name): - _setncattr(cf_var, name, value) + # Don't clobber existing attributes. + if not hasattr(cf_var, name): + _setncattr(cf_var, name, value) def _create_generic_cf_array_var( self, @@ -1739,6 +1739,8 @@ def _create_generic_cf_array_var( element_dims=None, fill_value=None, compression_kwargs=None, + packing_controls: dict | None = None, + is_dataless=False, ): """Create theCF-netCDF variable given dimensional_metadata. @@ -1791,7 +1793,7 @@ def _create_generic_cf_array_var( # Work out the var-name to use. # N.B. the only part of this routine that may use a mesh _or_ a cube. - cf_name = self._get_coord_variable_name(cube_or_mesh, element) + cf_name = self._get_element_variable_name(cube_or_mesh, element) while cf_name in self._dataset.variables: cf_name = self._increment_name(cf_name) @@ -1804,18 +1806,29 @@ def _create_generic_cf_array_var( # Get the data values, in a way which works for any element type, as # all are subclasses of _DimensionalMetadata. # (e.g. =points if a coord, =data if an ancillary, etc) - data = element._core_values() + if isinstance(element, Cube): + data = element.core_data() + else: + data = element._core_values() # This compression contract is *not* applicable to a mesh. 
- if cube and cube.shape != data.shape: + if cube is not None and data is not None and cube.shape != data.shape: compression_kwargs = {} - if np.issubdtype(data.dtype, np.str_): + if not is_dataless and np.issubdtype(data.dtype, np.str_): # Deal with string-type variables. # Typically CF label variables, but also possibly ancil-vars ? string_dimension_depth = data.dtype.itemsize if data.dtype.kind == "U": - string_dimension_depth //= 4 + encoding = element.attributes.get("_Encoding", "ascii") + # TODO: this can fail -- use a sensible warning + default? + encoding = codecs.lookup(encoding).name + if encoding == "utf-32": + # UTF-32 is a special case -- always exactly 4 bytes per char, plus 4 + string_dimension_depth += 4 + else: + # generally, 4 bytes per char in numpy --> make bytewidth = string-width + string_dimension_depth //= 4 string_dimension_name = "string%d" % string_dimension_depth # Determine whether to create the string length dimension. @@ -1834,28 +1847,38 @@ # Create the label coordinate variable. cf_var = self._dataset.createVariable(cf_name, "|S1", element_dims) - # Convert data from an array of strings into a character array - # with an extra string-length dimension. - if len(element_dims) == 1: - data_first = data[0] - if is_lazy_data(data_first): - data_first = dask.compute(data_first) - data = list("%- *s" % (string_dimension_depth, data_first)) - else: - orig_shape = data.shape - new_shape = orig_shape + (string_dimension_depth,) - new_data = np.zeros(new_shape, cf_var.dtype) - for index in np.ndindex(orig_shape): - index_slice = tuple(list(index) + [slice(None, None)]) - new_data[index_slice] = list( - "%- *s" % (string_dimension_depth, data[index]) - ) - data = new_data + # # Convert data from an array of strings into a character array + # # with an extra string-length dimension. + # if len(element_dims) == 1: + # # Scalar variable (only has string dimension). 
+ # data_first = data[0] + # if is_lazy_data(data_first): + # data_first = dask.compute(data_first) + # data = list("%- *s" % (string_dimension_depth, data_first)) + # else: + # # NOTE: at present, can't do this lazily?? + # orig_shape = data.shape + # new_shape = orig_shape + (string_dimension_depth,) + # new_data = np.zeros(new_shape, cf_var.dtype) + # for index in np.ndindex(orig_shape): + # index_slice = tuple(list(index) + [slice(None, None)]) + # new_data[index_slice] = list( + # "%- *s" % (string_dimension_depth, data[index]) + # ) + # data = new_data else: # A normal (numeric) variable. # ensure a valid datatype for the file format. - element_type = type(element).__name__ - data = self._ensure_valid_dtype(data, element_type, element) + if is_dataless: + dtype = self._DATALESS_DTYPE + fill_value = self._DATALESS_FILLVALUE + else: + element_type = type(element).__name__ + data = self._ensure_valid_dtype(data, element_type, element) + if not packing_controls: + dtype = data.dtype.newbyteorder("=") + else: + dtype = packing_controls["dtype"] # Check if this is a dim-coord. is_dimcoord = cube is not None and element in cube.dim_coords @@ -1869,7 +1892,7 @@ def _create_generic_cf_array_var( # Create the CF-netCDF variable. cf_var = self._dataset.createVariable( cf_name, - data.dtype.newbyteorder("="), + dtype, element_dims, fill_value=fill_value, **compression_kwargs, @@ -1886,12 +1909,19 @@ def _create_generic_cf_array_var( element, cf_var, cf_name, compression_kwargs=compression_kwargs ) - # Add the data to the CF-netCDF variable. - self._lazy_stream_data(data=data, cf_var=cf_var) - # Add names + units + # NOTE: *must* now do first, as we may need '_Encoding' set to write it ! self._set_cf_var_attributes(cf_var, element) + # Add the data to the CF-netCDF variable. + if not is_dataless: + if packing_controls: + # We must set packing attributes (if any), before assigning values. 
+ for key, value in packing_controls["attributes"]: + _setncattr(cf_var, key, value) + + self._lazy_stream_data(data=data, cf_var=cf_var) + return cf_name def _create_cf_cell_methods(self, cube, dimension_names): @@ -2238,9 +2268,9 @@ def _create_cf_grid_mapping(self, cube, cf_var_cube): cfvar = self._name_coord_map.name(coord) if not cfvar: # not found - create and store it: - cfvar = self._get_coord_variable_name(cube, coord) + cfvar = self._get_element_variable_name(cube, coord) self._name_coord_map.append( - cfvar, self._get_coord_variable_name(cube, coord) + cfvar, self._get_element_variable_name(cube, coord) ) cfvar_names.append(cfvar) @@ -2320,18 +2350,10 @@ def _create_cf_data_variable( # be removed. # Get the values in a form which is valid for the file format. is_dataless = cube.is_dataless() - if is_dataless: - data = None - else: - data = self._ensure_valid_dtype(cube.core_data(), "cube", cube) - if is_dataless: - # The variable must have *some* dtype, and it must be maskable - dtype = self._DATALESS_DTYPE - fill_value = self._DATALESS_FILLVALUE - elif not packing: - dtype = data.dtype.newbyteorder("=") - else: + packing_controls = None + if packing and not is_dataless: + data = self._ensure_valid_dtype(cube.core_data(), "cube", cube) if isinstance(packing, dict): if "dtype" not in packing: msg = "The dtype attribute is required for packing." @@ -2370,45 +2392,29 @@ def _create_cf_data_variable( else: add_offset = cmin + 2 ** (n - 1) * scale_factor - def set_packing_ncattrs(cfvar): - """Set netCDF packing attributes. - - NOTE: cfvar needs to be a _thread_safe_nc._ThreadSafeWrapper subclass. 
- - """ - assert hasattr(cfvar, "THREAD_SAFE_FLAG") - if packing: - if scale_factor: - _setncattr(cfvar, "scale_factor", scale_factor) - if add_offset: - _setncattr(cfvar, "add_offset", add_offset) - - cf_name = self._get_cube_variable_name(cube) - while cf_name in self._dataset.variables: - cf_name = self._increment_name(cf_name) + packing_controls = { + "dtype": dtype, + "attributes": [ + ("scale_factor", scale_factor), + ("add_offset", add_offset), + ], + } # Create the cube CF-netCDF data variable with data payload. - cf_var = self._dataset.createVariable( - cf_name, dtype, dimension_names, fill_value=fill_value, **kwargs + cf_name = self._create_generic_cf_array_var( + cube, + dimension_names, + cube, + element_dims=dimension_names, + fill_value=fill_value, + compression_kwargs=kwargs, + packing_controls=packing_controls, + is_dataless=is_dataless, ) + cf_var = self._dataset.variables[cf_name] - if not is_dataless: - set_packing_ncattrs(cf_var) - self._lazy_stream_data(data=data, cf_var=cf_var) - - if cube.standard_name: - _setncattr(cf_var, "standard_name", cube.standard_name) - - if cube.long_name: - _setncattr(cf_var, "long_name", cube.long_name) - - if cube.units.is_udunits(): - _setncattr(cf_var, "units", str(cube.units)) - - # Add the CF-netCDF calendar attribute. - if cube.units.calendar: - _setncattr(cf_var, "calendar", cube.units.calendar) - + # Set general attrs: NB this part is cube-specific (not the same for components) + # - so 'set_cf_var_attributes' *doesn't* set these, if element is a Cube if iris.FUTURE.save_split_attrs: attr_names = cube.attributes.locals.keys() else: @@ -2535,7 +2541,7 @@ def store( ) -> None: # Create a data-writeable object that we can stream into, which # encapsulates the file to be opened + variable to be written. 
- write_wrapper = _thread_safe_nc.NetCDFWriteProxy( + write_wrapper = bytecoding_datasets.EncodedNetCDFWriteProxy( self.filepath, cf_var, self.file_write_lock ) # Add to the list of delayed writes, used in delayed_completion(). diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py new file mode 100644 index 0000000000..9698e7d4c6 --- /dev/null +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -0,0 +1,244 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. +"""Integration tests for string data handling.""" + +import subprocess + +import numpy as np +import pytest + +import iris +from iris.coords import AuxCoord, DimCoord +from iris.cube import Cube +from iris.fileformats.netcdf import _bytecoding_datasets + +# from iris.fileformats.netcdf import _thread_safe_nc +from iris.tests import env_bin_path + +NX, N_STRLEN = 3, 64 +TEST_STRINGS = ["Münster", "London", "Amsterdam"] +TEST_COORD_VALS = ["bun", "éclair", "sandwich"] + +# VARS_COORDS_SHARE_STRING_DIM = True +VARS_COORDS_SHARE_STRING_DIM = False +if VARS_COORDS_SHARE_STRING_DIM: + # Fix length so that the max coord strlen will be same as data one + TEST_COORD_VALS[-1] = "Xsandwich" + + +# Ensure all tests run with "split attrs" turned on. 
+@pytest.fixture(scope="module", autouse=True) +def enable_split_attrs(): + with iris.FUTURE.context(save_split_attrs=True): + yield + + +def convert_strings_to_chararray(string_array_1d, maxlen, encoding="utf-8"): + bbytes = [text.encode(encoding) for text in string_array_1d] + pad = b"\0" * maxlen + bbytes = [(x + pad)[:maxlen] for x in bbytes] + chararray = np.array([[bb[i : i + 1] for i in range(maxlen)] for bb in bbytes]) + return chararray + + +def convert_bytesarray_to_strings( + byte_array, encoding="utf-8", string_length: int | None = None +): + """Convert bytes to strings. + + N.B. for now at least, we assume the string dim is **always the last one**. + """ + bytes_shape = byte_array.shape + var_shape = bytes_shape[:-1] + if string_length is None: + string_length = bytes_shape[-1] + string_dtype = f"U{string_length}" + result = np.empty(var_shape, dtype=string_dtype) + for ndindex in np.ndindex(var_shape): + element_bytes = byte_array[ndindex] + bytes = b"".join([b if b else b"\0" for b in element_bytes]) + string = bytes.decode(encoding) + result[ndindex] = string + return result + + +INCLUDE_COORD = True +# INCLUDE_COORD = False + +INCLUDE_NUMERIC_AUXCOORD = True +# INCLUDE_NUMERIC_AUXCOORD = False + + +# DATASET_CLASS = _thread_safe_nc.DatasetWrapper +DATASET_CLASS = _bytecoding_datasets.EncodedDataset + + +def make_testfile(filepath, chararray, coordarray, encoding_str=None): + ds = DATASET_CLASS(filepath, "w") + try: + ds.createDimension("x", NX) + ds.createDimension("nstr", N_STRLEN) + vx = ds.createVariable("x", int, dimensions=("x")) + vx[:] = np.arange(NX) + if INCLUDE_COORD: + ds.createDimension("nstr2", N_STRLEN) + v_co = ds.createVariable( + "v_co", + "S1", + dimensions=( + "x", + "nstr2", + ), + ) + v_co[:] = coordarray + if encoding_str is not None: + v_co._Encoding = encoding_str + if INCLUDE_NUMERIC_AUXCOORD: + v_num = ds.createVariable( + "v_num", + float, + dimensions=("x",), + ) + v_num[:] = np.arange(NX) + v = ds.createVariable( + "v", 
+ "S1", + dimensions=( + "x", + "nstr", + ), + ) + v[:] = chararray + if encoding_str is not None: + v._Encoding = encoding_str + if INCLUDE_COORD: + coords_str = "v_co" + if INCLUDE_NUMERIC_AUXCOORD: + coords_str += " v_num" + v.coordinates = coords_str + finally: + ds.close() + + +def make_testcube( + dataarray, + coordarray, # for now, these are always *string* arrays + encoding_str: str | None = None, +): + cube = Cube(dataarray, var_name="v") + cube.add_dim_coord(DimCoord(np.arange(NX), var_name="x"), 0) + if encoding_str is not None: + cube.attributes["_Encoding"] = encoding_str + if INCLUDE_COORD: + co_x = AuxCoord(coordarray, var_name="v_co") + if encoding_str is not None: + co_x.attributes["_Encoding"] = encoding_str + cube.add_aux_coord(co_x, 0) + return cube + + +NCDUMP_PATHSTR = str(env_bin_path("ncdump")) + + +def ncdump(nc_path: str, *args): + """Call ncdump to print a dump of a file.""" + call_args = [NCDUMP_PATHSTR, nc_path] + list(args) + bytes = subprocess.check_output(call_args) + text = bytes.decode("utf-8") + print(text) + return text + + +def show_result(filepath): + print(f"File {filepath}") + print("NCDUMP:") + ncdump(filepath) + # with nc.Dataset(filepath, "r") as ds: + # v = ds.variables["v"] + # print("\n----\nNetcdf data readback (basic)") + # try: + # print(repr(v[:])) + # except UnicodeDecodeError as err: + # print(repr(err)) + # print("..raw:") + # v.set_auto_chartostring(False) + # print(repr(v[:])) + print("\nAs iris cube..") + try: + iris.loading.LOAD_PROBLEMS.reset() + cube = iris.load_cube(filepath) + print(cube) + if iris.loading.LOAD_PROBLEMS.problems: + print(iris.loading.LOAD_PROBLEMS) + print( + "\n".join(iris.loading.LOAD_PROBLEMS.problems[0].stack_trace.format()) + ) + print("-data-") + print(repr(cube.data)) + print("-numeric auxcoord data-") + print(repr(cube.coord("x").points)) + if INCLUDE_COORD: + print("-string auxcoord data-") + try: + print(repr(cube.coord("v_co").points)) + except Exception as err2: + 
print(repr(err2)) + except UnicodeDecodeError as err: + print(repr(err)) + + +@pytest.fixture(scope="session") +def save_dir(tmp_path_factory): + return tmp_path_factory.mktemp("save_files") + + +# TODO: the tests don't test things properly yet, they just exercise the code and print +# things for manual debugging. +tsts = ( + None, + "ascii", + "utf-8", + "utf-32", +) +# tsts = ("utf-8",) +# tsts = ("utf-8", "utf-32",) +# tsts = ("utf-32",) +# tsts = ("utf-8", "ascii", "utf-8") + + +@pytest.mark.parametrize("encoding", tsts) +def test_load_encodings(encoding, save_dir): + # small change + print(f"\n=========\nTesting encoding: {encoding}") + filepath = save_dir / f"tmp_load_{str(encoding)}.nc" + do_as = encoding + if encoding != "utf-32": + do_as = "utf-8" + TEST_CHARARRAY = convert_strings_to_chararray( + TEST_STRINGS, N_STRLEN, encoding=do_as + ) + TEST_COORDARRAY = convert_strings_to_chararray( + TEST_COORD_VALS, N_STRLEN, encoding=do_as + ) + make_testfile(filepath, TEST_CHARARRAY, TEST_COORDARRAY, encoding_str=encoding) + show_result(filepath) + + +@pytest.mark.parametrize("encoding", tsts) +def test_save_encodings(encoding, save_dir): + cube = make_testcube( + dataarray=TEST_STRINGS, coordarray=TEST_COORD_VALS, encoding_str=encoding + ) + print(cube) + filepath = save_dir / f"tmp_save_{str(encoding)}.nc" + if encoding == "ascii": + with pytest.raises( + UnicodeEncodeError, + match="'ascii' codec can't encode character.*not in range", + ): + iris.save(cube, filepath) + else: + iris.save(cube, filepath) + show_result(filepath) diff --git a/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_auxiliary_coordinate.py b/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_auxiliary_coordinate.py index a44986ec98..5ed3413409 100644 --- a/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_auxiliary_coordinate.py +++ 
b/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_auxiliary_coordinate.py @@ -44,7 +44,9 @@ def setUp(self): self.engine = mock.Mock( cube=mock.Mock(), - cf_var=mock.Mock(dimensions=("foo", "bar"), cf_data=cf_data), + cf_var=mock.Mock( + dimensions=("foo", "bar"), cf_data=cf_data, dtype=np.int32 + ), filename="DUMMY", cube_parts=dict(coordinates=[]), ) @@ -174,7 +176,7 @@ def setUp(self): self.engine = mock.Mock( cube=mock.Mock(), - cf_var=mock.Mock(dimensions=("foo", "bar")), + cf_var=mock.Mock(dimensions=("foo", "bar"), dtype=np.int32), filename="DUMMY", cube_parts=dict(coordinates=[]), ) @@ -244,7 +246,7 @@ def setUp(self): # Create dummy pyke engine. self.engine = mock.Mock( cube=mock.Mock(), - cf_var=mock.Mock(dimensions=("foo", "bar")), + cf_var=mock.Mock(dimensions=("foo", "bar"), dtype=np.float32), filename="DUMMY", cube_parts=dict(coordinates=[]), ) diff --git a/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_dimension_coordinate.py b/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_dimension_coordinate.py index a871c967ab..26e25a6d95 100644 --- a/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_dimension_coordinate.py +++ b/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_dimension_coordinate.py @@ -50,7 +50,7 @@ def setUp(self): # Create dummy pyke engine. self.engine = mock.Mock( cube=mock.Mock(), - cf_var=mock.Mock(dimensions=("foo", "bar")), + cf_var=mock.Mock(dimensions=("foo", "bar"), dtype=np.int32), filename="DUMMY", cube_parts=dict(coordinates=[]), ) diff --git a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt new file mode 100644 index 0000000000..07a0bc3bcd --- /dev/null +++ b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt @@ -0,0 +1,191 @@ +=========== +Outstanding Qs +* What would we like to do with all this IN IRIS?? 
+ - generally present as string arrays (Uxx) + - existing scheme of naming dims for length + re-using is quite cunning! + - choice of seeing actual character arrays as alternative to string conversions? + +* string length handling for load/save/roundtrip + - on SAVE, we need some control so we can create files which are compatible, + irrespective of the data (which currently we are not doing) + - ALSO this is wanted to ensure that multiple vars (e.g. string cubes or string coords) + will share the string dim -- instead of creating arbitrary different ones + - presumably, if encoding blows the max-len, we must get a warning/error + + - on LOAD, we may want to *capture* the actual original string dim length, so it can be + re-created on save (by some scheme, as per previous) -- i.e. enable roundtripping. + I don't really want to preserve the name of the string dim, but this could be a + slightly tender point. To consider also : the impact of this on the non-equivalence + of loaded cubes, if we use actual *attributes* to carry this info (see below). + - **if not** : just load data + convert to string arrays as seems best + - this will also lead to incompatible cubes. + + - on SAVE, in the absence of strlen-controls, what is a reasonable default choice? + - take longest encoded + - set nbytes = NEXPAND(encoding) * nchars + - sensible values would depend on the encoding... + : ascii -> 1 + : utf-8 -> 1 or 4 ??? + : utf-16 -> 2 or 4 ??? + : utf-32 -> 4 + + - on LOAD, in absence of strlen controls, how do we choose the result DTYPE (i.e. character length)? + - again, may depend on the encoding: + : ascii = "U" + : UTF-8 = "U" + : UTF-16 = "U" + : UTF-32 = "U" + - N.B. these are ll at least "safe" - i.e. won't lose characters + + +separately from these, there is the question of how the controls affect "normal" +cube operations. 
+ - the easiest approach is to define a "special" attribute, + which can be set on any cube/component + - using the dtype-length of the data would be *possible*, in conjunction with the + above-proposed "default rules" for choosing strlen from the dtype. + But this might not round-trip in all cases. + +within the actual data arrays + - we can't really expect any different to what numpy does + - that is, the dtype-length of any element <= that of the array (and not ==) + this may be tricky, but we can't easily prevent it. + >>> a = np.array(['', 'a', 'bb']) + >>> a + array(['', 'a', 'bb'], dtype='>> a[0].dtype + dtype('>> a[1].dtype + dtype('>> a[2].dtype + dtype('>> a.dtype + dtype('>> + - likewise, we can't assign without possible truncation. + If you **want** to expand the supported width, can use ".astype()" first ? + + +======================== +========================= + +forms in files: + * char chardata(dim1, dim2, strlen_xx); # char data + * string data(dim1, dim2); + +netcdf types: +(netcdf docs terms) + NC_BYTE 8-bit signed integer + NC_UBYTE 8-bit unsigned integer + NC_CHAR 8-bit character + NC_STRING variable length character string + +***NOTE*** there is no NC_UCHAR or "unsigned char" type + + +relevant numpy base types (scalar dtypes): + * "S" bytes : np.bytes_ == np.int8 + * "B" unsigned bytes : np.ubyte == np.uint8 + * 'i' ints : np.int_ + * 'u' unsigned ints : np.int_ + * "U" unicode string : np.str_ + +forms in numpy: + * np.ndarray(dtype="S1") # char data + * np.ndarray(dtype="Snn") # char data + * np.ndarray(dtype="Unn") # strings + * np.ndarray(dtype="") + +possibilities in createVariable: +""" + The datatype can be a numpy datatype object, or a string that describes a numpy dtype object ... + datatype can also be a CompoundType instance (for a structured, or compound array), a VLType instance (for a variable-length array), +** or the python str builtin (for a variable-length string array). 
+** Numpy string and unicode datatypes with length greater than one are aliases for str. +""" + +test types: + "i1" : np.int8 + "u1" : np.uint8 + "S1" : np.byte_ + "U1" : np.str_ + "S" : + "U" : with/without non-ascii content + +save all these to files... +outputs from "test_nc_dtypes.py" test run: + SPEC:i1 SAVED-AS:int8 byte RELOAD-AS:int8 + SPEC:u1 SAVED-AS:uint8 ubyte RELOAD-AS:uint8 + SPEC:S1 SAVED-AS:|S1 char RELOAD-AS: () + SPEC:U1 SAVED-AS: EncodedDataset: + """Create a test EncodedDataset linked to an actual file. + + * strlen becomes the string dimension (i.e. a number of *bytes*) + * a variable "vxs" is created + * If 'encoding' is given, the "vxs::_Encoding" attribute is created with this value + """ + ds = EncodedDataset(path, "w") + ds.createDimension("x", 3) + ds.createDimension("strlen", strlen) + v = ds.createVariable("vxs", "S1", ("x", "strlen")) + if encoding is not None: + v.setncattr("_Encoding", encoding) + return ds + + +def fetch_undecoded_var(path, varname): + # Open a path as a "normal" dataset, and return a given variable. + ds_normal = DatasetWrapper(path) + ds_normal._contained_instance.set_auto_chartostring(False) + v = ds_normal.variables[varname] + # Return a variable, rather than its data, so we can check attributes etc. + return v + + +def check_array_matching(arr1, arr2): + """Check for arrays matching shape, dtype and content.""" + assert ( + arr1.shape == arr2.shape and arr1.dtype == arr2.dtype and np.all(arr1 == arr2) + ) + + +def check_raw_content(path, varname, expected_byte_array): + v = fetch_undecoded_var(path, varname) + bytes_result = v[:] + check_array_matching(bytes_result, expected_byte_array) + + +def _make_bytearray_inner(data, bytewidth, encoding): + # Convert to a (list of [lists of..]) strings or bytes to a + # (list of [lists of..]) length-1 bytes with an extra dimension. 
+ if isinstance(data, str): + # Convert input strings to bytes + data = data.encode(encoding) + if isinstance(data, bytes): + # iterate over bytes to get a sequence of length-1 bytes (what np.array wants) + result = [data[i : i + 1] for i in range(len(data))] + # pad or truncate everything to the required bytewidth + result = (result + [b"\0"] * bytewidth)[:bytewidth] + else: + # If not string/bytes, expect the input to be a list. + # N.B. the recursion is inefficient, but we don't care about that here + result = [_make_bytearray_inner(part, bytewidth, encoding) for part in data] + return result + + +def make_bytearray(data, bytewidth, encoding="ascii"): + """Convert bytes or lists of bytes into a numpy byte array. + + This is largely to avoid using "encode_stringarray_as_bytearray", since we don't + want to depend on that when we should be testing it. + So, it mostly replicates the function of that, but it does also support bytes in the + input. + """ + # First, Convert to a (list of [lists of]..) length-1 bytes objects + data = _make_bytearray_inner(data, bytewidth, encoding) + # We should now be able to create an array of single bytes. + result = np.array(data) + assert result.dtype == "S1" + return result + + +class TestWriteStrings: + """Test how string data is saved to a file. + + Mostly, we read back data as a "normal" dataset to avoid relying on the read code, + which is separately tested -- see 'TestReadStrings'. + """ + + def test_encodings(self, encoding, tempdir): + # Create a dataset with the variable + path = tempdir / f"test_writestrings_encoding_{encoding!s}.nc" + + if encoding in [None, "ascii"]: + writedata = samples_3_ascii + write_encoding = "ascii" + else: + writedata = samples_3_nonascii + write_encoding = encoding + + writedata = writedata.copy() # just for safety? 
+ strlen = strings_maxbytes(writedata, write_encoding) + + ds_encoded = make_encoded_dataset(path, strlen, encoding) + v = ds_encoded.variables["vxs"] + + # Effectively, checks that we *can* write strings + v[:] = writedata + + # Close, re-open as an "ordinary" dataset, and check the raw content. + ds_encoded.close() + expected_bytes = make_bytearray(writedata, strlen, write_encoding) + check_raw_content(path, "vxs", expected_bytes) + + # Check also that the "_Encoding" property is as expected + v = fetch_undecoded_var(path, "vxs") + result_attr = v.getncattr("_Encoding") if "_Encoding" in v.ncattrs() else None + assert result_attr == encoding + + def test_scalar(self, tempdir): + # Like 'test_write_strings', but the variable has *only* the string dimension. + path = tempdir / "test_writestrings_scalar.nc" + + strlen = 5 + ds_encoded = make_encoded_dataset(path, strlen=strlen) + v = ds_encoded.createVariable("v0_scalar", "S1", ("strlen",)) + + # Checks that we *can* write a string + v[:] = np.array("stuff", dtype=str) + + # Close, re-open as an "ordinary" dataset, and check the raw content. + ds_encoded.close() + expected_bytes = make_bytearray(b"stuff", strlen) + check_raw_content(path, "v0_scalar", expected_bytes) + + def test_multidim(self, tempdir): + # Like 'test_write_strings', but the variable has additional dimensions. + path = tempdir / "test_writestrings_multidim.nc" + + strlen = 5 + ds_encoded = make_encoded_dataset(path, strlen=strlen) + ds_encoded.createDimension("y", 2) + v = ds_encoded.createVariable( + "vyxn", + "S1", + ( + "y", + "x", + "strlen", + ), + ) + + # Check that we *can* write a multidimensional string array + test_data = [ + ["one", "n", ""], + ["two", "xxxxx", "four"], + ] + v[:] = test_data + + # Close, re-open as an "ordinary" dataset, and check the raw content. 
+ ds_encoded.close() + expected_bytes = make_bytearray(test_data, strlen) + check_raw_content(path, "vyxn", expected_bytes) + + def test_write_encoding_failure(self, tempdir): + path = tempdir / "test_writestrings_encoding_failure.nc" + ds = make_encoded_dataset(path, strlen=5, encoding="ascii") + v = ds.variables["vxs"] + msg = ( + "String data written to netcdf character variable 'vxs'.*" + " could not be represented in encoding 'ascii'. " + ) + with pytest.raises(ValueError, match=msg): + v[:] = samples_3_nonascii + + def test_overlength(self, tempdir): + # Check expected behaviour with over-length data + path = tempdir / "test_writestrings_overlength.nc" + strlen = 5 + ds = make_encoded_dataset(path, strlen=strlen, encoding="ascii") + v = ds.variables["vxs"] + v[:] = ["1", "123456789", "two"] + expected_bytes = make_bytearray(["1", "12345", "two"], strlen) + check_raw_content(path, "vxs", expected_bytes) + + def test_overlength_splitcoding(self, tempdir): + # Check expected behaviour when non-ascii multibyte coding gets truncated + path = tempdir / "test_writestrings_overlength_splitcoding.nc" + strlen = 5 + ds = make_encoded_dataset(path, strlen=strlen, encoding="utf-8") + v = ds.variables["vxs"] + v[:] = ["1", "1234ü", "two"] + # This creates a problem: it won't read back + msg = ( + "Character data in variable 'vxs' could not be decoded " + "with the 'utf-8' encoding." + ) + with pytest.raises(ValueError, match=msg): + v[:] + + # Check also that we *can* read the raw content. 
+ ds.close() + expected_bytes = [ + b"1", + b"1234\xc3", # NOTE: truncated encoding + b"two", + ] + expected_bytearray = make_bytearray(expected_bytes, strlen) + check_raw_content(path, "vxs", expected_bytearray) + + +class TestWriteChars: + @pytest.mark.parametrize("write_form", ["strings", "bytes"]) + def test_write_chars(self, tempdir, write_form): + encoding = "utf-8" + write_strings = samples_3_nonascii + strlen = strings_maxbytes(write_strings, encoding) + write_bytes = make_bytearray(write_strings, strlen, encoding=encoding) + # NOTE: 'flexi' form util decides the width needs to be 7 !! + path = tempdir / f"test_writechars_{write_form}.nc" + ds = make_encoded_dataset(path, encoding=encoding, strlen=strlen) + v = ds.variables["vxs"] + + # assign in *either* way.. + if write_form == "strings": + v[:] = write_strings + else: + v[:] = write_bytes + + # .. the result should be the same + ds.close() + check_raw_content(path, "vxs", write_bytes) + + +class TestRead: + """Test how character data is read and converted to strings. + + N.B. many testcases here parallel the 'TestWriteStrings' : we are creating test + datafiles with 'make_dataset' and assigning raw bytes, as-per 'TestWriteChars'. + + We are mostly checking here that reading back produces string arrays as expected. + However, it is simple + convenient to also check the 'DECODE_TO_STRINGS_ON_READ' + function here, i.e. "raw" bytes reads. So that is also done in this class. + """ + + @pytest.fixture(params=["strings", "bytes"]) + def readmode(self, request): + return request.param + + def test_encodings(self, encoding, tempdir, readmode): + # Create a dataset with the variable + path = tempdir / f"test_read_encodings_{encoding!s}_{readmode}.nc" + + if encoding in [None, "ascii"]: + write_strings = samples_3_ascii + write_encoding = "ascii" + else: + write_strings = samples_3_nonascii + write_encoding = encoding + + write_strings = write_strings.copy() # just for safety? 
+ strlen = strings_maxbytes(write_strings, write_encoding) + write_bytes = make_bytearray(write_strings, strlen, encoding=write_encoding) + + ds_encoded = make_encoded_dataset(path, strlen, encoding) + v = ds_encoded.variables["vxs"] + v[:] = write_bytes + + if readmode == "strings": + # Test "normal" read --> string array + result = v[:] + expected = write_strings + if encoding == "utf-8": + # In this case, with the given non-ascii sample data, the + # "default minimum string length" is overestimated. + assert strlen == 7 and result.dtype == "U7" + # correct the result dtype to pass the write_strings comparison below + truncated_result = result.astype("U4") + # Also check that content is the same (i.e. not actually truncated) + assert np.all(truncated_result == result) + result = truncated_result + else: + # Test "raw" read --> byte array + with DECODE_TO_STRINGS_ON_READ.context(False): + result = v[:] + expected = write_bytes + + check_array_matching(result, expected) + + def test_scalar(self, tempdir, readmode): + # Like 'test_write_strings', but the variable has *only* the string dimension. + path = tempdir / f"test_read_scalar_{readmode}.nc" + + strlen = 5 + ds_encoded = make_encoded_dataset(path, strlen=strlen) + v = ds_encoded.createVariable("v0_scalar", "S1", ("strlen",)) + + data_string = "stuff" + data_bytes = make_bytearray(data_string, 5) + + # Checks that we *can* write a string + v[:] = data_bytes + + if readmode == "strings": + # Test "normal" read --> string array + result = v[:] + expected = np.array(data_string) + else: + # Test "raw" read --> byte array + with DECODE_TO_STRINGS_ON_READ.context(False): + result = v[:] + expected = data_bytes + + check_array_matching(result, expected) + + def test_multidim(self, tempdir, readmode): + # Like 'test_write_strings', but the variable has additional dimensions. 
+ path = tempdir / f"test_read_multidim_{readmode}.nc" + + strlen = 5 + ds_encoded = make_encoded_dataset(path, strlen=strlen) + ds_encoded.createDimension("y", 2) + v = ds_encoded.createVariable( + "vyxn", + "S1", + ( + "y", + "x", + "strlen", + ), + ) + + # Check that we *can* write a multidimensional string array + test_strings = [ + ["one", "n", ""], + ["two", "xxxxx", "four"], + ] + test_bytes = make_bytearray(test_strings, strlen) + v[:] = test_bytes + + if readmode == "strings": + # Test "normal" read --> string array + result = v[:] + expected = np.array(test_strings) + else: + # Test "raw" read --> byte array + with DECODE_TO_STRINGS_ON_READ.context(False): + result = v[:] + expected = test_bytes + + check_array_matching(result, expected) + + def test_read_encoding_failure(self, tempdir, readmode): + path = tempdir / f"test_read_encoding_failure_{readmode}.nc" + strlen = 10 + ds = make_encoded_dataset(path, strlen=strlen, encoding="ascii") + v = ds.variables["vxs"] + test_utf8_bytes = make_bytearray( + samples_3_nonascii, bytewidth=strlen, encoding="utf-8" + ) + v[:] = test_utf8_bytes + + if readmode == "strings": + msg = ( + "Character data in variable 'vxs' could not be decoded " + "with the 'ascii' encoding." + ) + with pytest.raises(ValueError, match=msg): + v[:] + else: + with DECODE_TO_STRINGS_ON_READ.context(False): + result = v[:] # this ought to be ok! + + assert np.all(result == test_utf8_bytes) diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_nc_dtypes.py b/lib/iris/tests/unit/fileformats/netcdf/test_nc_dtypes.py new file mode 100644 index 0000000000..0c5d2b279e --- /dev/null +++ b/lib/iris/tests/unit/fileformats/netcdf/test_nc_dtypes.py @@ -0,0 +1,96 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. 
+"""Temporary code to confirm how various numpy dtypes are stored in a netcdf file.""" + +import netCDF4 as nc +import numpy as np +import pytest + +from iris.tests.integration.netcdf.test_chararrays import ncdump + +# types = [ +# "i1", # np.int8 +# "u1", # np.uint8 +# "S1", # np.byte_ +# "U1", # np.str_ +# "S", # multibytes +# "U", # unicode strings, with/without non-ascii content +# ] + +samples = { + "i1": [-5, 7, 35], # np.int8 + "u1": [65, 67, 90], # np.uint8 + "S1": [b"A", b"B", b"Z"], # np.byte_ + "U1": ["A", "B", "C"], # np.str_ + "S": [b"one21", b"three", b""], # multibyte + "U": ["one", "éclair", "nine"], # unicode strings +} +sample_arrays = { + type_code: np.array(values, dtype=type_code) + for type_code, values in samples.items() +} + + +@pytest.fixture(scope="module") +def tmpdir(tmp_path_factory): + return tmp_path_factory.mktemp("netcdf") + + +def create_file(array: np.ndarray, path): + with nc.Dataset(str(path), "w") as ds: + ds.createDimension("x", 3) + v = ds.createVariable("vx", array.dtype, ("x",)) + # v.set_auto_chartostring(False) + v._Encoding = "UTF-8" if array.dtype.kind == "U" else "ascii" + v[:] = array + + +def get_loadback_array(path): + with nc.Dataset(str(path), "r") as ds: + v = ds.variables["vx"] + v.set_auto_chartostring(False) + result = v[:] + return result + + +@pytest.mark.parametrize("dtype", list(samples.keys())) +def test(tmpdir, dtype): + arr = sample_arrays[dtype] + print("\n---") + print(dtype) + path = tmpdir / f"tmp_{dtype}.nc" + create_file(arr, path) + ncdump(path, "-s") + loadback_array = get_loadback_array(path) + print(f" SPEC:{dtype} SAVED-AS:{arr.dtype} RELOAD-AS:{loadback_array.dtype}") + + +# from iris.tests import env_bin_path +# NCGEN_PATHSTR = str(env_bin_path("ncgen")) +# +# +# def ncgen(cdl_path, nc_path, *args): +# """Call ncdump to print a dump of a file.""" +# args = list(args) +# if not any(arg.startswith('-k') for arg in args): +# args[:0] = ["-k", "nc4"] # force netcdf4 +# call_args = 
[NCGEN_PATHSTR] + list(args) + [str(cdl_path), '-o', str(nc_path)] +# subprocess.check_call(call_args) +# +# +# def test_uchar(tmpdir): +# arr = sample_arrays["S1"] +# path = tmpdir / f"tmp_ichar.nc" +# create_file(arr, path) +# text = ncdump(path, "-s") +# text_u = text.replace("\t", " ") +# text_u = text_u.replace(" char ", " unsigned char ") +# cdl_path = tmpdir / f"tmp_uchar.cdl" +# with open(cdl_path, "w") as f_out: +# f_out.write(text_u) +# nc_path_2 = tmpdir / f"tmp_uchar.nc" +# ncgen(cdl_path, nc_path_2) +# loadback_array = get_loadback_array(nc_path_2) +# print(f" netcdf type 'uchar' LOADS-AS:{loadback_array.dtype}") diff --git a/noxfile.py b/noxfile.py index 415e4fc3d5..e8d5555401 100644 --- a/noxfile.py +++ b/noxfile.py @@ -183,6 +183,7 @@ def tests(session: nox.sessions.Session): session.env.update(ENV) run_args = [ "pytest", + "-v", "-n", "auto", "lib/iris/tests",