Draft
Changes from 7 commits
18 changes: 11 additions & 7 deletions lib/iris/fileformats/_nc_load_rules/helpers.py
@@ -708,13 +708,13 @@ def build_and_add_global_attributes(engine: Engine):
),
)
if problem is not None:
stack_notes = problem.stack_trace.__notes__
stack_notes = problem.stack_trace.__notes__ # type: ignore[attr-defined]
if stack_notes is None:
stack_notes = []
stack_notes.append(
f"Skipping disallowed global attribute '{attr_name}' (see above error)"
)
problem.stack_trace.__notes__ = stack_notes
problem.stack_trace.__notes__ = stack_notes # type: ignore[attr-defined]


################################################################################
@@ -1536,14 +1536,14 @@ def build_and_add_dimension_coordinate(
)
if problem is not None:
coord_var_name = str(cf_coord_var.cf_name)
stack_notes = problem.stack_trace.__notes__
stack_notes = problem.stack_trace.__notes__ # type: ignore[attr-defined]
if stack_notes is None:
stack_notes = []
stack_notes.append(
f"Failed to create {coord_var_name} dimension coordinate:\n"
f"Gracefully creating {coord_var_name!r} auxiliary coordinate instead."
)
problem.stack_trace.__notes__ = stack_notes
problem.stack_trace.__notes__ = stack_notes # type: ignore[attr-defined]
problem.handled = True

_ = _add_or_capture(
@@ -1643,9 +1643,13 @@ def _add_auxiliary_coordinate(

# Determine the name of the dimension/s shared between the CF-netCDF data variable
# and the coordinate being built.
common_dims = [
dim for dim in cf_coord_var.dimensions if dim in engine.cf_var.dimensions
]
coord_dims = cf_coord_var.dimensions
Member Author:

NOTE: this possibly needs to be implemented for ancillary-variables too

  • which might also be strings
  • which is awkward because of a DRY failure in the rules code
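
A rough sketch of the kind of shared helper this note points at (the helper name is hypothetical, not part of this PR); both the auxiliary-coordinate rule and a future ancillary-variable rule could call it instead of repeating the two-line trimming pattern:

    def _dims_excluding_string_length(cf_var):
        # Return the variable's dimensions, dropping the trailing string-length
        # dimension when the variable holds fixed-width byte strings.
        dims = cf_var.dimensions
        if cf._is_str_dtype(cf_var):
            dims = dims[:-1]
        return dims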

if cf._is_str_dtype(cf_coord_var):
coord_dims = coord_dims[:-1]
datavar_dims = engine.cf_var.dimensions
if cf._is_str_dtype(engine.cf_var):
datavar_dims = datavar_dims[:-1]
common_dims = [dim for dim in coord_dims if dim in datavar_dims]
data_dims = None
if common_dims:
# Calculate the offset of each common dimension.
73 changes: 64 additions & 9 deletions lib/iris/fileformats/cf.py
@@ -15,6 +15,7 @@
"""

from abc import ABCMeta, abstractmethod
import codecs
from collections.abc import Iterable, MutableMapping
import os
import re
@@ -89,6 +90,11 @@ def __init__(self, name, data):

self.cf_data = data
"""NetCDF4 Variable data instance."""
# Note: *always* disable encoding/decoding translations
# To avoid current known problems
# See https://github.com/Unidata/netcdf4-python/issues/1440
data.set_auto_chartostring(False)
# ALSO NOTE: not stored. NetCDFDataProxy must re-assert when re-loading.

"""File source of the NetCDF content."""
try:
@@ -790,25 +796,73 @@ def cf_label_data(self, cf_data_var):

# Determine the name of the label string (or length) dimension by
# finding the dimension name that doesn't exist within the data dimensions.
str_dim_name = list(set(self.dimensions) - set(cf_data_var.dimensions))
str_dim_names = list(set(self.dimensions) - set(cf_data_var.dimensions))
n_nondata_dims = len(str_dim_names)

if n_nondata_dims == 0:
# *All* dims are shared with the data-variable.
# This is only ok if the data-var is *also* a string type.
dim_ok = _is_str_dtype(cf_data_var)
# In this case, we must just *assume* that the last dimension is "the"
# string dimension
str_dim_name = self.dimensions[-1]
else:
# If there is exactly one non-data dim, that is the one we want
dim_ok = len(str_dim_names) == 1
(str_dim_name,) = str_dim_names

if len(str_dim_name) != 1:
if not dim_ok:
raise ValueError(
"Invalid string dimensions for CF-netCDF label variable %r"
% self.cf_name
)

str_dim_name = str_dim_name[0]
label_data = self[:]

if ma.isMaskedArray(label_data):
label_data = label_data.filled()
label_data = label_data.filled(b"\0")

default_encoding = "utf-8"
encoding = getattr(self, "_Encoding", None)
if encoding is None:
# utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data
encoding = default_encoding
else:
try:
# Accept + normalise naming of encodings
encoding = codecs.lookup(encoding).name
# NOTE: if encoding does not suit data, errors can occur.
# For example, _Encoding = "ascii", with non-ascii content.
except LookupError:
# Replace some invalid setting with "safe"(ish) fallback.
encoding = default_encoding

def string_from_1d_bytearray(array, encoding):
r"""Because numpy bytes arrays behave very oddly.

Elements which "should" contain a zero byte b'\0' instead appear to contain
an *empty* byte b''. So a "b''.join()" will *omit* any zero bytes.
"""
assert array.dtype.kind == "S" and array.dtype.itemsize == 1
assert array.ndim == 1
bytelist = [b"\0" if byte == b"" else byte for byte in array]
bytes = b"".join(bytelist)
assert len(bytes) == array.shape[0]
try:
string = bytes.decode(encoding=encoding)
except UnicodeDecodeError:
# if encoding == "ascii":
# print("\n\n*** FIX !!")
# string = bytes.decode("utf-8")
# else:
Comment on lines +854 to +857
Member Author:

TODO: remove

raise
result = string.strip()
return result

# Determine whether we have a string-valued scalar label
# i.e. a character variable that only has one dimension (the length of the string).
if self.ndim == 1:
label_string = b"".join(label_data).strip()
label_string = label_string.decode("utf8")
label_string = string_from_1d_bytearray(label_data, encoding)
data = np.array([label_string])
else:
# Determine the index of the string dimension.
@@ -829,9 +883,10 @@ def cf_label_data(self, cf_data_var):
else:
label_index = index + (slice(None, None),)

label_string = b"".join(label_data[label_index]).strip()
label_string = label_string.decode("utf8")
data[index] = label_string
label_string = string_from_1d_bytearray(
label_data[label_index], encoding
)
data[index] = label_string.strip()

return data

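For context, a minimal sketch of the kind of char-typed label variable, carrying an optional _Encoding attribute, that the reworked cf_label_data is meant to handle (the file and variable names here are made up):

    import netCDF4
    import numpy as np

    with netCDF4.Dataset("labels.nc", "w") as ds:
        ds.createDimension("region", 2)
        ds.createDimension("strlen", 8)
        var = ds.createVariable("region_name", "S1", ("region", "strlen"))
        var.setncattr("_Encoding", "utf-8")   # picked up via getattr(self, "_Encoding", None)
        var.set_auto_chartostring(False)      # write raw bytes, mirroring the read path above
        var[:] = netCDF4.stringtochar(np.array(["north", "south"], dtype="S8"))

On read, the encoding name is normalised with codecs.lookup() and replaced by the utf-8 default if it names an unknown codec.
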
52 changes: 47 additions & 5 deletions lib/iris/fileformats/netcdf/_thread_safe_nc.py
@@ -310,14 +310,39 @@ def fromcdl(cls, *args, **kwargs):
class NetCDFDataProxy:
"""A reference to the data payload of a single NetCDF file variable."""

__slots__ = ("shape", "dtype", "path", "variable_name", "fill_value")

def __init__(self, shape, dtype, path, variable_name, fill_value):
__slots__ = (
"shape",
"dtype",
"path",
"variable_name",
"fill_value",
"is_bytes",
"encoding",
"string_length",
)

def __init__(
self,
shape,
dtype,
path,
variable_name,
fill_value,
encoding: str | None = None,
string_length: int = 0,
):
self.shape = shape
self.dtype = dtype
self.path = path
self.variable_name = variable_name
self.fill_value = fill_value
self.is_bytes = dtype.kind == "S" and dtype.itemsize == 1
if self.is_bytes:
# We will be returning a different shape : the last dim is the byte-length
self.shape = self.shape[:-1]
self.dtype = np.dtype(f"U{string_length}")
self.encoding = encoding
self.string_length = string_length

@property
def ndim(self):
@@ -336,11 +361,26 @@ def __getitem__(self, keys):
dataset = netCDF4.Dataset(self.path)
try:
variable = dataset.variables[self.variable_name]
# ALWAYS disable byte encoding/decoding
# To avoid current known problems
# See https://github.com/Unidata/netcdf4-python/issues/1440
variable.set_auto_chartostring(False)

# Get the NetCDF variable data and slice.
var = variable[keys]
data = variable[keys]

# If bytes, decode to strings
if self.is_bytes:
from iris.util import convert_bytesarray_to_strings

data = convert_bytesarray_to_strings(
data,
encoding=self.encoding,
string_length=self.string_length,
)
finally:
dataset.close()
return np.asanyarray(var)
return np.asanyarray(data)

def __repr__(self):
fmt = (
@@ -388,6 +428,8 @@ def __setitem__(self, keys, array_data):
try:
dataset = netCDF4.Dataset(self.path, "r+")
var = dataset.variables[self.varname]
# **Always** disable encode/decode of bytes to strings
var.set_auto_chartostring(False)
var[keys] = array_data
finally:
try:
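The helper iris.util.convert_bytesarray_to_strings called in __getitem__ above is added elsewhere in this PR and is not shown in this diff. Purely to illustrate the behaviour assumed here (collapse the trailing byte dimension and decode with the given encoding), it might look roughly like this:

    import numpy as np

    def convert_bytesarray_to_strings(data, encoding="utf-8", string_length=None):
        # Collapse the trailing byte dimension of an 'S1' array into fixed-width
        # unicode strings, decoding each row with the requested encoding.
        data = np.asarray(data)
        if string_length is None:
            string_length = data.shape[-1]
        rows = data.reshape(-1, data.shape[-1])
        strings = [
            # numpy 'S1' arrays present zero bytes as b'', so restore them before joining.
            b"".join(b"\0" if byte == b"" else byte for byte in row)
            .rstrip(b"\0")
            .decode(encoding)
            for row in rows
        ]
        return np.array(strings, dtype=f"U{string_length}").reshape(data.shape[:-1])
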
38 changes: 37 additions & 1 deletion lib/iris/fileformats/netcdf/loader.py
@@ -11,6 +11,7 @@

"""

import codecs
from collections.abc import Iterable, Iterator, Mapping
from contextlib import contextmanager
from copy import deepcopy
@@ -269,10 +270,36 @@ def _get_cf_var_data(cf_var):
# Normal NCVariable type:
total_bytes = cf_var.size * cf_var.dtype.itemsize

default_encoding = "utf-8"
encoding = getattr(cf_var, "_Encoding", None)
if encoding is None:
# utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data
encoding = default_encoding
else:
try:
# Accept + normalise naming of encodings
encoding = codecs.lookup(encoding).name
# NOTE: if encoding does not suit data, errors can occur.
# For example, _Encoding = "ascii", with non-ascii content.
except LookupError:
# Replace some invalid setting with "safe"(ish) fallback.
encoding = default_encoding

string_length = getattr(cf_var, "iris_string_length", None)

if total_bytes < _LAZYVAR_MIN_BYTES:
# Don't make a lazy array, as it will cost more memory AND more time to access.
result = cf_var[:]

if result.dtype.kind == "S":
from iris.util import convert_bytesarray_to_strings

result = convert_bytesarray_to_strings(
result,
encoding=encoding,
string_length=string_length,
)

# Special handling of masked scalar value; this will be returned as
# an `np.ma.masked` instance which will lose the original dtype.
# Workaround for this it return a 1-element masked array of the
@@ -295,8 +322,17 @@
"_FillValue",
_thread_safe_nc.default_fillvals[fill_dtype],
)

# NOTE: if the data is bytes which need to be converted to strings on read,
# the data-proxy will do that (and it modifies its shape + dtype).
proxy = NetCDFDataProxy(
cf_var.shape, dtype, cf_var.filename, cf_var.cf_name, fill_value
cf_var.shape,
dtype,
cf_var.filename,
cf_var.cf_name,
fill_value,
encoding=encoding,
string_length=string_length,
)
# Get the chunking specified for the variable : this is either a shape, or
# maybe the string "contiguous".
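For orientation, the intended net effect when loading (illustrative only: the file and coordinate names are invented, and this assumes the char-typed label variable is attached to a data variable as an auxiliary coordinate):

    import iris

    cube = iris.load_cube("labelled_data.nc")
    points = cube.coord("region_name").points
    print(points.dtype)   # expected: a unicode dtype such as '<U8', rather than 'S1' bytes

Small variables are decoded eagerly in _get_cf_var_data; larger ones keep a lazy NetCDFDataProxy, which re-disables auto char-to-string conversion and decodes on access.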