From a65aaea8e714c180d5b02ed4c96c17bb2f97b0c7 Mon Sep 17 00:00:00 2001
From: Bouwe Andela
Date: Thu, 6 Feb 2025 17:50:48 +0100
Subject: [PATCH] Improve speed of loading small NetCDF files (#6229)

* Read list of variables only once

* Add whatsnew

* Improve whatsnew

* Add benchmark for files with many cubes
---
 benchmarks/benchmarks/load/__init__.py | 52 ++++++++++++++++++++++++++
 docs/src/whatsnew/latest.rst           |  3 +-
 lib/iris/cube.py                       |  3 +-
 lib/iris/fileformats/cf.py             | 30 +++++++--------
 4 files changed, 71 insertions(+), 17 deletions(-)

diff --git a/benchmarks/benchmarks/load/__init__.py b/benchmarks/benchmarks/load/__init__.py
index a4dfb40d19..5c5a62a515 100644
--- a/benchmarks/benchmarks/load/__init__.py
+++ b/benchmarks/benchmarks/load/__init__.py
@@ -132,6 +132,58 @@ def time_many_var_load(self) -> None:
         _ = load(str(self.FILE_PATH))
 
 
+class ManyCubes:
+    FILE_PATH = BENCHMARK_DATA / "many_cube_file.nc"
+
+    @staticmethod
+    def _create_file(save_path: str) -> None:
+        """Run externally - everything must be self-contained."""
+        import numpy as np
+
+        from iris import save
+        from iris.coords import AuxCoord, DimCoord
+        from iris.cube import Cube, CubeList
+
+        data_len = 81920
+        bnds_len = 3
+        data = np.arange(data_len).astype(np.float32)
+        bnds_data = (
+            np.arange(data_len * bnds_len)
+            .astype(np.float32)
+            .reshape(data_len, bnds_len)
+        )
+        time = DimCoord(np.array([0]), standard_name="time")
+        lat = AuxCoord(
+            data, bounds=bnds_data, standard_name="latitude", units="degrees"
+        )
+        lon = AuxCoord(
+            data, bounds=bnds_data, standard_name="longitude", units="degrees"
+        )
+        cube = Cube(data.reshape(1, -1), units="unknown")
+        cube.add_dim_coord(time, 0)
+        cube.add_aux_coord(lat, 1)
+        cube.add_aux_coord(lon, 1)
+
+        n_cubes = 100
+        cubes = CubeList()
+        for i in range(n_cubes):
+            cube = cube.copy()
+            cube.long_name = f"var_{i}"
+            cubes.append(cube)
+        save(cubes, save_path)
+
+    def setup_cache(self) -> None:
+        if not REUSE_DATA or not self.FILE_PATH.is_file():
+            # See :mod:`benchmarks.generate_data` docstring for full explanation.
+            _ = run_function_elsewhere(
+                self._create_file,
+                str(self.FILE_PATH),
+            )
+
+    def time_many_cube_load(self) -> None:
+        _ = load(str(self.FILE_PATH))
+
+
 class StructuredFF:
     """Test structured loading of a large-ish fieldsfile.
 
diff --git a/docs/src/whatsnew/latest.rst b/docs/src/whatsnew/latest.rst
index e377805866..74d090a006 100644
--- a/docs/src/whatsnew/latest.rst
+++ b/docs/src/whatsnew/latest.rst
@@ -60,7 +60,8 @@ This document explains the changes made to Iris for this release
 🚀 Performance Enhancements
 ===========================
 
-#. N/A
+#. `@bouweandela`_ made loading :class:`~iris.cube.Cube`s from small NetCDF
+   files faster. (:pull:`6229`)
 
 #. `@fnattino`_ enabled lazy cube interpolation using the linear and
    nearest-neighbour interpolators (:class:`iris.analysis.Linear` and
diff --git a/lib/iris/cube.py b/lib/iris/cube.py
index e84ff202b9..bb11f65440 100644
--- a/lib/iris/cube.py
+++ b/lib/iris/cube.py
@@ -924,7 +924,8 @@ def __eq__(self, other):
         # For equality, require both globals + locals to match exactly.
         # NOTE: array content works correctly, since 'locals' and 'globals' are always
         # iris.common.mixin.LimitedAttributeDict, which gets this right.
-        other = CubeAttrsDict(other)
+        if not isinstance(other, CubeAttrsDict):
+            other = CubeAttrsDict(other)
         result = self.locals == other.locals and self.globals == other.globals
         return result
 
diff --git a/lib/iris/fileformats/cf.py b/lib/iris/fileformats/cf.py
index e7b0d8063e..82be010c6e 100644
--- a/lib/iris/fileformats/cf.py
+++ b/lib/iris/fileformats/cf.py
@@ -1336,9 +1336,11 @@ def __init__(self, file_source, warn=False, monotonic=False):
             self._trim_ugrid_variable_types()
             self._with_ugrid = False
 
-        self._translate()
-        self._build_cf_groups()
-        self._reset()
+        # Read the variables in the dataset only once to reduce runtime.
+        variables = self._dataset.variables
+        self._translate(variables)
+        self._build_cf_groups(variables)
+        self._reset(variables)
 
     def __enter__(self):
         # Enable use as a context manager
@@ -1380,16 +1382,16 @@ def filename(self):
     def __repr__(self):
         return "%s(%r)" % (self.__class__.__name__, self._filename)
 
-    def _translate(self):
+    def _translate(self, variables):
         """Classify the netCDF variables into CF-netCDF variables."""
-        netcdf_variable_names = list(self._dataset.variables.keys())
+        netcdf_variable_names = list(variables.keys())
 
         # Identify all CF coordinate variables first. This must be done
         # first as, by CF convention, the definition of a CF auxiliary
         # coordinate variable may include a scalar CF coordinate variable,
         # whereas we want these two types of variables to be mutually exclusive.
         coords = CFCoordinateVariable.identify(
-            self._dataset.variables, monotonic=self._check_monotonic
+            variables, monotonic=self._check_monotonic
         )
         self.cf_group.update(coords)
         coordinate_names = list(self.cf_group.coordinates.keys())
@@ -1402,9 +1404,7 @@ def _translate(self):
                 if issubclass(variable_type, CFGridMappingVariable)
                 else coordinate_names
             )
-            self.cf_group.update(
-                variable_type.identify(self._dataset.variables, ignore=ignore)
-            )
+            self.cf_group.update(variable_type.identify(variables, ignore=ignore))
 
         # Identify global netCDF attributes.
         attr_dict = {
@@ -1414,7 +1414,7 @@ def _translate(self):
         self.cf_group.global_attributes.update(attr_dict)
 
         # Identify and register all CF formula terms.
-        formula_terms = _CFFormulaTermsVariable.identify(self._dataset.variables)
+        formula_terms = _CFFormulaTermsVariable.identify(variables)
 
         for cf_var in formula_terms.values():
             for cf_root, cf_term in cf_var.cf_terms_by_root.items():
@@ -1433,9 +1433,9 @@ def _translate(self):
         )
 
         for name in data_variable_names:
-            self.cf_group[name] = CFDataVariable(name, self._dataset.variables[name])
+            self.cf_group[name] = CFDataVariable(name, variables[name])
 
-    def _build_cf_groups(self):
+    def _build_cf_groups(self, variables):
         """Build the first order relationships between CF-netCDF variables."""
 
         def _build(cf_variable):
@@ -1489,7 +1489,7 @@ def _span_check(
                     ignore += coordinate_names
 
                 match = variable_type.identify(
-                    self._dataset.variables,
+                    variables,
                     ignore=ignore,
                     target=cf_variable.cf_name,
                     warn=False,
@@ -1569,9 +1569,9 @@ def _span_check(
                 promoted.add(cf_name)
             not_promoted = ignored.difference(promoted)
 
-    def _reset(self):
+    def _reset(self, variables):
         """Reset the attribute touch history of each variable."""
-        for nc_var_name in self._dataset.variables.keys():
+        for nc_var_name in variables.keys():
             self.cf_group[nc_var_name].cf_attrs_reset()
 
     def _close(self):
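
The cf.py hunks above all apply one idea: CFReader.__init__ now reads self._dataset.variables a single time and passes that mapping into _translate, _build_cf_groups and _reset, rather than each helper re-reading the attribute for every lookup; the cube.py hunk makes the same kind of saving by skipping the CubeAttrsDict(other) conversion when other already is one. The snippet below is a minimal, self-contained sketch of why hoisting the lookup helps. ToyDataset, classify_repeated and classify_once are hypothetical names invented for illustration, not Iris APIs, and the property deliberately rebuilds its mapping to mimic an accessor with per-call overhead.

import time


class ToyDataset:
    """Stand-in for a dataset whose ``variables`` accessor is not free."""

    def __init__(self, n_vars: int) -> None:
        self._names = [f"var_{i}" for i in range(n_vars)]

    @property
    def variables(self) -> dict:
        # Rebuild the mapping on every access to mimic per-call overhead
        # (for example, a wrapper re-wrapping every variable each time).
        return {name: object() for name in self._names}


def classify_repeated(dataset: ToyDataset, passes: int) -> int:
    """Anti-pattern: touch ``dataset.variables`` inside every step."""
    total = 0
    for _ in range(passes):
        total += len(dataset.variables)
    return total


def classify_once(dataset: ToyDataset, passes: int) -> int:
    """Pattern used by the patch: read the mapping once, then reuse it."""
    variables = dataset.variables
    total = 0
    for _ in range(passes):
        total += len(variables)
    return total


if __name__ == "__main__":
    ds = ToyDataset(n_vars=5000)
    for func in (classify_repeated, classify_once):
        start = time.perf_counter()
        func(ds, passes=200)
        print(f"{func.__name__}: {time.perf_counter() - start:.3f}s")

Run as a script, the second variant should be noticeably faster on this toy setup; the new ManyCubes benchmark plays the equivalent tracking role for the real loader.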