Improve speed of loading small NetCDF files (#6229)

* Read list of variables only once
* Add whatsnew
* Improve whatsnew
* Add benchmark for files with many cubes
SciTools · Feb 6, 2025 · a65aaea · a65aaea
1 parent b178f6f
commit a65aaea
Show file tree

Hide file tree

Showing 4 changed files with 71 additions and 17 deletions.
diff --git a/benchmarks/benchmarks/load/__init__.py b/benchmarks/benchmarks/load/__init__.py
@@ -132,6 +132,58 @@ def time_many_var_load(self) -> None:
         _ = load(str(self.FILE_PATH))
 
 
+class ManyCubes:
+    FILE_PATH = BENCHMARK_DATA / "many_cube_file.nc"
+
+    @staticmethod
+    def _create_file(save_path: str) -> None:
+        """Run externally - everything must be self-contained."""
+        import numpy as np
+
+        from iris import save
+        from iris.coords import AuxCoord, DimCoord
+        from iris.cube import Cube, CubeList
+
+        data_len = 81920
+        bnds_len = 3
+        data = np.arange(data_len).astype(np.float32)
+        bnds_data = (
+            np.arange(data_len * bnds_len)
+            .astype(np.float32)
+            .reshape(data_len, bnds_len)
+        )
+        time = DimCoord(np.array([0]), standard_name="time")
+        lat = AuxCoord(
+            data, bounds=bnds_data, standard_name="latitude", units="degrees"
+        )
+        lon = AuxCoord(
+            data, bounds=bnds_data, standard_name="longitude", units="degrees"
+        )
+        cube = Cube(data.reshape(1, -1), units="unknown")
+        cube.add_dim_coord(time, 0)
+        cube.add_aux_coord(lat, 1)
+        cube.add_aux_coord(lon, 1)
+
+        n_cubes = 100
+        cubes = CubeList()
+        for i in range(n_cubes):
+            cube = cube.copy()
+            cube.long_name = f"var_{i}"
+            cubes.append(cube)
+        save(cubes, save_path)
+
+    def setup_cache(self) -> None:
+        if not REUSE_DATA or not self.FILE_PATH.is_file():
+            # See :mod:`benchmarks.generate_data` docstring for full explanation.
+            _ = run_function_elsewhere(
+                self._create_file,
+                str(self.FILE_PATH),
+            )
+
+    def time_many_cube_load(self) -> None:
+        _ = load(str(self.FILE_PATH))
+
+
 class StructuredFF:
     """Test structured loading of a large-ish fieldsfile.
 

diff --git a/docs/src/whatsnew/latest.rst b/docs/src/whatsnew/latest.rst
@@ -60,7 +60,8 @@ This document explains the changes made to Iris for this release
 🚀 Performance Enhancements
 ===========================
 
-#. N/A
+#. `@bouweandela`_ made loading :class:`~iris.cube.Cube`\s from small NetCDF
+   files faster. (:pull:`6229`)
 
 #. `@fnattino`_ enabled lazy cube interpolation using the linear and
    nearest-neighbour interpolators (:class:`iris.analysis.Linear` and

diff --git a/lib/iris/cube.py b/lib/iris/cube.py
@@ -924,7 +924,8 @@ def __eq__(self, other):
         # For equality, require both globals + locals to match exactly.
         # NOTE: array content works correctly, since 'locals' and 'globals' are always
         # iris.common.mixin.LimitedAttributeDict, which gets this right.
-        other = CubeAttrsDict(other)
+        if not isinstance(other, CubeAttrsDict):
+            other = CubeAttrsDict(other)
         result = self.locals == other.locals and self.globals == other.globals
         return result
 

diff --git a/lib/iris/fileformats/cf.py b/lib/iris/fileformats/cf.py
@@ -1336,9 +1336,11 @@ def __init__(self, file_source, warn=False, monotonic=False):
             self._trim_ugrid_variable_types()
             self._with_ugrid = False
 
-        self._translate()
-        self._build_cf_groups()
-        self._reset()
+        # Read the variables in the dataset only once to reduce runtime.
+        variables = self._dataset.variables
+        self._translate(variables)
+        self._build_cf_groups(variables)
+        self._reset(variables)
 
     def __enter__(self):
         # Enable use as a context manager
@@ -1380,16 +1382,16 @@ def filename(self):
     def __repr__(self):
         return "%s(%r)" % (self.__class__.__name__, self._filename)
 
-    def _translate(self):
+    def _translate(self, variables):
         """Classify the netCDF variables into CF-netCDF variables."""
-        netcdf_variable_names = list(self._dataset.variables.keys())
+        netcdf_variable_names = list(variables.keys())
 
         # Identify all CF coordinate variables first. This must be done
         # first as, by CF convention, the definition of a CF auxiliary
         # coordinate variable may include a scalar CF coordinate variable,
         # whereas we want these two types of variables to be mutually exclusive.
         coords = CFCoordinateVariable.identify(
-            self._dataset.variables, monotonic=self._check_monotonic
+            variables, monotonic=self._check_monotonic
         )
         self.cf_group.update(coords)
         coordinate_names = list(self.cf_group.coordinates.keys())
@@ -1402,9 +1404,7 @@ def _translate(self):
                 if issubclass(variable_type, CFGridMappingVariable)
                 else coordinate_names
             )
-            self.cf_group.update(
-                variable_type.identify(self._dataset.variables, ignore=ignore)
-            )
+            self.cf_group.update(variable_type.identify(variables, ignore=ignore))
 
         # Identify global netCDF attributes.
         attr_dict = {
@@ -1414,7 +1414,7 @@ def _translate(self):
         self.cf_group.global_attributes.update(attr_dict)
 
         # Identify and register all CF formula terms.
-        formula_terms = _CFFormulaTermsVariable.identify(self._dataset.variables)
+        formula_terms = _CFFormulaTermsVariable.identify(variables)
 
         for cf_var in formula_terms.values():
             for cf_root, cf_term in cf_var.cf_terms_by_root.items():
@@ -1433,9 +1433,9 @@ def _translate(self):
         )
 
         for name in data_variable_names:
-            self.cf_group[name] = CFDataVariable(name, self._dataset.variables[name])
+            self.cf_group[name] = CFDataVariable(name, variables[name])
 
-    def _build_cf_groups(self):
+    def _build_cf_groups(self, variables):
         """Build the first order relationships between CF-netCDF variables."""
 
         def _build(cf_variable):
@@ -1489,7 +1489,7 @@ def _span_check(
                     ignore += coordinate_names
 
                 match = variable_type.identify(
-                    self._dataset.variables,
+                    variables,
                     ignore=ignore,
                     target=cf_variable.cf_name,
                     warn=False,
@@ -1569,9 +1569,9 @@ def _span_check(
             promoted.add(cf_name)
             not_promoted = ignored.difference(promoted)
 
-    def _reset(self):
+    def _reset(self, variables):
         """Reset the attribute touch history of each variable."""
-        for nc_var_name in self._dataset.variables.keys():
+        for nc_var_name in variables.keys():
             self.cf_group[nc_var_name].cf_attrs_reset()
 
     def _close(self):