Skip to content

Commit

Permalink
Improve speed of loading small NetCDF files (#6229)
Browse files Browse the repository at this point in the history
* Read list of variables only once

* Add whatsnew

* Improve whatsnew

* Add benchmark for files with many cubes
  • Loading branch information
bouweandela authored Feb 6, 2025
1 parent b178f6f commit a65aaea
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 17 deletions.
52 changes: 52 additions & 0 deletions benchmarks/benchmarks/load/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,58 @@ def time_many_var_load(self) -> None:
_ = load(str(self.FILE_PATH))


class ManyCubes:
    """Benchmark loading a single NetCDF file that holds many cubes."""

    FILE_PATH = BENCHMARK_DATA / "many_cube_file.nc"

    @staticmethod
    def _create_file(save_path: str) -> None:
        """Write the benchmark file to ``save_path``.

        This function is executed externally, so everything it needs
        (including its imports) must live inside the function body.
        """
        import numpy as np

        from iris import save
        from iris.coords import AuxCoord, DimCoord
        from iris.cube import Cube, CubeList

        n_points = 81920
        n_bounds = 3
        n_cubes = 100

        points = np.arange(n_points).astype(np.float32)
        bounds = np.arange(n_points * n_bounds).astype(np.float32)
        bounds = bounds.reshape(n_points, n_bounds)

        # One template cube: a length-1 time dimension plus two bounded
        # auxiliary coordinates sharing the same points and bounds arrays.
        template = Cube(points.reshape(1, -1), units="unknown")
        template.add_dim_coord(DimCoord(np.array([0]), standard_name="time"), 0)
        for coord_name in ("latitude", "longitude"):
            aux_coord = AuxCoord(
                points,
                bounds=bounds,
                standard_name=coord_name,
                units="degrees",
            )
            template.add_aux_coord(aux_coord, 1)

        # Every saved cube is an independent copy that differs only by name.
        cubes = CubeList()
        for index in range(n_cubes):
            named_cube = template.copy()
            named_cube.long_name = f"var_{index}"
            cubes.append(named_cube)
        save(cubes, save_path)

    def setup_cache(self) -> None:
        """Generate the benchmark file unless an existing one may be reused."""
        if REUSE_DATA and self.FILE_PATH.is_file():
            return
        # See :mod:`benchmarks.generate_data` docstring for full explanation.
        run_function_elsewhere(self._create_file, str(self.FILE_PATH))

    def time_many_cube_load(self) -> None:
        """Time loading every cube from the benchmark file."""
        load(str(self.FILE_PATH))


class StructuredFF:
"""Test structured loading of a large-ish fieldsfile.
Expand Down
3 changes: 2 additions & 1 deletion docs/src/whatsnew/latest.rst
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ This document explains the changes made to Iris for this release
🚀 Performance Enhancements
===========================

#. N/A
#. `@bouweandela`_ made loading :class:`~iris.cube.Cube`\ s from small NetCDF
files faster. (:pull:`6229`)

#. `@fnattino`_ enabled lazy cube interpolation using the linear and
nearest-neighbour interpolators (:class:`iris.analysis.Linear` and
Expand Down
3 changes: 2 additions & 1 deletion lib/iris/cube.py
Original file line number Diff line number Diff line change
Expand Up @@ -924,7 +924,8 @@ def __eq__(self, other):
# For equality, require both globals + locals to match exactly.
# NOTE: array content works correctly, since 'locals' and 'globals' are always
# iris.common.mixin.LimitedAttributeDict, which gets this right.
other = CubeAttrsDict(other)
if not isinstance(other, CubeAttrsDict):
other = CubeAttrsDict(other)
result = self.locals == other.locals and self.globals == other.globals
return result

Expand Down
30 changes: 15 additions & 15 deletions lib/iris/fileformats/cf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1336,9 +1336,11 @@ def __init__(self, file_source, warn=False, monotonic=False):
self._trim_ugrid_variable_types()
self._with_ugrid = False

self._translate()
self._build_cf_groups()
self._reset()
# Read the variables in the dataset only once to reduce runtime.
variables = self._dataset.variables
self._translate(variables)
self._build_cf_groups(variables)
self._reset(variables)

def __enter__(self):
# Enable use as a context manager
Expand Down Expand Up @@ -1380,16 +1382,16 @@ def filename(self):
def __repr__(self):
return "%s(%r)" % (self.__class__.__name__, self._filename)

def _translate(self):
def _translate(self, variables):
"""Classify the netCDF variables into CF-netCDF variables."""
netcdf_variable_names = list(self._dataset.variables.keys())
netcdf_variable_names = list(variables.keys())

# Identify all CF coordinate variables first. This must be done
# first as, by CF convention, the definition of a CF auxiliary
# coordinate variable may include a scalar CF coordinate variable,
# whereas we want these two types of variables to be mutually exclusive.
coords = CFCoordinateVariable.identify(
self._dataset.variables, monotonic=self._check_monotonic
variables, monotonic=self._check_monotonic
)
self.cf_group.update(coords)
coordinate_names = list(self.cf_group.coordinates.keys())
Expand All @@ -1402,9 +1404,7 @@ def _translate(self):
if issubclass(variable_type, CFGridMappingVariable)
else coordinate_names
)
self.cf_group.update(
variable_type.identify(self._dataset.variables, ignore=ignore)
)
self.cf_group.update(variable_type.identify(variables, ignore=ignore))

# Identify global netCDF attributes.
attr_dict = {
Expand All @@ -1414,7 +1414,7 @@ def _translate(self):
self.cf_group.global_attributes.update(attr_dict)

# Identify and register all CF formula terms.
formula_terms = _CFFormulaTermsVariable.identify(self._dataset.variables)
formula_terms = _CFFormulaTermsVariable.identify(variables)

for cf_var in formula_terms.values():
for cf_root, cf_term in cf_var.cf_terms_by_root.items():
Expand All @@ -1433,9 +1433,9 @@ def _translate(self):
)

for name in data_variable_names:
self.cf_group[name] = CFDataVariable(name, self._dataset.variables[name])
self.cf_group[name] = CFDataVariable(name, variables[name])

def _build_cf_groups(self):
def _build_cf_groups(self, variables):
"""Build the first order relationships between CF-netCDF variables."""

def _build(cf_variable):
Expand Down Expand Up @@ -1489,7 +1489,7 @@ def _span_check(
ignore += coordinate_names

match = variable_type.identify(
self._dataset.variables,
variables,
ignore=ignore,
target=cf_variable.cf_name,
warn=False,
Expand Down Expand Up @@ -1569,9 +1569,9 @@ def _span_check(
promoted.add(cf_name)
not_promoted = ignored.difference(promoted)

def _reset(self):
def _reset(self, variables):
"""Reset the attribute touch history of each variable."""
for nc_var_name in self._dataset.variables.keys():
for nc_var_name in variables.keys():
self.cf_group[nc_var_name].cf_attrs_reset()

def _close(self):
Expand Down

0 comments on commit a65aaea

Please sign in to comment.