From 699e3de04d12a6a0884aa6cd2be7caa47bb43304 Mon Sep 17 00:00:00 2001 From: Olivier Iffrig Date: Thu, 5 Jun 2025 15:11:26 +0000 Subject: [PATCH 1/3] Add bootstrapping helper --- src/earthkit/meteo/score/bootstrap.py | 128 ++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 src/earthkit/meteo/score/bootstrap.py diff --git a/src/earthkit/meteo/score/bootstrap.py b/src/earthkit/meteo/score/bootstrap.py new file mode 100644 index 0000000..dbccc27 --- /dev/null +++ b/src/earthkit/meteo/score/bootstrap.py @@ -0,0 +1,128 @@ +import functools +import random +from dataclasses import dataclass + +from earthkit.utils.array import array_namespace + + +@dataclass +class BootstrapResult: + """Container for the result of a bootstrapping process""" + + #: number of bootstrapping iterations + n_iter: int + + #: number of samples for each iteration + n_samples: int + + #: return values for each iteration, first dimension is the iteration + results: object + + def quantiles(self, threshold: float): + """Compute quantiles associated with the given threshold + + The first dimension of the returned array has length 2 corresponding to + the low (``threshold``) and high (``1 - threshold``) quantiles + """ + return self._xp.quantile(self.results, [threshold, 1.0 - threshold], axis=0) + + def sig_mask(self, threshold: float, sig_value: float): + """Compute the significance mask for the given threshold and significant value""" + low, high = self.quantiles(threshold) + return self._xp.logical_or(high < sig_value, low > sig_value) + + @property + def _xp(self): + return array_namespace(self.results) + + +class Bootstrappable: + def __init__(self, func): + self.func = func + + def __call__(self, *args, **kwargs): + return self.func(*args, **kwargs) + + def bootstrap( + self, + x, + y, + *args, + sample_axis=0, + n_iter=100, + n_samples=None, + randrange=random.randrange, + **kwargs, + ): + """Run bootstrapping + + Parameters + ---------- + x, y: array-like + Inputs to the wrapped function, sampled for bootstrapping. Must have + the same size along ``sample_axis`` + *args + Additional positional arguments to the wrapped function + sample_axis: int + Sample along this axis + n_iter: int + Number of bootstrapping iterations + n_samples: int or None + Number of samples for each iteration. If None, use the number of + inputs (size of ``x`` along the sampling axis) + randrange: function (int -> int) + Random generator for integers: `randrange(n)` should return an + integer in `range(n)` + **kwargs + Additional keyword arguments to the wrapped function + + Returns + ------- + BootstrapResult + Aggregated results of the bootstrapping process + """ + xp = array_namespace(x, y) + x = xp.asarray(x) + y = xp.asarray(y) + n_inputs = x.shape[sample_axis] + assert ( + y.shape[sample_axis] == n_inputs + ), "Input arrays must have the same size along the first dimension" + if n_samples is None: + n_samples = n_inputs + results = [] + for _ in range(n_iter): + indices = [randrange(n_inputs) for _ in range(n_samples)] + x_sample = xp.take(x, indices=indices, axis=sample_axis) + y_sample = xp.take(y, indices=indices, axis=sample_axis) + results.append(self.func(x_sample, y_sample, *args, **kwargs)) + return BootstrapResult(n_iter, n_samples, xp.stack(results, axis=0)) + + +def enable_bootstrap(func): + """Enable bootstrapping on a binary scoring function + + The function will be wrapped in a callable object (see + :class:`Bootstrappable`). Calling the object will call the function + directly. The object will have a :meth:`Bootstrappable.bootstrap` method to + run bootstrapping. + + Examples + -------- + + :: + @enable_bootstrap + def difference(x, y): + return y - x + + x = ... + y = ... + difference(x, y) # normal call, return y - x + bresult = difference.bootstrap(x, y) # bootstrapping + bresult.quantiles(0.2) + bresult.sig_mask(0.05, 0.0) + """ + + wrapper = Bootstrappable(func) + functools.update_wrapper(wrapper, func) + return wrapper From 7252ba4469139de73388f0c7c6aa7acfd5bb9866 Mon Sep 17 00:00:00 2001 From: Olivier Iffrig Date: Wed, 18 Jun 2025 14:28:15 +0000 Subject: [PATCH 2/3] Rework bootstrapping helper --- src/earthkit/meteo/score/bootstrap.py | 86 +++++++++++++++++++-------- 1 file changed, 61 insertions(+), 25 deletions(-) diff --git a/src/earthkit/meteo/score/bootstrap.py b/src/earthkit/meteo/score/bootstrap.py index dbccc27..b5f8f6d 100644 --- a/src/earthkit/meteo/score/bootstrap.py +++ b/src/earthkit/meteo/score/bootstrap.py @@ -36,6 +36,50 @@ def _xp(self): return array_namespace(self.results) +def resample(x, *args, sample_axis=0, n_iter=100, n_samples=None, randrange=random.randrange): + """Resample arrays for bootstrapping + + Parameters + ---------- + x, *args: array-like + Inputs to the wrapped function, sampled for bootstrapping. Must have + the same size along ``sample_axis`` + sample_axis: int or list of int + Sample along this axis (either same for all or one per argument) + n_iter: int + Number of bootstrapping iterations + n_samples: int or None + Number of samples for each iteration. If None, use the number of + inputs (size of ``x`` along the sampling axis) + randrange: function (int -> int) + Random generator for integers: `randrange(n)` should return an + integer in `range(n)` + + Yields + ------ + tuple + Resampled arrays (one for each iteration) + """ + args = (x,) + args + n_arrays = len(args) + if isinstance(sample_axis, int): + sample_axis = [sample_axis for _ in range(n_arrays)] + else: + assert len(sample_axis) == n_arrays, "sample_axis must have one element per input array" + xp = array_namespace(*args) + arrays = tuple((xp.asarray(arr), axis) for arr, axis in zip(args, sample_axis)) + n_inputs = x.shape[sample_axis[0]] + assert all( + y.shape[axis] == n_inputs for y, axis in arrays + ), "Input arrays must have the same size along the sampling axis" + if n_samples is None: + n_samples = n_inputs + for _ in range(n_iter): + indices = [randrange(n_inputs) for _ in range(n_samples)] + sampled = tuple(xp.take(y, indices=indices, axis=axis) for y, axis in arrays) + yield sampled + + class Bootstrappable: def __init__(self, func): self.func = func @@ -46,7 +90,6 @@ def __call__(self, *args, **kwargs): def bootstrap( self, x, - y, *args, sample_axis=0, n_iter=100, @@ -58,13 +101,11 @@ def bootstrap( Parameters ---------- - x, y: array-like + x, *args: array-like Inputs to the wrapped function, sampled for bootstrapping. Must have the same size along ``sample_axis`` - *args - Additional positional arguments to the wrapped function - sample_axis: int - Sample along this axis + sample_axis: int or list of int + Sample along this axis (either same for all or one per argument) n_iter: int Number of bootstrapping iterations n_samples: int or None @@ -81,21 +122,16 @@ def bootstrap( BootstrapResult Aggregated results of the bootstrapping process """ - xp = array_namespace(x, y) - x = xp.asarray(x) - y = xp.asarray(y) - n_inputs = x.shape[sample_axis] - assert ( - y.shape[sample_axis] == n_inputs - ), "Input arrays must have the same size along the first dimension" - if n_samples is None: - n_samples = n_inputs - results = [] - for _ in range(n_iter): - indices = [randrange(n_inputs) for _ in range(n_samples)] - x_sample = xp.take(x, indices=indices, axis=sample_axis) - y_sample = xp.take(y, indices=indices, axis=sample_axis) - results.append(self.func(x_sample, y_sample, *args, **kwargs)) + xp = array_namespace(x, *args) + samples = resample( + x, + *args, + sample_axis=sample_axis, + n_iter=n_iter, + n_samples=n_samples, + randrange=randrange, + ) + results = [self.func(*sampled, **kwargs) for sampled in samples] return BootstrapResult(n_iter, n_samples, xp.stack(results, axis=0)) @@ -112,13 +148,13 @@ def enable_bootstrap(func): :: @enable_bootstrap - def difference(x, y): - return y - x + def mse(x, y, axis=-1): + return np.mean(np.square(y - x), axis=axis) x = ... y = ... - difference(x, y) # normal call, return y - x - bresult = difference.bootstrap(x, y) # bootstrapping + mse(x, y) # normal call, return MSE + bresult = mse.bootstrap(x, y) # bootstrapping bresult.quantiles(0.2) bresult.sig_mask(0.05, 0.0) """ From 780c0f3952ec315062c6e4ee6dce930f88418c10 Mon Sep 17 00:00:00 2001 From: Olivier Iffrig Date: Tue, 24 Jun 2025 09:34:28 +0000 Subject: [PATCH 3/3] Support xarrays in bootstrap --- src/earthkit/meteo/score/__init__.py | 1 + src/earthkit/meteo/score/array/__init__.py | 1 + src/earthkit/meteo/score/array/bootstrap.py | 137 ++++++++++++++++++++ src/earthkit/meteo/score/bootstrap.py | 111 +++++++++------- 4 files changed, 202 insertions(+), 48 deletions(-) create mode 100644 src/earthkit/meteo/score/array/bootstrap.py diff --git a/src/earthkit/meteo/score/__init__.py b/src/earthkit/meteo/score/__init__.py index d389998..a233d3b 100644 --- a/src/earthkit/meteo/score/__init__.py +++ b/src/earthkit/meteo/score/__init__.py @@ -15,5 +15,6 @@ planned to work with objects like *earthkit.data FieldLists* or *xarray DataSets*. """ +from .bootstrap import * # noqa from .correlation import * # noqa from .crps import * # noqa diff --git a/src/earthkit/meteo/score/array/__init__.py b/src/earthkit/meteo/score/array/__init__.py index e231eef..a1a1d60 100644 --- a/src/earthkit/meteo/score/array/__init__.py +++ b/src/earthkit/meteo/score/array/__init__.py @@ -11,5 +11,6 @@ Verification functions operating on numpy arrays. """ +from .bootstrap import * # noqa from .correlation import * # noqa from .crps import * # noqa diff --git a/src/earthkit/meteo/score/array/bootstrap.py b/src/earthkit/meteo/score/array/bootstrap.py new file mode 100644 index 0000000..b71d3d0 --- /dev/null +++ b/src/earthkit/meteo/score/array/bootstrap.py @@ -0,0 +1,137 @@ +import random + +from earthkit.utils.array import array_namespace + + +def iter_samples(x, *args, sample_axis=0, n_iter=100, n_samples=None, randrange=random.randrange): + """Iterate over resampled arrays for bootstrapping + + Parameters + ---------- + x, *args: array-like + Inputs to the wrapped function, sampled for bootstrapping. Must have + the same size along ``sample_axis`` + sample_axis: int or list of int + Sample along this axis (either same for all or one per argument) + n_iter: int + Number of bootstrapping iterations + n_samples: int or None + Number of samples for each iteration. If None, use the number of + inputs (size of ``x`` along the sampling axis) + randrange: function (int -> int) + Random generator for integers: `randrange(n)` should return an + integer in `range(n)` + + Yields + ------ + tuple + Resampled arrays (one for each iteration) + """ + args = (x,) + args + n_arrays = len(args) + if isinstance(sample_axis, int): + sample_axis = [sample_axis for _ in range(n_arrays)] + else: + assert len(sample_axis) == n_arrays, "sample_axis must have one element per input array" + xp = array_namespace(*args) + arrays = tuple((xp.asarray(arr), axis) for arr, axis in zip(args, sample_axis)) + n_inputs = x.shape[sample_axis[0]] + assert all( + y.shape[axis] == n_inputs for y, axis in arrays + ), "Input arrays must have the same size along the sampling axis" + if n_samples is None: + n_samples = n_inputs + for _ in range(n_iter): + indices = [randrange(n_inputs) for _ in range(n_samples)] + sampled = tuple(xp.take(y, indices=indices, axis=axis) for y, axis in arrays) + yield sampled + + +def resample(x, *args, sample_axis=0, out_axis=0, n_iter=100, n_samples=None, randrange=random.randrange): + """Resample arrays for bootstrapping + + Parameters + ---------- + x, *args: array-like + Inputs to the wrapped function, sampled for bootstrapping. Must have + the same size along ``sample_axis`` + sample_axis: int or list of int + Sample along this axis (either same for all or one per argument) + out_axis: int + Stack samples along this axis + n_iter: int + Number of bootstrapping iterations + n_samples: int or None + Number of samples for each iteration. If None, use the number of + inputs (size of ``x`` along the sampling axis) + randrange: function (int -> int) + Random generator for integers: `randrange(n)` should return an + integer in `range(n)` + + Returns + ------ + tuple + Resampled arrays (one for each iteration) + """ + xp = array_namespace(x, *args) + n_arrays = len(args) + 1 + samples = [[] for _ in range(n_arrays)] + samples_it = iter_samples( + x, *args, sample_axis=sample_axis, n_iter=n_iter, n_samples=n_samples, randrange=randrange + ) + for sample in samples_it: + for i in range(n_arrays): + samples[i].append(sample[i]) + return tuple(xp.stack(sampled_arr, axis=out_axis) for sampled_arr in samples) + + +def bootstrap( + func, + x, + *args, + sample_axis=0, + out_axis=0, + n_iter=100, + n_samples=None, + randrange=random.randrange, + **kwargs, +): + """Run bootstrapping + + Parameters + ---------- + func: function ((array, ..., **kwargs) -> array) + Function to bootstrap + x, *args: array-like + Inputs to ``function``, sampled for bootstrapping. Must have the same + size along ``sample_axis`` + sample_axis: int or list of int + Sample along this axis (either same for all or one per argument) + out_axis: int + Stack samples along this axis + n_iter: int + Number of bootstrapping iterations + n_samples: int or None + Number of samples for each iteration. If None, use the number of + inputs (size of ``x`` along the sampling axis) + randrange: function (int -> int) + Random generator for integers: `randrange(n)` should return an + integer in `range(n)` + **kwargs + Additional keyword arguments to ``func`` + + Returns + ------- + array-like + Aggregated results of the bootstrapping process + """ + xp = array_namespace(x, *args) + samples = iter_samples( + x, + *args, + sample_axis=sample_axis, + n_iter=n_iter, + n_samples=n_samples, + randrange=randrange, + ) + return xp.stack([func(*sampled, **kwargs) for sampled in samples], axis=out_axis) diff --git a/src/earthkit/meteo/score/bootstrap.py b/src/earthkit/meteo/score/bootstrap.py index b5f8f6d..cd64c89 100644 --- a/src/earthkit/meteo/score/bootstrap.py +++ b/src/earthkit/meteo/score/bootstrap.py @@ -4,6 +4,66 @@ from earthkit.utils.array import array_namespace +try: + import xarray as xr +except ModuleNotFoundError: + xr = None + +from . import array + + +def resample(x, *args, **kwargs): + if xr is not None and isinstance(x, xr.DataArray): + n_arrays = len(args) + 1 + dim = kwargs.get("dim", None) + if dim is None: + raise TypeError("resample with xarray arguments requires 'dim'") + in_dims = [[dim] for _ in range(n_arrays)] + sample_dim = kwargs.get("sample_dim", "sample") + out_dims = [[sample_dim] for _ in range(n_arrays)] + return xr.apply_ufunc( + functools.partial(array.resample, sample_axis=-1, out_axis=-1, **kwargs), + x, + *args, + input_core_dims=in_dims, + output_core_dims=out_dims, + ) + + return array.resample(x, *args, **kwargs) + + +def _bootstrap_xarray( + func, + *args, + dim=None, + sample_dim="sample", + n_iter=100, + n_samples=None, + randrange=random.randrange, + **kwargs, +): + assert xr is not None + if dim is None: + raise TypeError("bootstrap with xarray arguments requires 'dim'") + n_inputs = args[0].sizes[dim] + assert all( + arr.sizes[dim] == n_inputs for arr in args + ), "Input arrays must have the same size along the sampling axis" + if n_samples is None: + n_samples = n_inputs + results = [] + for _ in range(n_iter): + indices = [randrange(n_inputs) for _ in range(n_samples)] + sampled = tuple(arr.isel({dim: indices}) for arr in args) + results.append(func(*sampled, **kwargs)) + return xr.concat(results, sample_dim) + + +def bootstrap(func, x, *args, **kwargs): + if xr is not None and isinstance(x, xr.DataArray): + return _bootstrap_xarray(func, x, *args, **kwargs) + return array.bootstrap(func, x, *args, **kwargs) + @dataclass class BootstrapResult: @@ -36,50 +96,6 @@ def _xp(self): return array_namespace(self.results) -def resample(x, *args, sample_axis=0, n_iter=100, n_samples=None, randrange=random.randrange): - """Resample arrays for bootstrapping - - Parameters - ---------- - x, *args: array-like - Inputs to the wrapped function, sampled for bootstrapping. Must have - the same size along ``sample_axis`` - sample_axis: int or list of int - Sample along this axis (either same for all or one per argument) - n_iter: int - Number of bootstrapping iterations - n_samples: int or None - Number of samples for each iteration. If None, use the number of - inputs (size of ``x`` along the sampling axis) - randrange: function (int -> int) - Random generator for integers: `randrange(n)` should return an - integer in `range(n)` - - Yields - ------ - tuple - Resampled arrays (one for each iteration) - """ - args = (x,) + args - n_arrays = len(args) - if isinstance(sample_axis, int): - sample_axis = [sample_axis for _ in range(n_arrays)] - else: - assert len(sample_axis) == n_arrays, "sample_axis must have one element per input array" - xp = array_namespace(*args) - arrays = tuple((xp.asarray(arr), axis) for arr, axis in zip(args, sample_axis)) - n_inputs = x.shape[sample_axis[0]] - assert all( - y.shape[axis] == n_inputs for y, axis in arrays - ), "Input arrays must have the same size along the sampling axis" - if n_samples is None: - n_samples = n_inputs - for _ in range(n_iter): - indices = [randrange(n_inputs) for _ in range(n_samples)] - sampled = tuple(xp.take(y, indices=indices, axis=axis) for y, axis in arrays) - yield sampled - - class Bootstrappable: def __init__(self, func): self.func = func @@ -122,8 +138,8 @@ def bootstrap( BootstrapResult Aggregated results of the bootstrapping process """ - xp = array_namespace(x, *args) - samples = resample( + results = bootstrap( + self.func, x, *args, sample_axis=sample_axis, @@ -131,8 +147,7 @@ def bootstrap( n_samples=n_samples, randrange=randrange, ) - results = [self.func(*sampled, **kwargs) for sampled in samples] - return BootstrapResult(n_iter, n_samples, xp.stack(results, axis=0)) + return BootstrapResult(n_iter, n_samples, results) def enable_bootstrap(func):