diff --git a/src/earthkit/meteo/score/__init__.py b/src/earthkit/meteo/score/__init__.py index d389998..a233d3b 100644 --- a/src/earthkit/meteo/score/__init__.py +++ b/src/earthkit/meteo/score/__init__.py @@ -15,5 +15,6 @@ planned to work with objects like *earthkit.data FieldLists* or *xarray DataSets*. """ +from .bootstrap import * # noqa from .correlation import * # noqa from .crps import * # noqa diff --git a/src/earthkit/meteo/score/array/__init__.py b/src/earthkit/meteo/score/array/__init__.py index e231eef..a1a1d60 100644 --- a/src/earthkit/meteo/score/array/__init__.py +++ b/src/earthkit/meteo/score/array/__init__.py @@ -11,5 +11,6 @@ Verification functions operating on numpy arrays. """ +from .bootstrap import * # noqa from .correlation import * # noqa from .crps import * # noqa diff --git a/src/earthkit/meteo/score/array/bootstrap.py b/src/earthkit/meteo/score/array/bootstrap.py new file mode 100644 index 0000000..b71d3d0 --- /dev/null +++ b/src/earthkit/meteo/score/array/bootstrap.py @@ -0,0 +1,137 @@ +import random + +from earthkit.utils.array import array_namespace + + +def iter_samples(x, *args, sample_axis=0, n_iter=100, n_samples=None, randrange=random.randrange): + """Iterate over resampled arrays for bootstrapping + + Parameters + ---------- + x, *args: array-like + Inputs to the wrapped function, sampled for bootstrapping. Must have + the same size along ``sample_axis`` + sample_axis: int or list of int + Sample along this axis (either same for all or one per argument) + n_iter: int + Number of bootstrapping iterations + n_samples: int or None + Number of samples for each iteration. If None, use the number of + inputs (size of ``x`` along the sampling axis) + randrange: function (int -> int) + Random generator for integers: `randrange(n)` should return an + integer in `range(n)` + + Yields + ------ + tuple + Resampled arrays (one for each iteration) + """ + args = (x,) + args + n_arrays = len(args) + if isinstance(sample_axis, int): + sample_axis = [sample_axis for _ in range(n_arrays)] + else: + assert len(sample_axis) == n_arrays, "sample_axis must have one element per input array" + xp = array_namespace(*args) + arrays = tuple((xp.asarray(arr), axis) for arr, axis in zip(args, sample_axis)) + n_inputs = x.shape[sample_axis[0]] + assert all( + y.shape[axis] == n_inputs for y, axis in arrays + ), "Input arrays must have the same size along the sampling axis" + if n_samples is None: + n_samples = n_inputs + for _ in range(n_iter): + indices = [randrange(n_inputs) for _ in range(n_samples)] + sampled = tuple(xp.take(y, indices=indices, axis=axis) for y, axis in arrays) + yield sampled + + +def resample(x, *args, sample_axis=0, out_axis=0, n_iter=100, n_samples=None, randrange=random.randrange): + """Resample arrays for bootstrapping + + Parameters + ---------- + x, *args: array-like + Inputs to the wrapped function, sampled for bootstrapping. Must have + the same size along ``sample_axis`` + sample_axis: int or list of int + Sample along this axis (either same for all or one per argument) + out_axis: int + Stack samples along this axis + n_iter: int + Number of bootstrapping iterations + n_samples: int or None + Number of samples for each iteration. If None, use the number of + inputs (size of ``x`` along the sampling axis) + randrange: function (int -> int) + Random generator for integers: `randrange(n)` should return an + integer in `range(n)` + + Returns + ------ + tuple + Resampled arrays (one for each iteration) + """ + xp = array_namespace(x, *args) + n_arrays = len(args) + 1 + samples = [[] for _ in range(n_arrays)] + samples_it = iter_samples( + x, *args, sample_axis=sample_axis, n_iter=n_iter, n_samples=n_samples, randrange=randrange + ) + for sample in samples_it: + for i in range(n_arrays): + samples[i].append(sample[i]) + return tuple(xp.stack(sampled_arr, axis=out_axis) for sampled_arr in samples) + + +def bootstrap( + func, + x, + *args, + sample_axis=0, + out_axis=0, + n_iter=100, + n_samples=None, + randrange=random.randrange, + **kwargs, +): + """Run bootstrapping + + Parameters + ---------- + func: function ((array, ..., **kwargs) -> array) + Function to bootstrap + x, *args: array-like + Inputs to ``function``, sampled for bootstrapping. Must have the same + size along ``sample_axis`` + sample_axis: int or list of int + Sample along this axis (either same for all or one per argument) + out_axis: int + Stack samples along this axis + n_iter: int + Number of bootstrapping iterations + n_samples: int or None + Number of samples for each iteration. If None, use the number of + inputs (size of ``x`` along the sampling axis) + randrange: function (int -> int) + Random generator for integers: `randrange(n)` should return an + integer in `range(n)` + **kwargs + Additional keyword arguments to ``func`` + + Returns + ------- + array-like + Aggregated results of the bootstrapping process + """ + xp = array_namespace(x, *args) + samples = iter_samples( + x, + *args, + sample_axis=sample_axis, + n_iter=n_iter, + n_samples=n_samples, + randrange=randrange, + ) + return xp.stack([func(*sampled, **kwargs) for sampled in samples], axis=out_axis) diff --git a/src/earthkit/meteo/score/bootstrap.py b/src/earthkit/meteo/score/bootstrap.py new file mode 100644 index 0000000..cd64c89 --- /dev/null +++ b/src/earthkit/meteo/score/bootstrap.py @@ -0,0 +1,179 @@ +import functools +import random +from dataclasses import dataclass + +from earthkit.utils.array import array_namespace + +try: + import xarray as xr +except ModuleNotFoundError: + xr = None + +from . import array + + +def resample(x, *args, **kwargs): + if xr is not None and isinstance(x, xr.DataArray): + n_arrays = len(args) + 1 + dim = kwargs.get("dim", None) + if dim is None: + raise TypeError("resample with xarray arguments requires 'dim'") + in_dims = [[dim] for _ in range(n_arrays)] + sample_dim = kwargs.get("sample_dim", "sample") + out_dims = [[sample_dim] for _ in range(n_arrays)] + return xr.apply_ufunc( + functools.partial(array.resample, sample_axis=-1, out_axis=-1, **kwargs), + x, + *args, + input_core_dims=in_dims, + output_core_dims=out_dims, + ) + + return array.resample(x, *args, **kwargs) + + +def _bootstrap_xarray( + func, + *args, + dim=None, + sample_dim="sample", + n_iter=100, + n_samples=None, + randrange=random.randrange, + **kwargs, +): + assert xr is not None + if dim is None: + raise TypeError("bootstrap with xarray arguments requires 'dim'") + n_inputs = args[0].sizes[dim] + assert all( + arr.sizes[dim] == n_inputs for arr in args + ), "Input arrays must have the same size along the sampling axis" + if n_samples is None: + n_samples = n_inputs + results = [] + for _ in range(n_iter): + indices = [randrange(n_inputs) for _ in range(n_samples)] + sampled = tuple(arr.isel({dim: indices}) for arr in args) + results.append(func(*sampled, **kwargs)) + return xr.concat(results, sample_dim) + + +def bootstrap(func, x, *args, **kwargs): + if xr is not None and isinstance(x, xr.DataArray): + return _bootstrap_xarray(func, x, *args, **kwargs) + return array.bootstrap(func, x, *args, **kwargs) + + +@dataclass +class BootstrapResult: + """Container for the result of a bootstrapping process""" + + #: number of bootstrapping iterations + n_iter: int + + #: number of samples for each iteration + n_samples: int + + #: return values for each iteration, first dimension is the iteration + results: object + + def quantiles(self, threshold: float): + """Compute quantiles associated with the given threshold + + The first dimension of the returned array has length 2 corresponding to + the low (``threshold``) and high (``1 - threshold``) quantiles + """ + return self._xp.quantile(self.results, [threshold, 1.0 - threshold], axis=0) + + def sig_mask(self, threshold: float, sig_value: float): + """Compute the significance mask for the given threshold and significant value""" + low, high = self.quantiles(threshold) + return self._xp.logical_or(high < sig_value, low > sig_value) + + @property + def _xp(self): + return array_namespace(self.results) + + +class Bootstrappable: + def __init__(self, func): + self.func = func + + def __call__(self, *args, **kwargs): + return self.func(*args, **kwargs) + + def bootstrap( + self, + x, + *args, + sample_axis=0, + n_iter=100, + n_samples=None, + randrange=random.randrange, + **kwargs, + ): + """Run bootstrapping + + Parameters + ---------- + x, *args: array-like + Inputs to the wrapped function, sampled for bootstrapping. Must have + the same size along ``sample_axis`` + sample_axis: int or list of int + Sample along this axis (either same for all or one per argument) + n_iter: int + Number of bootstrapping iterations + n_samples: int or None + Number of samples for each iteration. If None, use the number of + inputs (size of ``x`` along the sampling axis) + randrange: function (int -> int) + Random generator for integers: `randrange(n)` should return an + integer in `range(n)` + **kwargs + Additional keyword arguments to the wrapped function + + Returns + ------- + BootstrapResult + Aggregated results of the bootstrapping process + """ + results = bootstrap( + self.func, + x, + *args, + sample_axis=sample_axis, + n_iter=n_iter, + n_samples=n_samples, + randrange=randrange, + ) + return BootstrapResult(n_iter, n_samples, results) + + +def enable_bootstrap(func): + """Enable bootstrapping on a binary scoring function + + The function will be wrapped in a callable object (see + :class:`Bootstrappable`). Calling the object will call the function + directly. The object will have a :meth:`Bootstrappable.bootstrap` method to + run bootstrapping. + + Examples + -------- + + :: + @enable_bootstrap + def mse(x, y, axis=-1): + return np.mean(np.square(y - x), axis=axis) + + x = ... + y = ... + mse(x, y) # normal call, return MSE + bresult = mse.bootstrap(x, y) # bootstrapping + bresult.quantiles(0.2) + bresult.sig_mask(0.05, 0.0) + """ + + wrapper = Bootstrappable(func) + functools.update_wrapper(wrapper, func) + return wrapper