diff --git a/model/common/src/icon4py/model/common/decomposition/definitions.py b/model/common/src/icon4py/model/common/decomposition/definitions.py
index c23f1ad111..e02cb5a1a7 100644
--- a/model/common/src/icon4py/model/common/decomposition/definitions.py
+++ b/model/common/src/icon4py/model/common/decomposition/definitions.py
@@ -13,6 +13,7 @@
 import logging
 from collections.abc import Sequence
 from enum import Enum
+from types import ModuleType
 from typing import Any, Literal, Protocol, overload, runtime_checkable
 
 import dace  # type: ignore[import-untyped]
@@ -21,6 +22,7 @@
 
 from icon4py.model.common import utils
 from icon4py.model.common.orchestration.halo_exchange import DummyNestedSDFG
+from icon4py.model.common.states import utils as state_utils
 from icon4py.model.common.utils import data_allocation as data_alloc
 
 
@@ -323,6 +325,38 @@ class SingleNodeRun(RunType):
     pass
 
 
+class Reductions(Protocol):
+    def min(
+        self, buffer: data_alloc.NDArray, array_ns: ModuleType = np
+    ) -> state_utils.ScalarType: ...
+
+    def max(
+        self, buffer: data_alloc.NDArray, array_ns: ModuleType = np
+    ) -> state_utils.ScalarType: ...
+
+    def sum(
+        self, buffer: data_alloc.NDArray, array_ns: ModuleType = np
+    ) -> state_utils.ScalarType: ...
+
+    def mean(
+        self, buffer: data_alloc.NDArray, array_ns: ModuleType = np
+    ) -> state_utils.ScalarType: ...
+
+
+class SingleNodeReductions(Reductions):
+    def min(self, buffer: data_alloc.NDArray, array_ns: ModuleType = np) -> state_utils.ScalarType:
+        return array_ns.min(buffer).item()
+
+    def max(self, buffer: data_alloc.NDArray, array_ns: ModuleType = np) -> state_utils.ScalarType:
+        return array_ns.max(buffer).item()
+
+    def sum(self, buffer: data_alloc.NDArray, array_ns: ModuleType = np) -> state_utils.ScalarType:
+        return array_ns.sum(buffer).item()
+
+    def mean(self, buffer: data_alloc.NDArray, array_ns: ModuleType = np) -> state_utils.ScalarType:
+        return array_ns.sum(buffer).item() / buffer.size
+
+
 @overload
 def get_runtype(with_mpi: Literal[True]) -> MultiNodeRun: ...
 
@@ -365,4 +399,20 @@ def create_single_node_exchange(
     return SingleNodeExchange()
 
 
+@functools.singledispatch
+def create_reduction(props: ProcessProperties) -> Reductions:
+    """
+    Create a Global Reduction depending on the runtime size.
+
+    Depending on the number of processor a SingleNode version is returned or a GHEX context created and a Multinode returned.
+    """
+    raise NotImplementedError(f"Unknown ProcessorProperties type ({type(props)})")
+
+
+@create_reduction.register(SingleNodeProcessProperties)
+def create_single_reduction_exchange(props: SingleNodeProcessProperties) -> Reductions:
+    return SingleNodeReductions()
+
+
 single_node_default = SingleNodeExchange()
+single_node_reductions = SingleNodeReductions()
diff --git a/model/common/src/icon4py/model/common/decomposition/mpi_decomposition.py b/model/common/src/icon4py/model/common/decomposition/mpi_decomposition.py
index cd18d6259d..aed61639c2 100644
--- a/model/common/src/icon4py/model/common/decomposition/mpi_decomposition.py
+++ b/model/common/src/icon4py/model/common/decomposition/mpi_decomposition.py
@@ -8,10 +8,12 @@
 
 from __future__ import annotations
 
+import dataclasses
 import functools
 import logging
-from collections.abc import Sequence
+from collections.abc import Callable, Sequence
 from dataclasses import dataclass
+from types import ModuleType
 from typing import TYPE_CHECKING, Any, ClassVar, Final, Union
 
 import dace  # type: ignore[import-untyped]
@@ -20,8 +22,9 @@
 
 from icon4py.model.common import dimension as dims
 from icon4py.model.common.decomposition import definitions
-from icon4py.model.common.decomposition.definitions import SingleNodeExchange
+from icon4py.model.common.decomposition.definitions import Reductions, SingleNodeExchange
 from icon4py.model.common.orchestration import halo_exchange
+from icon4py.model.common.states import utils as state_utils
 from icon4py.model.common.utils import data_allocation as data_alloc
 
 
@@ -423,3 +426,81 @@ def create_multinode_node_exchange(
         return GHexMultiNodeExchange(props, decomp_info)
     else:
         return SingleNodeExchange()
+
+
+@dataclasses.dataclass
+class GlobalReductions(Reductions):
+    props: definitions.ProcessProperties
+
+    def _reduce(
+        self,
+        buffer: data_alloc.NDArray,
+        local_reduction: Callable[[data_alloc.NDArray], data_alloc.ScalarT],
+        global_reduction: mpi4py.MPI.Op,
+        array_ns: ModuleType = np,
+    ) -> state_utils.ScalarType:
+        local_red_val = local_reduction(buffer)
+        recv_buffer = array_ns.empty(1, dtype=buffer.dtype)
+        if hasattr(
+            array_ns, "cuda"
+        ):  # https://mpi4py.readthedocs.io/en/stable/tutorial.html#gpu-aware-mpi-python-gpu-arrays
+            array_ns.cuda.runtime.deviceSynchronize()
+        self.props.comm.Allreduce(local_red_val, recv_buffer, global_reduction)
+        return recv_buffer.item()
+
+    def _calc_buffer_size(
+        self,
+        buffer: data_alloc.NDArray,
+        array_ns: ModuleType = np,
+    ) -> state_utils.ScalarType:
+        return self._reduce(array_ns.asarray([buffer.size]), array_ns.sum, mpi4py.MPI.SUM, array_ns)
+
+    def min(self, buffer: data_alloc.NDArray, array_ns: ModuleType = np) -> state_utils.ScalarType:
+        if self._calc_buffer_size(buffer, array_ns) == 0:
+            raise ValueError("global_min requires a non-empty buffer")
+        return self._reduce(
+            buffer if buffer.size != 0 else array_ns.asarray([array_ns.inf]),
+            array_ns.min,
+            mpi4py.MPI.MIN,
+            array_ns,
+        )
+
+    def max(self, buffer: data_alloc.NDArray, array_ns: ModuleType = np) -> state_utils.ScalarType:
+        if self._calc_buffer_size(buffer, array_ns) == 0:
+            raise ValueError("global_max requires a non-empty buffer")
+        return self._reduce(
+            buffer if buffer.size != 0 else array_ns.asarray([-array_ns.inf]),
+            array_ns.max,
+            mpi4py.MPI.MAX,
+            array_ns,
+        )
+
+    def sum(self, buffer: data_alloc.NDArray, array_ns: ModuleType = np) -> state_utils.ScalarType:
+        if self._calc_buffer_size(buffer, array_ns) == 0:
+            raise ValueError("global_sum requires a non-empty buffer")
+        return self._reduce(
+            buffer if buffer.size != 0 else array_ns.asarray([0]),
+            array_ns.sum,
+            mpi4py.MPI.SUM,
+            array_ns,
+        )
+
+    def mean(self, buffer: data_alloc.NDArray, array_ns: ModuleType = np) -> state_utils.ScalarType:
+        global_buffer_size = self._calc_buffer_size(buffer, array_ns)
+        if global_buffer_size == 0:
+            raise ValueError("global_mean requires a non-empty buffer")
+
+        return (
+            self._reduce(
+                buffer if buffer.size != 0 else array_ns.asarray([0]),
+                array_ns.sum,
+                mpi4py.MPI.SUM,
+                array_ns,
+            )
+            / global_buffer_size
+        )
+
+
+@definitions.create_reduction.register(MPICommProcessProperties)
+def create_global_reduction(props: MPICommProcessProperties) -> Reductions:
+    return GlobalReductions(props)
diff --git a/model/common/src/icon4py/model/common/grid/geometry.py b/model/common/src/icon4py/model/common/grid/geometry.py
index 76ac473e9e..bf5e78eda0 100644
--- a/model/common/src/icon4py/model/common/grid/geometry.py
+++ b/model/common/src/icon4py/model/common/grid/geometry.py
@@ -189,7 +189,7 @@ def _inverse_field_provider(self, field_name: str) -> factory.FieldProvider:
                     self._edge_domain(h_grid.Zone.LOCAL),
                 )
             },
-            do_exchange=False,
+            do_exchange=True,
         )
         return provider
 
diff --git a/model/common/src/icon4py/model/common/grid/grid_manager.py b/model/common/src/icon4py/model/common/grid/grid_manager.py
index 9779614499..d5f4a82cf7 100644
--- a/model/common/src/icon4py/model/common/grid/grid_manager.py
+++ b/model/common/src/icon4py/model/common/grid/grid_manager.py
@@ -8,6 +8,7 @@
 import functools
 import logging
 import pathlib
+from collections.abc import Callable
 from types import ModuleType
 from typing import Literal, Protocol, TypeAlias
 
@@ -87,6 +88,7 @@ def __init__(
         transformation: IndexTransformation,
         grid_file: pathlib.Path | str,
         config: v_grid.VerticalGridConfig,  # TODO(halungge): remove to separate vertical and horizontal grid
+        global_reductions: decomposition.Reductions = decomposition.single_node_reductions,
     ):
         self._transformation = transformation
         self._file_name = str(grid_file)
@@ -96,6 +98,7 @@ def __init__(
         self._geometry: GeometryDict = {}
         self._reader = None
         self._coordinates: CoordinateDict = {}
+        self._global_reductions = global_reductions
 
     def open(self):
         """Open the gridfile resource for reading."""
@@ -130,7 +133,10 @@ def __call__(self, allocator: gtx_typing.FieldBufferAllocationUtil, keep_skip_va
 
         self._geometry = self._read_geometry_fields(allocator)
         self._grid = self._construct_grid(
-            allocator=allocator, with_skip_values=keep_skip_values, geometry_type=geometry_type
+            allocator=allocator,
+            with_skip_values=keep_skip_values,
+            geometry_type=geometry_type,
+            mean_reduction=self._global_reductions.mean,
         )
         self._coordinates = self._read_coordinates(allocator, geometry_type)
         self.close()
@@ -353,6 +359,9 @@ def _construct_grid(
         allocator: gtx_typing.FieldBufferAllocationUtil | None,
         with_skip_values: bool,
         geometry_type: base.GeometryType,
+        mean_reduction: Callable[
+            [data_alloc.NDArray, ModuleType], data_alloc.ScalarT
+        ] = decomposition.single_node_reductions.mean,
     ) -> icon.IconGrid:
         """Construct the grid topology from the icon grid file.
 
@@ -386,7 +395,6 @@ def _construct_grid(
         mean_dual_cell_area = self._reader.try_attribute(
             gridfile.MPIMPropertyName.MEAN_DUAL_CELL_AREA
         )
-
         edge_lengths = self.geometry_fields[gridfile.GeometryName.EDGE_LENGTH.value].ndarray
         dual_edge_lengths = self.geometry_fields[
             gridfile.GeometryName.DUAL_EDGE_LENGTH.value
@@ -412,6 +420,7 @@ def _construct_grid(
             dual_edge_lengths=dual_edge_lengths,
             cell_areas=cell_areas,
             dual_cell_areas=dual_cell_areas,
+            mean_reduction=mean_reduction,
         )
         grid_size = base.HorizontalGridSize(
             num_vertices=num_vertices, num_edges=num_edges, num_cells=num_cells
diff --git a/model/common/src/icon4py/model/common/grid/icon.py b/model/common/src/icon4py/model/common/grid/icon.py
index f8588c4260..8bd8c13448 100644
--- a/model/common/src/icon4py/model/common/grid/icon.py
+++ b/model/common/src/icon4py/model/common/grid/icon.py
@@ -16,6 +16,7 @@
 from gt4py.next import allocators as gtx_allocators
 
 from icon4py.model.common import constants, dimension as dims
+from icon4py.model.common.decomposition import definitions as decomposition
 from icon4py.model.common.grid import base, horizontal as h_grid
 from icon4py.model.common.utils import data_allocation as data_alloc
 
@@ -97,13 +98,16 @@ def from_fields(
         cell_areas: data_alloc.NDArray | None = None,
         mean_dual_cell_area: float | None = None,
         dual_cell_areas: data_alloc.NDArray | None = None,
+        mean_reduction: Callable[
+            [data_alloc.NDArray, ModuleType], data_alloc.ScalarT
+        ] = decomposition.single_node_reductions.mean,
         **kwargs,
     ) -> _T:
         def init_mean(value: float | None, data: data_alloc.NDArray | None) -> float | None:
             if value is not None:
                 return value
             if data is not None:
-                return array_ns.mean(data).item()
+                return mean_reduction(data, array_ns=array_ns)
             return None
 
         mean_edge_length = init_mean(mean_edge_length, edge_lengths)
diff --git a/model/common/src/icon4py/model/common/metrics/compute_coeff_gradekin.py b/model/common/src/icon4py/model/common/metrics/compute_coeff_gradekin.py
index 44a594ae3b..8c9535d535 100644
--- a/model/common/src/icon4py/model/common/metrics/compute_coeff_gradekin.py
+++ b/model/common/src/icon4py/model/common/metrics/compute_coeff_gradekin.py
@@ -5,6 +5,7 @@
 #
 # Please, refer to the LICENSE file in the root directory.
 # SPDX-License-Identifier: BSD-3-Clause
+from collections.abc import Callable
 from types import ModuleType
 
 import gt4py.next as gtx
@@ -17,6 +18,7 @@ def compute_coeff_gradekin(
     edge_cell_length: data_alloc.NDArray,
     inv_dual_edge_length: data_alloc.NDArray,
     horizontal_start: gtx.int32,
+    exchange: Callable[[data_alloc.NDArray], None],
     array_ns: ModuleType = np,
 ) -> data_alloc.NDArray:
     """
@@ -41,4 +43,5 @@ def compute_coeff_gradekin(
         * inv_dual_edge_length[horizontal_start:]
     )
     coeff_gradekin_full = array_ns.column_stack((coeff_gradekin_0, coeff_gradekin_1))
+    exchange(coeff_gradekin_full)
     return coeff_gradekin_full
diff --git a/model/common/src/icon4py/model/common/metrics/metric_fields.py b/model/common/src/icon4py/model/common/metrics/metric_fields.py
index af2a415b26..08fecd8681 100644
--- a/model/common/src/icon4py/model/common/metrics/metric_fields.py
+++ b/model/common/src/icon4py/model/common/metrics/metric_fields.py
@@ -32,6 +32,7 @@
 from gt4py.next.experimental import concat_where
 
 from icon4py.model.common import constants, dimension as dims, field_type_aliases as fa
+from icon4py.model.common.decomposition import definitions as decomposition
 from icon4py.model.common.dimension import C2E, C2E2C, C2E2CO, E2C, C2E2CODim, Koff
 from icon4py.model.common.interpolation.stencils.cell_2_edge_interpolation import (
     _cell_2_edge_interpolation,
@@ -596,6 +597,9 @@ def compute_nflat_gradp(
     e_owner_mask: data_alloc.NDArray,
     lateral_boundary_level: int,
     nlev: int,
+    min_reduction: Callable[
+        [data_alloc.NDArray, ModuleType], data_alloc.ScalarT
+    ] = decomposition.single_node_reductions.min,
     array_ns: ModuleType = np,
 ) -> int:
     """
@@ -607,8 +611,8 @@ def compute_nflat_gradp(
         flat_idx_max,
         nlev,
     )
-    nflat_gradp = array_ns.min(mask_array)
-    return nflat_gradp.item()
+    nflat_gradp = min_reduction(mask_array, array_ns=array_ns)
+    return nflat_gradp
 
 
 @gtx.field_operator
diff --git a/model/common/src/icon4py/model/common/metrics/metrics_factory.py b/model/common/src/icon4py/model/common/metrics/metrics_factory.py
index 9d7e9a7a01..16a3ae2440 100644
--- a/model/common/src/icon4py/model/common/metrics/metrics_factory.py
+++ b/model/common/src/icon4py/model/common/metrics/metrics_factory.py
@@ -69,6 +69,7 @@ def __init__(
         exner_expol: float,
         vwind_offctr: float,
         exchange: decomposition.ExchangeRuntime = decomposition.single_node_default,
+        global_reductions: decomposition.Reductions = decomposition.single_node_reductions,
     ):
         self._backend = backend
         self._xp = data_alloc.import_array_ns(backend)
@@ -81,6 +82,7 @@ def __init__(
         self._geometry = geometry_source
         self._exchange = exchange
         self._interpolation_source = interpolation_source
+        self._global_reductions = global_reductions
         log.info(
             f"initialized metrics factory for backend = '{self._backend_name()}' and grid = '{self._grid}'"
         )
@@ -657,7 +659,11 @@ def _register_computed_fields(self) -> None:  # noqa: PLR0915 [too-many-statemen
         self.register_provider(max_flat_index_provider)
 
         nflat_gradp_provider = factory.NumpyDataProvider(
-            func=functools.partial(mf.compute_nflat_gradp, array_ns=self._xp),
+            func=functools.partial(
+                mf.compute_nflat_gradp,
+                array_ns=self._xp,
+                min_reduction=self._global_reductions.min,
+            ),
             domain=(),
             deps={
                 "flat_idx_max": attrs.FLAT_IDX_MAX,
@@ -779,6 +785,7 @@ def _register_computed_fields(self) -> None:  # noqa: PLR0915 [too-many-statemen
             func=functools.partial(
                 compute_coeff_gradekin.compute_coeff_gradekin,
                 array_ns=self._xp,
+                exchange=functools.partial(self._exchange.exchange_and_wait, dims.EdgeDim),
             ),
             domain=(dims.EdgeDim, dims.E2CDim),
             fields=(attrs.COEFF_GRADEKIN,),
diff --git a/model/common/tests/common/decomposition/mpi_tests/test_mpi_decomposition.py b/model/common/tests/common/decomposition/mpi_tests/test_mpi_decomposition.py
index b7a695ce82..93a1fab38a 100644
--- a/model/common/tests/common/decomposition/mpi_tests/test_mpi_decomposition.py
+++ b/model/common/tests/common/decomposition/mpi_tests/test_mpi_decomposition.py
@@ -25,13 +25,14 @@
 import gt4py.next as gtx
 
 import icon4py.model.testing.test_utils as test_helpers
-from icon4py.model.common import dimension as dims
+from icon4py.model.common import dimension as dims, model_backends
 from icon4py.model.common.decomposition import definitions, mpi_decomposition
 from icon4py.model.testing import definitions as test_defs, serialbox
 from icon4py.model.testing.parallel_helpers import check_comm_size, processor_props
 
 from ...fixtures import (
     backend,
+    backend_like,
     data_provider,
     decomposition_info,
     download_ser_data,
@@ -320,7 +321,108 @@ def test_halo_exchange_for_sparse_field(
     print(
         f"{processor_props.rank}/{processor_props.comm_size}: size of computed field {result.asnumpy().shape}"
     )
-
     exchange.exchange_and_wait(dims.CellDim, result)
 
     assert test_helpers.dallclose(result.asnumpy(), field_ref.asnumpy())
+
+
+inputs_ls = [[2.0, 2.0, 4.0, 1.0], [2.0, 1.0], [30.0], []]
+
+
+@pytest.mark.parametrize("global_list", inputs_ls)
+@pytest.mark.mpi
+@pytest.mark.parametrize("processor_props", [True], indirect=True)
+def test_global_reductions_min(
+    processor_props: definitions.ProcessProperties,
+    backend_like: model_backends.BackendLike,
+    global_list: list[float],
+) -> None:
+    my_rank = processor_props.rank
+    xp = data_alloc.import_array_ns(model_backends.get_allocator(backend_like))
+    comm_size = processor_props.comm_size
+    chunks = np.array_split(global_list, comm_size)
+    local_data = xp.array(chunks[my_rank])
+
+    global_reduc = definitions.create_reduction(processor_props)
+
+    if len(global_list) > 0:
+        min_val = global_reduc.min(local_data, array_ns=xp)
+        expected_val = np.min(global_list)
+        assert expected_val == min_val
+    else:
+        with pytest.raises(ValueError, match="global_min requires a non-empty buffer"):
+            global_reduc.min(local_data, array_ns=xp)
+
+
+@pytest.mark.parametrize("global_list", inputs_ls)
+@pytest.mark.mpi
+@pytest.mark.parametrize("processor_props", [True], indirect=True)
+def test_global_reductions_max(
+    processor_props: definitions.ProcessProperties,
+    backend_like: model_backends.BackendLike,
+    global_list: list[float],
+) -> None:
+    my_rank = processor_props.rank
+    xp = data_alloc.import_array_ns(model_backends.get_allocator(backend_like))
+    comm_size = processor_props.comm_size
+    chunks = np.array_split(global_list, comm_size)
+    local_data = xp.array(chunks[my_rank])
+
+    global_reduc = definitions.create_reduction(processor_props)
+
+    if len(global_list) > 0:
+        max_val = global_reduc.max(local_data, array_ns=xp)
+        expected_val = np.max(global_list)
+        assert expected_val == max_val
+    else:
+        with pytest.raises(ValueError, match="global_max requires a non-empty buffer"):
+            global_reduc.max(local_data, array_ns=xp)
+
+
+@pytest.mark.parametrize("global_list", inputs_ls)
+@pytest.mark.mpi
+@pytest.mark.parametrize("processor_props", [True], indirect=True)
+def test_global_reductions_sum(
+    processor_props: definitions.ProcessProperties,
+    backend_like: model_backends.BackendLike,
+    global_list: list[float],
+) -> None:
+    my_rank = processor_props.rank
+    xp = data_alloc.import_array_ns(model_backends.get_allocator(backend_like))
+    comm_size = processor_props.comm_size
+    chunks = np.array_split(global_list, comm_size)
+    local_data = xp.array(chunks[my_rank])
+
+    global_reduc = definitions.create_reduction(processor_props)
+
+    if len(global_list) > 0:
+        sum_val = global_reduc.sum(local_data, array_ns=xp)
+        expected_val = np.sum(global_list)
+        assert expected_val == sum_val
+    else:
+        with pytest.raises(ValueError, match="global_sum requires a non-empty buffer"):
+            global_reduc.sum(local_data, array_ns=xp)
+
+
+@pytest.mark.parametrize("global_list", inputs_ls)
+@pytest.mark.mpi
+@pytest.mark.parametrize("processor_props", [True], indirect=True)
+def test_global_reductions_mean(
+    processor_props: definitions.ProcessProperties,
+    backend_like: model_backends.BackendLike,
+    global_list: list[float],
+) -> None:
+    my_rank = processor_props.rank
+    xp = data_alloc.import_array_ns(model_backends.get_allocator(backend_like))
+    comm_size = processor_props.comm_size
+    chunks = np.array_split(global_list, comm_size)
+    local_data = xp.array(chunks[my_rank])
+    global_reduc = definitions.create_reduction(processor_props)
+
+    if len(global_list) > 0:
+        mean_val = global_reduc.mean(local_data, array_ns=xp)
+        expected_val = np.mean(global_list)
+        assert expected_val == mean_val
+    else:
+        with pytest.raises(ValueError, match="global_mean requires a non-empty buffer"):
+            global_reduc.mean(local_data, array_ns=xp)
diff --git a/model/common/tests/common/fixtures.py b/model/common/tests/common/fixtures.py
index 91dba29f83..10669413e6 100644
--- a/model/common/tests/common/fixtures.py
+++ b/model/common/tests/common/fixtures.py
@@ -135,6 +135,7 @@ def metrics_factory_from_savepoint(
     interpolation_factory_from_savepoint: interpolation_factory.InterpolationFieldsFactory,
 ) -> Generator[metrics_factory.MetricsFieldsFactory]:
     exchange = decomposition.create_exchange(processor_props, decomposition_info)
+    global_reductions = decomposition.create_reduction(processor_props)
     geometry_source = geometry_from_savepoint
     interpolation_field_source = interpolation_factory_from_savepoint
     topography = topography_savepoint.topo_c()
@@ -172,6 +173,7 @@ def metrics_factory_from_savepoint(
         exner_expol=exner_expol,
         vwind_offctr=vwind_offctr,
         exchange=exchange,
+        global_reductions=global_reductions,
     )
 
     yield factory
diff --git a/model/common/tests/common/grid/mpi_tests/test_parallel_geometry.py b/model/common/tests/common/grid/mpi_tests/test_parallel_geometry.py
index 313c44c11f..3e45a9ab40 100644
--- a/model/common/tests/common/grid/mpi_tests/test_parallel_geometry.py
+++ b/model/common/tests/common/grid/mpi_tests/test_parallel_geometry.py
@@ -8,7 +8,6 @@
 
 from __future__ import annotations
 
-import typing
 from typing import TYPE_CHECKING
 
 import gt4py.next as gtx
diff --git a/model/common/tests/common/metrics/mpi_tests/test_parallel_metrics.py b/model/common/tests/common/metrics/mpi_tests/test_parallel_metrics.py
index fca8ef6dd7..144c84698d 100644
--- a/model/common/tests/common/metrics/mpi_tests/test_parallel_metrics.py
+++ b/model/common/tests/common/metrics/mpi_tests/test_parallel_metrics.py
@@ -183,3 +183,24 @@ def test_distributed_metrics_wgtfacq_e(
     field = factory.get(attrs.WGTFACQ_E).asnumpy()
     field_ref = metrics_savepoint.wgtfacq_e_dsl(field.shape[1]).asnumpy()
     assert test_utils.dallclose(field, field_ref)
+
+
+@pytest.mark.datatest
+@pytest.mark.mpi
+@pytest.mark.parametrize("processor_props", [True], indirect=True)
+def test_distributed_metrics_nflat_gradp(
+    backend: gtx_typing.Backend,
+    grid_savepoint: sb.IconGridSavepoint,
+    processor_props: decomposition.ProcessProperties,
+    decomposition_info: decomposition.DecompositionInfo,
+    metrics_factory_from_savepoint: metrics_factory.MetricsFieldsFactory,
+    experiment: test_defs.Experiment,
+) -> None:
+    parallel_helpers.check_comm_size(processor_props)
+    parallel_helpers.log_process_properties(processor_props)
+    parallel_helpers.log_local_field_size(decomposition_info)
+    factory = metrics_factory_from_savepoint
+
+    value = factory.get(attrs.NFLAT_GRADP)
+    value_ref = grid_savepoint.nflat_gradp()
+    assert value == value_ref
diff --git a/model/common/tests/common/metrics/unit_tests/test_compute_coeff_gradekin.py b/model/common/tests/common/metrics/unit_tests/test_compute_coeff_gradekin.py
index 271d21425d..11ff6848c3 100644
--- a/model/common/tests/common/metrics/unit_tests/test_compute_coeff_gradekin.py
+++ b/model/common/tests/common/metrics/unit_tests/test_compute_coeff_gradekin.py
@@ -27,6 +27,8 @@
     ranked_data_path,
 )
 
+from ... import utils
+
 
 if TYPE_CHECKING:
     from icon4py.model.common.grid import base as base_grid
@@ -51,5 +53,6 @@ def test_compute_coeff_gradekin(
         edge_cell_length,
         inv_dual_edge_length,
         horizontal_start,
+        exchange=utils.dummy_exchange,
     )
     assert test_utils.dallclose(coeff_gradekin_ref.asnumpy(), coeff_gradekin_full)
diff --git a/model/standalone_driver/src/icon4py/model/standalone_driver/driver_utils.py b/model/standalone_driver/src/icon4py/model/standalone_driver/driver_utils.py
index 0ccacd5a4e..6bdf2216c8 100644
--- a/model/standalone_driver/src/icon4py/model/standalone_driver/driver_utils.py
+++ b/model/standalone_driver/src/icon4py/model/standalone_driver/driver_utils.py
@@ -61,11 +61,10 @@ def create_grid_manager(
     grid_file_path: pathlib.Path,
     vertical_grid_config: v_grid.VerticalGridConfig,
     allocator: gtx_typing.FieldBufferAllocationUtil,
+    global_reductions: decomposition_defs.Reductions = decomposition_defs.single_node_reductions,
 ) -> gm.GridManager:
     grid_manager = gm.GridManager(
-        gm.ToZeroBasedIndexTransformation(),
-        grid_file_path,
-        vertical_grid_config,
+        gm.ToZeroBasedIndexTransformation(), grid_file_path, vertical_grid_config, global_reductions
     )
     grid_manager(allocator=allocator, keep_skip_values=True)
 
diff --git a/model/standalone_driver/src/icon4py/model/standalone_driver/standalone_driver.py b/model/standalone_driver/src/icon4py/model/standalone_driver/standalone_driver.py
index bd554aa141..185e99c0a7 100644
--- a/model/standalone_driver/src/icon4py/model/standalone_driver/standalone_driver.py
+++ b/model/standalone_driver/src/icon4py/model/standalone_driver/standalone_driver.py
@@ -552,6 +552,7 @@ def initialize_driver(
     )
 
     configuration_file_path = pathlib.Path(configuration_file_path)
+    global_reductions = decomposition_defs.create_global_reduction(parallel_props)
     grid_file_path = pathlib.Path(grid_file_path)
     if pathlib.Path(output_path).exists():
         current_time = datetime.datetime.now()
@@ -578,6 +579,7 @@ def initialize_driver(
         grid_file_path=grid_file_path,
         vertical_grid_config=vertical_grid_config,
         allocator=allocator,
+        global_reductions=global_reductions,
     )
 
     log.info("creating the decomposition info")
diff --git a/model/testing/src/icon4py/model/testing/definitions.py b/model/testing/src/icon4py/model/testing/definitions.py
index 3a39edab6e..e907f96b3f 100644
--- a/model/testing/src/icon4py/model/testing/definitions.py
+++ b/model/testing/src/icon4py/model/testing/definitions.py
@@ -153,8 +153,8 @@ class Experiments:
         num_levels=60,
         partitioned_data={
             1: "https://polybox.ethz.ch/index.php/s/2n2WpTgZFlTCTHu/download",
-            2: "https://polybox.ethz.ch/index.php/s/GQNcLtp4CN7ERbi/download",
-            4: "https://polybox.ethz.ch/index.php/s/XCcE34Ry5EQckoK/download",
+            2: "https://polybox.ethz.ch/index.php/s/nTBgWgzfSBMn2zM/download",
+            4: "https://polybox.ethz.ch/index.php/s/9iq8pW4AHY5mnfc/download",
         },
     )
     MCH_CH_R04B09: Final = Experiment(
@@ -164,8 +164,8 @@ class Experiments:
         num_levels=65,
         partitioned_data={
             1: "https://polybox.ethz.ch/index.php/s/f42nsmvgOoWZPzi/download",
-            2: "https://polybox.ethz.ch/index.php/s/P6F6ZbzWHI881dZ/download",
-            4: "https://polybox.ethz.ch/index.php/s/NfES3j9no15A0aX/download",
+            2: "https://polybox.ethz.ch/index.php/s/ZSwAoox8WnPSmYc/download",
+            4: "https://polybox.ethz.ch/index.php/s/y7AnTai3g5eSnsC/download",
         },
     )
     JW: Final = Experiment(
@@ -173,21 +173,33 @@ class Experiments:
         description="Jablonowski Williamson atmospheric test case",
         grid=Grids.R02B04_GLOBAL,
         num_levels=35,
-        partitioned_data={1: "https://polybox.ethz.ch/index.php/s/5W3Z2K6pyo0egzo/download"},
+        partitioned_data={
+            1: "https://polybox.ethz.ch/index.php/s/5W3Z2K6pyo0egzo/download",
+            2: "https://polybox.ethz.ch/index.php/s/caPLb5TfNCZsRN6/download",
+            4: "https://polybox.ethz.ch/index.php/s/pbxteJRfpzDBWYf/download",
+        },
     )
     GAUSS3D: Final = Experiment(
         name="gauss3d_torus",
         description="Gauss 3d test case",
         grid=Grids.TORUS_50000x5000,
         num_levels=35,
-        partitioned_data={1: "https://polybox.ethz.ch/index.php/s/ZuqDIREPVits9r0/download"},
+        partitioned_data={
+            1: "https://polybox.ethz.ch/index.php/s/ZuqDIREPVits9r0/download",
+            2: "https://polybox.ethz.ch/index.php/s/LoHe823TX5KNNGn/download",
+            4: "https://polybox.ethz.ch/index.php/s/zmW4wZ3btbGLFC7/download",
+        },
     )
     WEISMAN_KLEMP_TORUS: Final = Experiment(
         name="weisman_klemp_torus",
         description="Weisman-Klemp experiment on Torus Grid",
         grid=Grids.TORUS_50000x5000,
         num_levels=64,
-        partitioned_data={1: "https://polybox.ethz.ch/index.php/s/ByLnyii7MMRHJbK/download"},
+        partitioned_data={
+            1: "https://polybox.ethz.ch/index.php/s/ByLnyii7MMRHJbK/download",
+            2: "https://polybox.ethz.ch/index.php/s/dAq2BWe5scmj28D/download",
+            4: "https://polybox.ethz.ch/index.php/s/cw3g9KbTQZ4Ko74/download",
+        },
     )
 
 
diff --git a/model/testing/src/icon4py/model/testing/fixtures/datatest.py b/model/testing/src/icon4py/model/testing/fixtures/datatest.py
index 3058fc7210..28483172a1 100644
--- a/model/testing/src/icon4py/model/testing/fixtures/datatest.py
+++ b/model/testing/src/icon4py/model/testing/fixtures/datatest.py
@@ -164,6 +164,10 @@ def download_ser_data(
     if "not datatest" in request.config.getoption("-k", ""):
         return
 
+    with_mpi = request.config.getoption("with_mpi", False)
+    if with_mpi and experiment == definitions.Experiments.GAUSS3D:
+        # TODO(msimberg): Fix? Need serialized data.
+        pytest.skip("GAUSS3D experiment does not support MPI tests")
     _download_ser_data(processor_props.comm_size, ranked_data_path, experiment)