@@ -762,33 +762,57 @@ def _gt_auto_configure_maps_and_strides(
For a description of the arguments see the `gt_auto_optimize()` function.
"""

# We now set the iteration order of the Maps. For that we use the `unit_strides_kind`
# argument; if it is not supplied we guess depending on whether we are on the GPU or not.
if unit_strides_kind is None:
unit_strides_kind = (
gtx_common.DimensionKind.HORIZONTAL if gpu else gtx_common.DimensionKind.VERTICAL
# If `unit_strides_kind` is unknown we will not modify the Map order nor the
# strides, except if we are on GPU. The reason for this is that the maximal
# number of blocks differs between the grid dimensions. If the largest dimension
# were, for example, associated with the `z` dimension, we would get launch
# errors at some point. Thus in that case we pretend that it is horizontal,
# which is a valid assumption for any ICON-like code or when the GT4Py
# allocator is used.
# TODO(phimuell): Make this selection more intelligent.
if unit_strides_kind is None and gpu:
prefered_direction_kind: Optional[gtx_common.DimensionKind] = (
gtx_common.DimensionKind.HORIZONTAL
)
else:
prefered_direction_kind = unit_strides_kind

# We should actually use a `gtx.Dimension` here and not a `gtx.DimensionKind`,
# since Dimensions are unique. However, at this stage, especially after the
# expansion of non-standard Memlets (which happens in the GPU transformation),
# associating Map parameters with GT4Py dimensions is very hard, if not
# impossible. Here the kind is the most reliable indicator we have.
# NOTE: This is not the only location where we manipulate the Map order; we also
# do it in the GPU transformation, where we have to set the order of the
# expanded Memlets.
if prefered_direction_kind is not None:
gtx_transformations.gt_set_iteration_order(
sdfg=sdfg,
unit_strides_kind=prefered_direction_kind,
validate=False,
validate_all=validate_all,
)
# It is not possible to use the `unit_strides_dim` argument of the
# function, because `LoopBlocking`, if it was run, has changed the name of the
# parameter; the dimension can still be identified by its "kind".
gtx_transformations.gt_set_iteration_order(
sdfg=sdfg,
unit_strides_kind=unit_strides_kind,
validate=False,
validate_all=validate_all,
)

# NOTE: We have to set the strides of transients before the non-standard Memlets
# get expanded, i.e. turned into Maps because no `cudaMemcpy*()` call exists,
# which requires that the final strides are there. Furthermore, Memlet expansion
# has to happen before the GPU block size is set. There are several possible
# solutions for that, of which none is really good. The one that is the least
# bad thing is to set the strides of the transients here. The main downside
# is that this and the `_gt_auto_post_processing()` function have these weird
# names.
gtx_transformations.gt_change_strides(sdfg, gpu=gpu)
# get expanded, i.e. turned into Maps (because no matching `cudaMemcpy*()` call
# exists); that expansion requires the final strides to already be in place.
# Furthermore, Memlet expansion has to happen before the GPU block size is set.
# There are several possible solutions for this ordering problem, none of which
# is really good. The least bad one is to set the strides of the transients
# here. The main downside is that the GPU transformation slightly modifies the
# SDFG after we have already set the strides.
if prefered_direction_kind is not None:
gtx_transformations.gt_change_strides(sdfg, prefered_direction_kind=prefered_direction_kind)

if gpu:
if unit_strides_kind != gtx_common.DimensionKind.HORIZONTAL:
warnings.warn(
"The GT4Py DaCe GPU backend assumes that the leading dimension, i.e."
" where stride is 1, is of kind 'HORIZONTAL', however it was"
f" '{unit_strides_kind}'. Furthermore, it should be the last dimension."
" Other configurations might lead to suboptimal performance.",
stacklevel=2,
)

# TODO(phimuell): The GPU function might modify the Map iteration order.
# This is because of how it is implemented (promotion and fusion). However,
# because of its current state, this should not happen, but we have to look
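To make the fallback described in the comments above concrete, here is a minimal, self-contained sketch of the kind-selection logic. The helper name `resolve_preferred_kind` is hypothetical and used only for illustration; it is not part of this module.

from typing import Optional

from gt4py.next import common as gtx_common


def resolve_preferred_kind(
    unit_strides_kind: Optional[gtx_common.DimensionKind],
    gpu: bool,
) -> Optional[gtx_common.DimensionKind]:
    # An explicit request always wins; only guess when nothing was supplied and
    # we target the GPU, where the per-dimension launch limits make the
    # horizontal (usually largest) dimension the safe default.
    if unit_strides_kind is None and gpu:
        return gtx_common.DimensionKind.HORIZONTAL
    return unit_strides_kind


assert resolve_preferred_kind(None, gpu=True) is gtx_common.DimensionKind.HORIZONTAL
assert resolve_preferred_kind(None, gpu=False) is None
assert (
    resolve_preferred_kind(gtx_common.DimensionKind.VERTICAL, gpu=True)
    is gtx_common.DimensionKind.VERTICAL
)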
@@ -241,6 +241,7 @@ def restrict_fusion_to_newly_created_maps_horizontal(
if len(maps_to_modify) == 0:
return sdfg

# NOTE: This inherently assumes a particular memory order, see `gt_change_strides()`.
for me_to_modify in maps_to_modify:
map_to_modify: dace_nodes.Map = me_to_modify.map
map_to_modify.params = list(reversed(map_to_modify.params))
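As a toy illustration (plain Python, no DaCe) of why the parameter reversal above matters for memory order: under the assumption that the last Map parameter becomes the innermost loop, the parameter order decides whether the innermost loop walks with unit stride or jumps across whole rows. That assumption about DaCe's lowering is ours and is stated here only to motivate the reversal.

ncols = 8
row_major_strides = (ncols, 1)  # strides of a row-major (nrows, ncols) buffer, in elements


def innermost_stride(params: list[str]) -> int:
    # Stride seen by the parameter that varies fastest, assuming the last Map
    # parameter becomes the innermost loop.
    dim_of = {"i": 0, "j": 1}
    return row_major_strides[dim_of[params[-1]]]


assert innermost_stride(["i", "j"]) == 1                      # `j` innermost: unit stride
assert innermost_stride(list(reversed(["i", "j"]))) == ncols  # `i` innermost: strided access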
@@ -12,6 +12,7 @@
from dace import data as dace_data
from dace.sdfg import nodes as dace_nodes

from gt4py.next import common as gtx_common
from gt4py.next.program_processors.runners.dace import (
sdfg_args as gtx_dace_args,
transformations as gtx_transformations,
@@ -33,39 +34,62 @@

def gt_change_strides(
sdfg: dace.SDFG,
gpu: bool,
prefered_direction_kind: gtx_common.DimensionKind,
) -> dace.SDFG:
"""Modifies the strides of transients.

The function will analyse the access patterns and set the strides of
transients in the optimal way.
The function should run after all maps have been created.
transients in the optimal way. The function should run after _all_
Maps have been created.
After the adjustment of the strides they will be propagated into the nested
SDFGs, see `gt_propagate_strides_of()` for more.

After the strides have been adjusted the function will also propagate
the strides into nested SDFG, see `gt_propagate_strides_of()` for more.
Args:
sdfg: The SDFG to process.
gpu: If the SDFG is supposed to run on the GPU.
prefered_direction_kind: `DimensionKind` of the dimension with stride 1.

Note:
Currently the function will not scan the access pattern. Instead it will
use either FORTRAN order (for GPU) or C order. This function needs to be called
for both CPU and GPU to handle strides of memlets inside nested SDFGs.

Todo:
- Implement the estimation correctly.
- This function should run after `gt_set_iteration_order()` has run.
- Currently the function does not scan the access pattern. Instead it relies
on the default behaviour of the lowering and how the GT4Py allocator works.
- The current implementation assumes that there is only one dimension of the
given kind.
"""
# TODO(phimuell): Implement this function correctly.

for nsdfg in sdfg.all_sdfgs_recursive():
_gt_change_strides_non_recursive_impl(nsdfg, gpu)
# TODO(phimuell): Implement this function correctly, such that it decides the
# order based on the access pattern. Probably also merge it with
# `gt_set_iteration_order()` as the two things are related.

# NOTE: This function inherently assumes the dimension order defined by
# `gtx_common.order_dimensions()`, the default behaviour of the lowering,
# partially how the GT4Py allocator works, and that there is only one dimension
# of any kind (which is true for ICON4Py, but not in general, for example
# in Cartesian grids). Its base assumption is that the ordering (Map parameters
# and strides) generated by the lowering "out of the box" is row major/C
# order. Because of the GT4Py dimension order this is the right order for
# `gtx_common.DimensionKind.VERTICAL`. If the primary direction kind is
# `HORIZONTAL`, then according to the GT4Py dimension order column major/FORTRAN
# order should be used. To get there we have to reverse the stride order, which
# `_gt_change_strides_non_recursive_impl()` does. This is very brittle, but at
# this point it is the best we can do.

match prefered_direction_kind:
case gtx_common.DimensionKind.VERTICAL:
return # Nothing to do in that case. Maybe run Memlet propagation here?

case gtx_common.DimensionKind.HORIZONTAL:
for nsdfg in sdfg.all_sdfgs_recursive():
_gt_change_strides_non_recursive_impl(nsdfg)

case _:
raise ValueError(
f"Encountered unknown `DimensionKind` value: {prefered_direction_kind}"
)
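A pure-Python sketch of the assumption spelled out in the NOTE above: with the GT4Py dimension order (horizontal before vertical) the lowering's default row-major/C layout puts stride 1 on the vertical dimension, so a `HORIZONTAL` preference requires flipping to column-major/FORTRAN order. The shape below is made up and serves only as an example.

def c_order_strides(shape: tuple[int, ...]) -> tuple[int, ...]:
    # Row-major strides: the last dimension varies fastest.
    strides, step = [0] * len(shape), 1
    for i in reversed(range(len(shape))):
        strides[i] = step
        step *= shape[i]
    return tuple(strides)


shape = (10_000, 80)  # (horizontal cells, vertical levels), made-up sizes
assert c_order_strides(shape) == (80, 1)  # default: the vertical dimension has stride 1

# Reversing the dimension priority yields column-major/FORTRAN strides,
# which is what the `HORIZONTAL` branch above asks for.
f_order_strides = tuple(reversed(c_order_strides(tuple(reversed(shape)))))
assert f_order_strides == (1, 10_000)  # now the horizontal dimension has stride 1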


def _gt_change_strides_non_recursive_impl(
sdfg: dace.SDFG,
gpu: bool,
) -> None:
"""Set optimal strides of all access nodes in the SDFG.
"""Set "optimal" strides of all access nodes in the SDFG.

The function will look for all top-level access nodes, see `_gt_find_toplevel_data_accesses()`,
and set their strides such that the access is optimal, see the Note. The function
@@ -74,14 +98,9 @@ def _gt_change_strides_non_recursive_impl(
This function should never be called directly but always through `gt_change_strides()`!

Note:
Currently the function just reverses the strides of the data descriptor
of transient access nodes it processes. Since DaCe generates `C` order by default
this leads to FORTRAN order, which is (for now) sufficient to optimize the memory
layout for the GPU.

Todo:
Make this function more intelligent: analyse the access pattern and then
figure out the best order.
This function has the same underlying assumptions as outlined in
`gt_change_strides()`; see there for more information about the underlying
assumptions and limitations.
"""
# NOTE: We have to process all access nodes (transients and globals). If we are inside a
# NestedSDFG then they were already handled on the level above us.
@@ -103,7 +122,7 @@
# access nodes because the non-transients come from outside and have their
# own strides.
# TODO(phimuell): Set the stride based on the actual access pattern.
if desc.transient and gpu:
if desc.transient:
new_stride_order = list(range(ndim))
desc.set_strides_from_layout(*new_stride_order)
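For illustration, here is our reading of what `set_strides_from_layout(0, 1, ..., ndim-1)` achieves for a transient: listing the dimensions from fastest- to slowest-varying so that dimension 0 ends up with stride 1, i.e. FORTRAN order. The helper below is a hypothetical re-implementation for this sketch only, not DaCe's actual code, and the interpretation of the layout argument is an assumption.

def strides_from_layout(shape: tuple[int, ...], layout: tuple[int, ...]) -> tuple[int, ...]:
    # `layout` lists the dimensions from fastest- to slowest-varying (our
    # assumption about the DaCe call above); compute the resulting strides.
    strides, step = [0] * len(shape), 1
    for dim in layout:
        strides[dim] = step
        step *= shape[dim]
    return tuple(strides)


shape = (10_000, 80, 3)  # made-up shape
assert strides_from_layout(shape, (2, 1, 0)) == (240, 3, 1)  # row-major/C default
assert strides_from_layout(shape, (0, 1, 2)) == (1, 10_000, 800_000)  # what the call above requests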

@@ -124,7 +143,8 @@
)

# Now handle the views.
# TODO(phimuell): Remove once `gt_propagate_strides_from_access_node()` can handle views.
# TODO(phimuell): Remove once `gt_propagate_strides_from_access_node()` can
# handle views. However, we should get to a point where we do not have views.
_gt_modify_strides_of_views_non_recursive(sdfg)

