@@ -762,12 +762,12 @@ def _gt_auto_configure_maps_and_strides(
For a description of the arguments see the `gt_auto_optimize()` function.
"""

# We now set the iteration order of the Maps. For that we use the `unit_strides_kind`
# argument; if it is not supplied we guess it depending on whether we are on the GPU or not.
# If no unit stride is given explicitly we assume that it is in the horizontal.
# NOTE: Previously the optimizer assumed that the memory layout was different for
# GPU (horizontal first) and CPU (vertical first). However, this was wrong.
if unit_strides_kind is None:
unit_strides_kind = (
gtx_common.DimensionKind.HORIZONTAL if gpu else gtx_common.DimensionKind.VERTICAL
)
unit_strides_kind = gtx_common.DimensionKind.HORIZONTAL
Contributor

Why does that make sense? You cannot assume anything...

Contributor

Or is that just for transients? Then I would change the comment assume -> set or something.

Contributor Author

There are two things here: first, the name is bad and should probably be something else.
However, the value selection is correct; one could even argue that it is probably the only one that makes sense.
The reason for this is that the maximal number of blocks is different for each direction; because (for ICON) size(horizontal) >>> size(vertical), one would get launch errors otherwise.
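A minimal sketch of that argument (not part of the PR; the sizes and the thread-block shape below are made-up placeholders, only the CUDA grid limits are fixed: gridDim.x up to 2**31 - 1, gridDim.y/z up to 65535):

import math

horizontal_size = 20_000_000  # placeholder, e.g. number of cells on a fine ICON grid
vertical_size = 80            # placeholder, e.g. number of vertical levels
block = (32, 8)               # placeholder thread-block shape (x, y)

blocks_h = math.ceil(horizontal_size / block[0])  # 625_000 blocks
blocks_v = math.ceil(vertical_size / block[1])    # 10 blocks

# Mapping the large horizontal dimension onto gridDim.y or gridDim.z (limit 65535)
# would already fail to launch here, while gridDim.x (limit 2**31 - 1) is fine;
# the small vertical dimension fits into any grid dimension.
assert 65535 < blocks_h <= 2**31 - 1
assert blocks_v <= 65535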


# It is not possible to use the `unit_strides_dim` argument of the
# function, because `LoopBlocking`, if run, may have changed the name of the
# parameter; the dimension, however, can still be identified by its "kind".
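For context (not part of the diff): a minimal sketch of the kind-based identification the comment refers to, assuming the usual `gt4py.next.common` import and purely hypothetical dimension names:

from gt4py.next import common as gtx_common

# Hypothetical dimensions for illustration only; the real names depend on the model.
IDim = gtx_common.Dimension("IDim", kind=gtx_common.DimensionKind.HORIZONTAL)
KDim = gtx_common.Dimension("KDim", kind=gtx_common.DimensionKind.VERTICAL)

# A transformation such as `LoopBlocking` may rename the map parameter that was
# derived from `IDim`, but the dimension is still recognisable through its kind.
assert IDim.kind is gtx_common.DimensionKind.HORIZONTAL
assert KDim.kind is gtx_common.DimensionKind.VERTICAL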
@@ -782,11 +782,10 @@ def _gt_auto_configure_maps_and_strides(
# get expanded, i.e. turned into Maps because no `cudaMemcpy*()` call exists,
# which requires that the final strides are there. Furthermore, Memlet expansion
# has to happen before the GPU block size is set. There are several possible
# solutions for that, of which none is really good. The one that is the least
# bad is to set the strides of the transients here. The main downside
# is that this and the `_gt_auto_post_processing()` function have these weird
# names.
gtx_transformations.gt_change_strides(sdfg, gpu=gpu)
# solutions for that, of which none is really good. The least bad one is to
# set the strides of the transients here. The main downside is that this and
# the `_gt_auto_post_processing()` function have these weird names.
gtx_transformations.gt_change_strides(sdfg)

if gpu:
# TODO(phimuell): The GPU function might modify the map iteration order.
@@ -33,37 +33,44 @@

def gt_change_strides(
sdfg: dace.SDFG,
gpu: bool,
) -> dace.SDFG:
"""Modifies the strides of transients.

The function will analyse the access patterns and set the strides of
transients in the optimal way.
The function should run after all maps have been created.
transients in the optimal way. The function should run after _all_
Maps have been created.
After the adjustment of the strides they will be propagated into the nested
SDFGs; see `gt_propagate_strides_of()` for more.

After the strides have been adjusted the function will also propagate
the strides into nested SDFGs; see `gt_propagate_strides_of()` for more.
Args:
sdfg: The SDFG to process.
gpu: If the SDFG is supposed to run on the GPU.

Note:
Currently the function will not scan the access pattern. Instead it will
either use FORTRAN order (for GPU) or C order (for CPU). This function needs to be called
translate the memory layout such that the horizontal dimension has stride 1,
which is used by the GT4Py allocator. This function needs to be called
for both CPU and GPU to handle strides of memlets inside nested SDFGs.

Todo:
- Implement the estimation correctly.
Furthermore, the current implementation assumes that there is only one
horizontal dimension.
"""
# TODO(phimuell): Implement this function correctly.
# TODO(phimuell): Implement this function correctly, such that it decides the
# order based on the access pattern. Probably also merge it with
# `gt_set_iteration_order()` as the two things are related.
# TODO(phimuell): The current implementation assumes that there is only one
# horizontal dimension. If there are multiple horizontal ones then we might
# have a problem.
# NOTE: This function builds on the fact that in GT4Py the horizontal dimension
# is always the first dimension, i.e. column or FORTRAN order, and that in
# DaCe the default order (which the lowering uses) is row or C order.
# Thus we just have to invert the order for all transients and propagate
# the new strides.

for nsdfg in sdfg.all_sdfgs_recursive():
_gt_change_strides_non_recursive_impl(nsdfg, gpu)
_gt_change_strides_non_recursive_impl(nsdfg)


def _gt_change_strides_non_recursive_impl(
sdfg: dace.SDFG,
gpu: bool,
) -> None:
"""Set optimal strides of all access nodes in the SDFG.

@@ -103,7 +110,7 @@ def _gt_change_strides_non_recursive_impl(
# access nodes because the non-transients come from outside and have their
# own strides.
# TODO(phimuell): Set the stride based on the actual access pattern.
if desc.transient and gpu:
if desc.transient:
new_stride_order = list(range(ndim))
desc.set_strides_from_layout(*new_stride_order)

@@ -124,7 +131,8 @@ def _gt_change_strides_non_recursive_impl(
)

# Now handle the views.
# TODO(phimuell): Remove once `gt_propagate_strides_from_access_node()` can handle views.
# TODO(phimuell): Remove once `gt_propagate_strides_from_access_node()` can
# handle views. However, we should get to a point where we do not have views.
_gt_modify_strides_of_views_non_recursive(sdfg)
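To make the C-order vs. FORTRAN-order point in the NOTE inside `gt_change_strides` concrete, a minimal self-contained sketch (NumPy is used only for illustration; the function itself rewrites the strides of the data descriptors via `set_strides_from_layout`):

import numpy as np

cells, levels = 1_000, 80  # placeholder sizes

# DaCe's default layout is row-major (C order): the last dimension has stride 1.
c_arr = np.empty((cells, levels), dtype=np.float64, order="C")
# GT4Py's layout is column-major (FORTRAN order): the first, i.e. horizontal,
# dimension has stride 1, which is what inverting the stride order produces.
f_arr = np.empty((cells, levels), dtype=np.float64, order="F")

assert c_arr.strides == (levels * 8, 8)  # (640, 8) bytes
assert f_arr.strides == (8, cells * 8)   # (8, 8000) bytes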

