@@ -762,33 +762,57 @@ def _gt_auto_configure_maps_and_strides(
For a description of the arguments see the `gt_auto_optimize()` function.
"""

# We now set the iteration order of the Maps. For that we use the `unit_strides_kind`
# argument; if it is not supplied we guess depending on whether we are on the GPU or not.
if unit_strides_kind is None:
unit_strides_kind = (
gtx_common.DimensionKind.HORIZONTAL if gpu else gtx_common.DimensionKind.VERTICAL
# If `unit_strides_kind` is unknown we will not modify the Map order nor the
# strides, except if we are on GPU. The reason for this is that the maximal
# number of blocks differs between the grid dimensions. If the largest dimension
# were, for example, associated with the `z` dimension, we would get launch
# errors at some point. Thus in that case we pretend that it is horizontal,
# which is a valid assumption for any ICON-like code or when the GT4Py
# allocator is used.
# TODO(phimuell): Make this selection more intelligent.
if unit_strides_kind is None and gpu:
prefered_direction_kind: Optional[gtx_common.DimensionKind] = (
gtx_common.DimensionKind.HORIZONTAL
)
else:
prefered_direction_kind = unit_strides_kind

# We should actually use a `gtx.Dimension` here and not a `gtx.DimensionKind`,
# since Dimensions are unique. However, at this stage, especially after the
# expansion of non-standard Memlets (which happens in the GPU transformation),
# associating Map parameters with GT4Py dimensions is very hard, if not
# impossible. Here the kind is the most reliable indicator we have.
# NOTE: This is not the only location where we manipulate the Map order; we also
# do it in the GPU transformation, where we have to set the order of the
# expanded Memlets.
if prefered_direction_kind is not None:
gtx_transformations.gt_set_iteration_order(
sdfg=sdfg,
unit_strides_kind=prefered_direction_kind,
validate=False,
validate_all=validate_all,
)
# It is not possible to use the `unit_strides_dim` argument of the
# function, because `LoopBlocking`, if it was run, has changed the name of the
# parameter; the dimension can still be identified by its "kind".
gtx_transformations.gt_set_iteration_order(
sdfg=sdfg,
unit_strides_kind=unit_strides_kind,
validate=False,
validate_all=validate_all,
)

# NOTE: We have to set the strides of transients before the non-standard Memlets
# get expanded, i.e. turned into Maps because no `cudaMemcpy*()` call exists,
# which requires that the final strides are there. Furthermore, Memlet expansion
# has to happen before the GPU block size is set. There are several possible
# solutions for that, of which none is really good. The one that is the least
# bad thing is to set the strides of the transients here. The main downside
# is that this and the `_gt_auto_post_processing()` function have these weird
# names.
gtx_transformations.gt_change_strides(sdfg, gpu=gpu)
# get expanded, i.e. turned into Maps (because no matching `cudaMemcpy*()` call
# exists); that expansion requires the final strides to already be in place.
# Furthermore, Memlet expansion has to happen before the GPU block size is set.
# There are several possible solutions for this ordering problem, none of which
# is really good. The least bad one is to set the strides of the transients
# here. The main downside is that the GPU transformation slightly modifies the
# SDFG after we have already set the strides.
if prefered_direction_kind is not None:
gtx_transformations.gt_change_strides(sdfg, prefered_direction_kind=prefered_direction_kind)

if gpu:
if unit_strides_kind != gtx_common.DimensionKind.HORIZONTAL:
warnings.warn(
"The GT4Py DaCe GPU backend assumes that the leading dimension, i.e."
" where stride is 1, is of kind 'HORIZONTAL', however it was"
f" '{unit_strides_kind}'. Furthermore, it should be the last dimension."
" Other configurations might lead to suboptimal performance.",
stacklevel=2,
)

# TODO(phimuell): The GPU function might modify the Map iteration order.
# This is because of how it is implemented (promotion and fusion). However,
# because of its current state, this should not happen, but we have to look
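To make the fallback described in the comments above concrete, here is a minimal, self-contained sketch of the kind-selection logic. The helper name `resolve_preferred_kind` is hypothetical and used only for illustration; it is not part of this module.

from typing import Optional

from gt4py.next import common as gtx_common


def resolve_preferred_kind(
    unit_strides_kind: Optional[gtx_common.DimensionKind],
    gpu: bool,
) -> Optional[gtx_common.DimensionKind]:
    # An explicit request always wins; only guess when nothing was supplied and
    # we target the GPU, where the per-dimension launch limits make the
    # horizontal (usually largest) dimension the safe default.
    if unit_strides_kind is None and gpu:
        return gtx_common.DimensionKind.HORIZONTAL
    return unit_strides_kind


assert resolve_preferred_kind(None, gpu=True) is gtx_common.DimensionKind.HORIZONTAL
assert resolve_preferred_kind(None, gpu=False) is None
assert (
    resolve_preferred_kind(gtx_common.DimensionKind.VERTICAL, gpu=True)
    is gtx_common.DimensionKind.VERTICAL
)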
@@ -241,6 +241,7 @@ def restrict_fusion_to_newly_created_maps_horizontal(
if len(maps_to_modify) == 0:
return sdfg

# NOTE: This inherently assumes a particular memory order, see `gt_change_strides()`.
for me_to_modify in maps_to_modify:
map_to_modify: dace_nodes.Map = me_to_modify.map
map_to_modify.params = list(reversed(map_to_modify.params))
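As a toy illustration (plain Python, no DaCe) of why the parameter reversal above matters for memory order: under the assumption that the last Map parameter becomes the innermost loop, the parameter order decides whether the innermost loop walks with unit stride or jumps across whole rows. That assumption about DaCe's lowering is ours and is stated here only to motivate the reversal.

ncols = 8
row_major_strides = (ncols, 1)  # strides of a row-major (nrows, ncols) buffer, in elements


def innermost_stride(params: list[str]) -> int:
    # Stride seen by the parameter that varies fastest, assuming the last Map
    # parameter becomes the innermost loop.
    dim_of = {"i": 0, "j": 1}
    return row_major_strides[dim_of[params[-1]]]


assert innermost_stride(["i", "j"]) == 1                      # `j` innermost: unit stride
assert innermost_stride(list(reversed(["i", "j"]))) == ncols  # `i` innermost: strided access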
@@ -12,6 +12,7 @@
from dace import data as dace_data
from dace.sdfg import nodes as dace_nodes

from gt4py.next import common as gtx_common
from gt4py.next.program_processors.runners.dace import (
sdfg_args as gtx_dace_args,
transformations as gtx_transformations,
@@ -33,39 +34,62 @@

def gt_change_strides(
sdfg: dace.SDFG,
gpu: bool,
prefered_direction_kind: gtx_common.DimensionKind,
) -> dace.SDFG:
"""Modifies the strides of transients.

The function will analyse the access patterns and set the strides of
transients in the optimal way.
The function should run after all maps have been created.
transients in the optimal way. The function should run after _all_
Maps have been created.
After the adjustment of the strides they will be propagated into the nested
SDFGs, see `gt_propagate_strides_of()` for more.

After the strides have been adjusted the function will also propagate
the strides into nested SDFG, see `gt_propagate_strides_of()` for more.
Args:
sdfg: The SDFG to process.
gpu: If the SDFG is supposed to run on the GPU.
prefered_direction_kind: `DimensionKind` of the dimension with stride 1.

Note:
Currently the function will not scan the access pattern. Instead it will
use either FORTRAN order (for GPU) or C order. This function needs to be called
for both CPU and GPU to handle strides of memlets inside nested SDFGs.

Todo:
- Implement the estimation correctly.
- This function should run after `gt_set_iteration_order()` has run.
- Currently the function does not scan the access pattern. Instead it relies
on the default behaviour of the lowering and how the GT4Py allocator works.
- The current implementation assumes that there is only one dimension of the
given kind.
"""
# TODO(phimuell): Implement this function correctly.

for nsdfg in sdfg.all_sdfgs_recursive():
_gt_change_strides_non_recursive_impl(nsdfg, gpu)
# TODO(phimuell): Implement this function correctly, such that it decides the
# order based on the access pattern. Probably also merge it with
# `gt_set_iteration_order()` as the two things are related.

# NOTE: This function inherently assumes the dimension order defined by
# `gtx_common.order_dimensions()`, the default behaviour of the lowering,
# partially how the GT4Py allocator works, and that there is only one dimension
# of any kind (which is true for ICON4Py, but not in general, for example
# in Cartesian grids). Its base assumption is that the ordering (Map parameters
# and strides) generated by the lowering "out of the box" is row major/C
# order. Because of the GT4Py dimension order this is the right order for
# `gtx_common.DimensionKind.VERTICAL`. If the primary direction kind is
# `HORIZONTAL`, then according to the GT4Py dimension order column major/FORTRAN
# order should be used. To get there we have to reverse the stride order, which
# `_gt_change_strides_non_recursive_impl()` does. This is very brittle, but at
# this point it is the best we can do.

match prefered_direction_kind:
case gtx_common.DimensionKind.VERTICAL:
return # Nothing to do in that case. Maybe run Memlet propagation here?

case gtx_common.DimensionKind.HORIZONTAL:
for nsdfg in sdfg.all_sdfgs_recursive():
_gt_change_strides_non_recursive_impl(nsdfg)

case _:
raise ValueError(
f"Encountered unknown `DimensionKind` value: {prefered_direction_kind}"
)
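A pure-Python sketch of the assumption spelled out in the NOTE above: with the GT4Py dimension order (horizontal before vertical) the lowering's default row-major/C layout puts stride 1 on the vertical dimension, so a `HORIZONTAL` preference requires flipping to column-major/FORTRAN order. The shape below is made up and serves only as an example.

def c_order_strides(shape: tuple[int, ...]) -> tuple[int, ...]:
    # Row-major strides: the last dimension varies fastest.
    strides, step = [0] * len(shape), 1
    for i in reversed(range(len(shape))):
        strides[i] = step
        step *= shape[i]
    return tuple(strides)


shape = (10_000, 80)  # (horizontal cells, vertical levels), made-up sizes
assert c_order_strides(shape) == (80, 1)  # default: the vertical dimension has stride 1

# Reversing the dimension priority yields column-major/FORTRAN strides,
# which is what the `HORIZONTAL` branch above asks for.
f_order_strides = tuple(reversed(c_order_strides(tuple(reversed(shape)))))
assert f_order_strides == (1, 10_000)  # now the horizontal dimension has stride 1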


def _gt_change_strides_non_recursive_impl(
sdfg: dace.SDFG,
gpu: bool,
) -> None:
"""Set optimal strides of all access nodes in the SDFG.
"""Set "optimal" strides of all access nodes in the SDFG.

The function will look for all top-level access nodes, see `_gt_find_toplevel_data_accesses()`,
and set their strides such that the access is optimal, see the Note. The function
@@ -74,14 +98,9 @@ def _gt_change_strides_non_recursive_impl(
This function should never be called directly but always through `gt_change_strides()`!

Note:
Currently the function just reverses the strides of the data descriptor
of transient access nodes it processes. Since DaCe generates `C` order by default
this leads to FORTRAN order, which is (for now) sufficient to optimize the memory
layout for the GPU.

Todo:
Make this function more intelligent: analyse the access pattern and then
figure out the best order.
This function has the same underlying assumptions as outlined in
`gt_change_strides()`; see there for more information about the underlying
assumptions and limitations.
"""
# NOTE: We have to process all access nodes (transients and globals). If we are inside a
# NestedSDFG then they were already handled on the level above us.
@@ -103,7 +122,7 @@
# access nodes because the non-transients come from outside and have their
# own strides.
# TODO(phimuell): Set the stride based on the actual access pattern.
if desc.transient and gpu:
if desc.transient:
new_stride_order = list(range(ndim))
desc.set_strides_from_layout(*new_stride_order)
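For illustration, here is our reading of what `set_strides_from_layout(0, 1, ..., ndim-1)` achieves for a transient: listing the dimensions from fastest- to slowest-varying so that dimension 0 ends up with stride 1, i.e. FORTRAN order. The helper below is a hypothetical re-implementation for this sketch only, not DaCe's actual code, and the interpretation of the layout argument is an assumption.

def strides_from_layout(shape: tuple[int, ...], layout: tuple[int, ...]) -> tuple[int, ...]:
    # `layout` lists the dimensions from fastest- to slowest-varying (our
    # assumption about the DaCe call above); compute the resulting strides.
    strides, step = [0] * len(shape), 1
    for dim in layout:
        strides[dim] = step
        step *= shape[dim]
    return tuple(strides)


shape = (10_000, 80, 3)  # made-up shape
assert strides_from_layout(shape, (2, 1, 0)) == (240, 3, 1)  # row-major/C default
assert strides_from_layout(shape, (0, 1, 2)) == (1, 10_000, 800_000)  # what the call above requests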

@@ -124,7 +143,8 @@
)

# Now handle the views.
# TODO(phimuell): Remove once `gt_propagate_strides_from_access_node()` can handle views.
# TODO(phimuell): Remove once `gt_propagate_strides_from_access_node()` can
# handle views. However, we should get to a point where we do not have views.
_gt_modify_strides_of_views_non_recursive(sdfg)

