diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py
index 486af807ab..08d963f23e 100644
--- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py
+++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py
@@ -796,33 +796,57 @@ def _gt_auto_configure_maps_and_strides(
     For a description of the arguments see the `gt_auto_optimize()` function.
     """
-    # We now set the iteration order of the Maps. For that we use `unit_strides_kind`
-    #  argument and if not supplied we guess depending if we are on the GPU or not.
-    if unit_strides_kind is None:
-        unit_strides_kind = (
-            gtx_common.DimensionKind.HORIZONTAL if gpu else gtx_common.DimensionKind.VERTICAL
+    # If `unit_strides_kind` is unknown we will not modify the Map order nor the
+    #  strides, except if we are on GPU. The reason is that the maximum number of
+    #  blocks is different for each dimension. If, for example, the largest dimension
+    #  were associated with the `z` dimension, we would get launch errors at some
+    #  point. Thus in that case we pretend that it is horizontal, which is a valid
+    #  assumption for any ICON-like code or if the GT4Py allocator is used.
+    # TODO(phimuell): Make this selection more intelligent.
+    if unit_strides_kind is None and gpu:
+        prefered_direction_kind: Optional[gtx_common.DimensionKind] = (
+            gtx_common.DimensionKind.HORIZONTAL
+        )
+    else:
+        prefered_direction_kind = unit_strides_kind
+
+    # We should actually use a `gtx.Dimension` here and not a `gtx.DimensionKind`,
+    #  since Dimensions are unique. However, at this stage, especially after the
+    #  expansion of non-standard Memlets (which happens in the GPU transformation),
+    #  associating Map parameters with GT4Py dimensions is very hard, if not
+    #  impossible. The kind is thus the most reliable indicator we have.
+    # NOTE: This is not the only location where we manipulate the Map order, we also
+    #  do it in the GPU transformation, where we have to set the order of the
+    #  expanded Memlets.
+    if prefered_direction_kind is not None:
+        gtx_transformations.gt_set_iteration_order(
+            sdfg=sdfg,
+            unit_strides_kind=prefered_direction_kind,
+            validate=False,
+            validate_all=validate_all,
         )
-    # It is not possible to use the `unit_strides_dim` argument of the
-    #  function, because `LoopBlocking`, if run, changed the name of the
-    #  parameter but the dimension can still be identified by its "kind".
-    gtx_transformations.gt_set_iteration_order(
-        sdfg=sdfg,
-        unit_strides_kind=unit_strides_kind,
-        validate=False,
-        validate_all=validate_all,
-    )
 
     # NOTE: We have to set the strides of transients before the non-standard Memlets
-    #  get expanded, i.e. turned into Maps because no `cudaMemcpy*()` call exists,
-    #  which requires that the final strides are there. Furthermore, Memlet expansion
-    #  has to happen before the GPU block size is set. There are several possible
-    #  solutions for that, of which none is really good. The one that is the least
-    #  bad thing is to set the strides of the transients here. The main downside
-    #  is that this and the `_gt_auto_post_processing()` function has these weird
-    #  names.
-    gtx_transformations.gt_change_strides(sdfg, gpu=gpu)
+    #  get expanded, i.e. turned into Maps, because no matching `cudaMemcpy*()` call
+    #  exists, which requires that the final strides are there. Furthermore, Memlet
+    #  expansion has to happen before the GPU block size is set. There are several
+    #  possible solutions for that, of which none is really good. The least bad one
+    #  is to set the strides of the transients here. The main downside is that we
+    #  slightly modify the SDFG in the GPU transformation after we have set the
+    #  strides.
+    if prefered_direction_kind is not None:
+        gtx_transformations.gt_change_strides(sdfg, prefered_direction_kind=prefered_direction_kind)
 
     if gpu:
+        if unit_strides_kind != gtx_common.DimensionKind.HORIZONTAL:
+            warnings.warn(
+                "The GT4Py DaCe GPU backend assumes that the leading dimension, i.e."
+                " where stride is 1, is of kind 'HORIZONTAL', however it was"
+                f" '{unit_strides_kind}'. Furthermore, it should be the last dimension."
+                " Other configurations might lead to suboptimal performance.",
+                stacklevel=2,
+            )
+
         # TODO(phimuell): The GPU function might modify the map iteration order.
         #  This is because how it is implemented (promotion and fusion). However,
         #  because of its current state, this should not happen, but we have to look
diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/gpu_utils.py b/src/gt4py/next/program_processors/runners/dace/transformations/gpu_utils.py
index 1786913edb..ac81d2cd64 100644
--- a/src/gt4py/next/program_processors/runners/dace/transformations/gpu_utils.py
+++ b/src/gt4py/next/program_processors/runners/dace/transformations/gpu_utils.py
@@ -241,6 +241,7 @@ def restrict_fusion_to_newly_created_maps_horizontal(
     if len(maps_to_modify) == 0:
         return sdfg
 
+    # NOTE: This inherently assumes a particular memory order, see `gt_change_strides()`.
     for me_to_modify in maps_to_modify:
         map_to_modify: dace_nodes.Map = me_to_modify.map
         map_to_modify.params = list(reversed(map_to_modify.params))
diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/strides.py b/src/gt4py/next/program_processors/runners/dace/transformations/strides.py
index 928fa04d54..0840f77755 100644
--- a/src/gt4py/next/program_processors/runners/dace/transformations/strides.py
+++ b/src/gt4py/next/program_processors/runners/dace/transformations/strides.py
@@ -12,6 +12,7 @@
 from dace import data as dace_data
 from dace.sdfg import nodes as dace_nodes
 
+from gt4py.next import common as gtx_common
 from gt4py.next.program_processors.runners.dace import (
     sdfg_args as gtx_dace_args,
     transformations as gtx_transformations,
 )
@@ -33,39 +34,62 @@
 def gt_change_strides(
     sdfg: dace.SDFG,
-    gpu: bool,
+    prefered_direction_kind: gtx_common.DimensionKind,
 ) -> dace.SDFG:
     """Modifies the strides of transients.
 
     The function will analyse the access patterns and set the strides of
-    transients in the optimal way.
-    The function should run after all maps have been created.
-
-    After the strides have been adjusted the function will also propagate
-    the strides into nested SDFG, see `gt_propagate_strides_of()` for more.
+    transients in the optimal way. The function should run after _all_
+    Maps have been created.
+    After the adjustment of the strides they will be propagated into the nested
+    SDFGs, see `gt_propagate_strides_of()` for more.
 
     Args:
         sdfg: The SDFG to process.
-        gpu: If the SDFG is supposed to run on the GPU.
+        prefered_direction_kind: `DimensionKind` of the dimension with stride 1.
 
     Note:
-        Currently the function will not scan the access pattern. Instead it will
-        either use FORTRAN order for GPU or C order. This function needs to be called
-        for both CPU and GPU to handle strides of memlets inside nested SDFGs.
-
-    Todo:
-        - Implement the estimation correctly.
+        - This function should be run after `gt_set_iteration_order()`.
+        - Currently the function will not scan the access pattern. Instead it relies
+          on the default behaviour of the lowering and on how the GT4Py allocator works.
+        - The current implementation assumes that there is only one dimension of the
+          given kind.
     """
-    # TODO(phimeull): Implement this function correctly.
-
-    for nsdfg in sdfg.all_sdfgs_recursive():
-        _gt_change_strides_non_recursive_impl(nsdfg, gpu)
+    # TODO(phimuell): Implement this function correctly, such that it decides the
+    #  order based on the access pattern. Probably also merge it with
+    #  `gt_set_iteration_order()` as the two things are related.
+
+    # NOTE: This function inherently assumes the dimension order defined by
+    #  `gtx_common.order_dimensions()`, the default behaviour of the lowering,
+    #  partially how the GT4Py allocator works, and that there is only one dimension
+    #  of any kind (which is true for ICON4Py, but not in general, for example
+    #  in Cartesian grids). Its base assumption is that the ordering (Map parameters
+    #  and strides) generated by the lowering "out of the box" is in row major/C
+    #  order. Because of the GT4Py dimension order this is the right order for
+    #  `gtx_common.DimensionKind.VERTICAL`. If the primary direction kind is
+    #  `HORIZONTAL`, then according to the GT4Py dimension order column major/FORTRAN
+    #  order should be used. To get there we have to reverse the stride order, which
+    #  `_gt_change_strides_non_recursive_impl()` does. This is very brittle, but at
+    #  this point it is the best we can do.
+
+    match prefered_direction_kind:
+        case gtx_common.DimensionKind.VERTICAL:
+            return  # Nothing to do in that case. Maybe run Memlet propagation here?
+
+        case gtx_common.DimensionKind.HORIZONTAL:
+            for nsdfg in sdfg.all_sdfgs_recursive():
+                _gt_change_strides_non_recursive_impl(nsdfg)
+
+        case _:
+            raise ValueError(
+                f"Encountered unknown `DimensionKind` value: {prefered_direction_kind}"
+            )
 
 
 def _gt_change_strides_non_recursive_impl(
     sdfg: dace.SDFG,
-    gpu: bool,
 ) -> None:
-    """Set optimal strides of all access nodes in the SDFG.
+    """Set "optimal" strides of all access nodes in the SDFG.
 
     The function will look for all top level access node, see
     `_gt_find_toplevel_data_accesses()` and set their strides such that the access
     is optimal, see Note. The function
@@ -74,14 +98,9 @@ def _gt_change_strides_non_recursive_impl(
     This function should never be called directly but always through
     `gt_change_strides()`!
 
     Note:
-        Currently the function just reverses the strides of the data descriptor
-        of transient access nodes it processes. Since DaCe generates `C` order by default
-        this lead to FORTRAN order, which is (for now) sufficient to optimize the memory
-        layout to GPU.
-
-    Todo:
-        Make this function more intelligent to analyse the access pattern and then
-        figuring out the best order.
+        This function has the same underlying assumptions as outlined in
+        `gt_change_strides()`, see there for more information about the
+        assumptions and limitations.
     """
     # NOTE: We have to process all access nodes (transient and globals). If we are inside a
     #  NestedSDFG then they were handled before on the level above us.
@@ -103,7 +122,7 @@ def _gt_change_strides_non_recursive_impl(
         # access nodes because the non-transients come from outside and have their
         #  own strides.
         # TODO(phimuell): Set the stride based on the actual access pattern.
-        if desc.transient and gpu:
+        if desc.transient:
             new_stride_order = list(range(ndim))
             desc.set_strides_from_layout(*new_stride_order)
@@ -124,7 +143,8 @@ def _gt_change_strides_non_recursive_impl(
         )
 
     # Now handle the views.
-    # TODO(phimuell): Remove once `gt_propagate_strides_from_access_node()` can handle views.
+    # TODO(phimuell): Remove once `gt_propagate_strides_from_access_node()` can
+    #  handle views. However, we should get to a point where we do not have views.
     _gt_modify_strides_of_views_non_recursive(sdfg)
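
As a reading aid for the first hunk, here is a minimal standalone sketch of the new direction-kind selection, assuming a locally mocked `DimensionKind` enum; `pick_preferred_direction_kind` is an illustrative name only and not part of GT4Py:

```python
# Standalone sketch of the selection logic in `_gt_auto_configure_maps_and_strides()`
# after this change. `DimensionKind` is mocked here; the real one lives in
# `gt4py.next.common`, and the function name below is purely illustrative.
import enum
from typing import Optional


class DimensionKind(enum.Enum):
    HORIZONTAL = "horizontal"
    VERTICAL = "vertical"


def pick_preferred_direction_kind(
    unit_strides_kind: Optional[DimensionKind], gpu: bool
) -> Optional[DimensionKind]:
    # Unknown kind on GPU: assume HORIZONTAL, so the (usually largest) horizontal
    # dimension does not end up on a GPU block dimension with a small launch limit.
    if unit_strides_kind is None and gpu:
        return DimensionKind.HORIZONTAL
    # Otherwise honour the caller's choice; `None` means "leave Map order and strides alone".
    return unit_strides_kind


# CPU and nothing specified: neither the Map order nor the strides are touched.
assert pick_preferred_direction_kind(None, gpu=False) is None
# GPU and nothing specified: HORIZONTAL is assumed (the diff additionally warns on GPU
# whenever `unit_strides_kind` is not HORIZONTAL, which includes this `None` case).
assert pick_preferred_direction_kind(None, gpu=True) is DimensionKind.HORIZONTAL
# An explicit kind is always respected, on CPU and GPU alike.
assert pick_preferred_direction_kind(DimensionKind.VERTICAL, gpu=True) is DimensionKind.VERTICAL
```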
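
The `strides.py` change hinges on the fact that reversing the stride order of a row-major (C order) array produces a column-major (FORTRAN order) layout, moving the unit stride from the last to the first dimension. A minimal sketch of that relationship, using a made-up `strides_for_layout` helper that only loosely mirrors what `set_strides_from_layout()` does:

```python
# Illustration only (no DaCe/GT4Py imports): `strides_for_layout` is a hypothetical helper.
from typing import Sequence


def strides_for_layout(shape: Sequence[int], layout: Sequence[int]) -> tuple[int, ...]:
    """Return element strides such that `layout[0]` varies fastest in memory."""
    strides = [0] * len(shape)
    stride = 1
    for dim in layout:
        strides[dim] = stride
        stride *= shape[dim]
    return tuple(strides)


shape = (10, 80, 5)  # purely illustrative

# Row major / C order: the last dimension has stride 1 (DaCe's default for transients).
assert strides_for_layout(shape, layout=[2, 1, 0]) == (400, 5, 1)

# Reversing the stride order gives column major / FORTRAN order: the first dimension
# now has stride 1, which is what the HORIZONTAL-first configuration wants.
assert strides_for_layout(shape, layout=[0, 1, 2]) == (1, 10, 800)
```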