From 9715e393c7ba3fe339253728740fbe106046b863 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Wed, 26 Jul 2023 18:47:24 +0000 Subject: [PATCH 1/6] Tune partition on SM80 --- .../dispatch/tuning/tuning_select_if.cuh | 190 +++++++++++++++++- 1 file changed, 188 insertions(+), 2 deletions(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh index 171d2d598d7..6b5119c2725 100644 --- a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh @@ -33,6 +33,7 @@ #include #include #include +#include "cub/util_device.cuh" CUB_NAMESPACE_BEGIN @@ -382,12 +383,172 @@ struct sm90_tuning<__uint128_t, flagged::yes, keep_rejects::yes, offset_size::_4 }; #endif + +template (), + input_size InputSize = classify_input_size()> +struct sm80_tuning +{ + static constexpr int threads = 128; + + static constexpr int nominal_4b_items_per_thread = 10; + + static constexpr int items = + CUB_MIN(nominal_4b_items_per_thread, + CUB_MAX(1, (nominal_4b_items_per_thread * 4 / sizeof(InputT)))); + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + + using delay_constructor = detail::fixed_delay_constructor_t<350, 450>; +}; + +// partition::if +template +struct sm80_tuning +{ + static constexpr int threads = 512; + static constexpr int items = 20; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + + using delay_constructor = detail::no_delay_constructor_t<510>; +}; + +template +struct sm80_tuning +{ + static constexpr int threads = 224; + static constexpr int items = 18; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_TRANSPOSE; + + using delay_constructor = detail::no_delay_constructor_t<1045>; +}; + +template +struct sm80_tuning +{ + static constexpr int threads = 192; + static constexpr int items = 15; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + + using delay_constructor = detail::no_delay_constructor_t<1040>; +}; + +template +struct sm80_tuning +{ + static constexpr int threads = 192; + static constexpr int items = 10; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_TRANSPOSE; + + using delay_constructor = detail::fixed_delay_constructor_t<68, 1160>; +}; + +#if CUB_IS_INT128_ENABLED +template <> +struct sm80_tuning<__int128_t, flagged::no, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> +{ + static constexpr int threads = 256; + static constexpr int items = 5; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_TRANSPOSE; + + using delay_constructor = detail::fixed_delay_constructor_t<400, 1090>; +}; + +template <> +struct sm80_tuning<__uint128_t, flagged::no, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> +{ + static constexpr int threads = 256; + static constexpr int items = 5; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_TRANSPOSE; + + using delay_constructor = detail::fixed_delay_constructor_t<400, 1090>; +}; +#endif + +// partition::flagged +template +struct sm80_tuning +{ + static constexpr int threads = 512; + static constexpr int items = 20; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + + using delay_constructor = detail::no_delay_constructor_t<595>; +}; + +template +struct sm80_tuning +{ + static constexpr int threads = 224; + static constexpr int items = 18; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_TRANSPOSE; + + using delay_constructor = detail::no_delay_constructor_t<1105>; +}; + +template +struct sm80_tuning +{ + static constexpr int threads = 192; + static constexpr int items = 12; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + + using delay_constructor = detail::fixed_delay_constructor_t<912, 1025>; +}; + +template +struct sm80_tuning +{ + static constexpr int threads = 192; + static constexpr int items = 12; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_TRANSPOSE; + + using delay_constructor = detail::fixed_delay_constructor_t<884, 1130>; +}; + +#if CUB_IS_INT128_ENABLED +template <> +struct sm80_tuning<__int128_t, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> +{ + static constexpr int threads = 256; + static constexpr int items = 5; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_TRANSPOSE; + + using delay_constructor = detail::fixed_delay_constructor_t<400, 1090>; +}; + +template <> +struct sm80_tuning<__uint128_t, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> +{ + static constexpr int threads = 256; + static constexpr int items = 5; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_TRANSPOSE; + + using delay_constructor = detail::fixed_delay_constructor_t<400, 1090>; +}; +#endif + } // namespace select template struct device_select_policy_hub { - struct Policy350 : ChainedPolicy<350, Policy350, Policy350> + struct DefaultTuning { static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = 10; @@ -403,7 +564,32 @@ struct device_select_policy_hub detail::fixed_delay_constructor_t<350, 450>>; }; - struct Policy900 : ChainedPolicy<900, Policy900, Policy350> + struct Policy350 + : DefaultTuning + , ChainedPolicy<350, Policy350, Policy350> + {}; + + struct Policy800 : ChainedPolicy<800, Policy800, Policy350> + { + using tuning = detail::select::sm80_tuning(), + select::are_rejects_kept(), + select::classify_offset_size()>; + + using SelectIfPolicyT = AgentSelectIfPolicy; + }; + + struct Policy860 + : DefaultTuning + , ChainedPolicy<860, Policy860, Policy800> + {}; + + struct Policy900 : ChainedPolicy<900, Policy900, Policy860> { using tuning = detail::select::sm90_tuning(), From 829e711f0ce04001b3740dbb2f459f301e77103b Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Sat, 29 Jul 2023 16:23:37 +0000 Subject: [PATCH 2/6] Tune select on SM80 --- .../dispatch/tuning/tuning_select_if.cuh | 138 ++++++++++++++++++ 1 file changed, 138 insertions(+) diff --git a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh index 6b5119c2725..7a6e14bde61 100644 --- a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh @@ -405,6 +405,144 @@ struct sm80_tuning using delay_constructor = detail::fixed_delay_constructor_t<350, 450>; }; +// select::if +template +struct sm80_tuning +{ + static constexpr int threads = 992; + static constexpr int items = 20; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + + using delay_constructor = detail::no_delay_constructor_t<395>; +}; + +template +struct sm80_tuning +{ + static constexpr int threads = 576; + static constexpr int items = 14; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + + using delay_constructor = detail::no_delay_constructor_t<870>; +}; + +template +struct sm80_tuning +{ + static constexpr int threads = 256; + static constexpr int items = 18; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_TRANSPOSE; + + using delay_constructor = detail::no_delay_constructor_t<1130>; +}; + +template +struct sm80_tuning +{ + static constexpr int threads = 192; + static constexpr int items = 10; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_TRANSPOSE; + + using delay_constructor = detail::fixed_delay_constructor_t<832, 1165>; +}; + +#if CUB_IS_INT128_ENABLED +template <> +struct sm80_tuning<__int128_t, flagged::no, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> +{ + static constexpr int threads = 384; + static constexpr int items = 4; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + + using delay_constructor = detail::no_delay_constructor_t<1140>; +}; + +template <> +struct sm80_tuning<__uint128_t, flagged::no, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> +{ + static constexpr int threads = 384; + static constexpr int items = 4; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + + using delay_constructor = detail::no_delay_constructor_t<1140>; +}; +#endif + +// select::flagged +template +struct sm80_tuning +{ + static constexpr int threads = 224; + static constexpr int items = 20; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + + using delay_constructor = detail::no_delay_constructor_t<735>; +}; + +template +struct sm80_tuning +{ + static constexpr int threads = 256; + static constexpr int items = 20; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_TRANSPOSE; + + using delay_constructor = detail::no_delay_constructor_t<1155>; +}; + +template +struct sm80_tuning +{ + static constexpr int threads = 320; + static constexpr int items = 10; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + + using delay_constructor = detail::fixed_delay_constructor_t<124, 1115>; +}; + +template +struct sm80_tuning +{ + static constexpr int threads = 384; + static constexpr int items = 6; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + + using delay_constructor = detail::no_delay_constructor_t<1130>; +}; + +#if CUB_IS_INT128_ENABLED +template <> +struct sm80_tuning<__int128_t, flagged::yes, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> +{ + static constexpr int threads = 256; + static constexpr int items = 5; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + + using delay_constructor = detail::fixed_delay_constructor_t<464, 1025>; +}; + +template <> +struct sm80_tuning<__uint128_t, flagged::yes, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> +{ + static constexpr int threads = 256; + static constexpr int items = 5; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + + using delay_constructor = detail::fixed_delay_constructor_t<464, 1025>; +}; +#endif + // partition::if template struct sm80_tuning From 0c428c27b1e46dca704153544a83b9b6e48f04fd Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Mon, 31 Jul 2023 07:35:12 +0000 Subject: [PATCH 3/6] Tune three-way partition on SM80 --- .../tuning/tuning_three_way_partition.cuh | 89 ++++++++++++++++++- 1 file changed, 86 insertions(+), 3 deletions(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh b/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh index a1b3c43dc9e..3b92b31e4c4 100644 --- a/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh @@ -205,13 +205,72 @@ struct sm90_tuning using delay_constructor = detail::no_delay_constructor_t<1050>; }; +template (), + offset_size OffsetSize = classify_offset_size()> +struct sm80_tuning +{ + static constexpr int threads = 256; + static constexpr int items = Nominal4BItemsToItems(9); + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + + using AccumPackHelperT = detail::three_way_partition::accumulator_pack_t; + using AccumPackT = typename AccumPackHelperT::pack_t; + using delay_constructor = detail::default_delay_constructor_t; +}; + +template +struct sm80_tuning +{ + static constexpr int threads = 256; + static constexpr int items = 12; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; + + using delay_constructor = detail::no_delay_constructor_t<910>; +}; + +template +struct sm80_tuning +{ + static constexpr int threads = 256; + static constexpr int items = 11; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; + + using delay_constructor = detail::no_delay_constructor_t<1120>; +}; + +template +struct sm80_tuning +{ + static constexpr int threads = 224; + static constexpr int items = 11; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; + + using delay_constructor = detail::fixed_delay_constructor_t<264, 1080>; +}; + +template +struct sm80_tuning +{ + static constexpr int threads = 128; + static constexpr int items = 10; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; + + using delay_constructor = detail::fixed_delay_constructor_t<672, 1120>; +}; + } // namespace three_way_partition template struct device_three_way_partition_policy_hub { - /// SM35 - struct Policy350 : ChainedPolicy<350, Policy350, Policy350> + struct DefaultTuning { constexpr static int ITEMS_PER_THREAD = Nominal4BItemsToItems(9); @@ -222,8 +281,32 @@ struct device_three_way_partition_policy_hub cub::BLOCK_SCAN_WARP_SCANS>; }; + /// SM35 + struct Policy350 + : DefaultTuning + , ChainedPolicy<350, Policy350, Policy350> + {}; + + struct Policy800 : ChainedPolicy<800, Policy800, Policy350> + { + using tuning = detail::three_way_partition::sm80_tuning; + + using ThreeWayPartitionPolicy = + AgentThreeWayPartitionPolicy; + }; + + struct Policy860 + : DefaultTuning + , ChainedPolicy<860, Policy860, Policy800> + {}; + /// SM90 - struct Policy900 : ChainedPolicy<900, Policy900, Policy350> + struct Policy900 : ChainedPolicy<900, Policy900, Policy860> { using tuning = detail::three_way_partition::sm90_tuning; From 143942ada1a4c2057075e982bb0e511515b837c8 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Mon, 31 Jul 2023 09:22:07 +0000 Subject: [PATCH 4/6] Fix SM90 three-way partition tuning --- .../dispatch/tuning/tuning_three_way_partition.cuh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh b/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh index 3b92b31e4c4..8c8fabe79e0 100644 --- a/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh @@ -134,7 +134,7 @@ struct sm90_tuning static constexpr int threads = 384; static constexpr int items = 7; - static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_TRANSPOSE; + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<464, 1165>; }; @@ -145,7 +145,7 @@ struct sm90_tuning static constexpr int threads = 128; static constexpr int items = 7; - static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_TRANSPOSE; + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1040>; }; @@ -167,7 +167,7 @@ struct sm90_tuning static constexpr int threads = 640; static constexpr int items = 24; - static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_TRANSPOSE; + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<245>; }; @@ -178,7 +178,7 @@ struct sm90_tuning static constexpr int threads = 256; static constexpr int items = 23; - static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_TRANSPOSE; + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<910>; }; @@ -189,7 +189,7 @@ struct sm90_tuning static constexpr int threads = 256; static constexpr int items = 18; - static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_TRANSPOSE; + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1145>; }; @@ -200,7 +200,7 @@ struct sm90_tuning static constexpr int threads = 256; static constexpr int items = 11; - static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_TRANSPOSE; + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1050>; }; From 20770b733fd80ceda2a5affd13d6960f0d650f74 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Mon, 31 Jul 2023 09:22:52 +0000 Subject: [PATCH 5/6] Fix SM90 select tuning --- cub/cub/device/dispatch/tuning/tuning_select_if.cuh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh index 7a6e14bde61..aa9c6eae406 100644 --- a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh @@ -125,7 +125,7 @@ struct sm90_tuning; }; @@ -136,7 +136,7 @@ struct sm90_tuning; }; @@ -147,7 +147,7 @@ struct sm90_tuning; }; @@ -285,7 +285,7 @@ struct sm90_tuning; }; @@ -297,7 +297,7 @@ struct sm90_tuning<__int128_t, flagged::no, keep_rejects::yes, offset_size::_4, static constexpr int threads = 192; static constexpr int items = 5; - static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_TRANSPOSE; + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<1616, 1115>; }; @@ -308,7 +308,7 @@ struct sm90_tuning<__uint128_t, flagged::no, keep_rejects::yes, offset_size::_4, static constexpr int threads = 192; static constexpr int items = 5; - static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_TRANSPOSE; + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<1616, 1115>; }; From 1df02a4bf13bb5b9f66c239ab2110470bb0e22e6 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Mon, 31 Jul 2023 09:23:32 +0000 Subject: [PATCH 6/6] Fix SM80 select tuning --- .../dispatch/tuning/tuning_select_if.cuh | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh index aa9c6eae406..27247ca70e5 100644 --- a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh @@ -33,7 +33,6 @@ #include #include #include -#include "cub/util_device.cuh" CUB_NAMESPACE_BEGIN @@ -434,7 +433,7 @@ struct sm80_tuning; }; @@ -445,7 +444,7 @@ struct sm80_tuning; }; @@ -492,7 +491,7 @@ struct sm80_tuning; }; @@ -561,7 +560,7 @@ struct sm80_tuning; }; @@ -583,7 +582,7 @@ struct sm80_tuning; }; @@ -595,7 +594,7 @@ struct sm80_tuning<__int128_t, flagged::no, keep_rejects::yes, offset_size::_4, static constexpr int threads = 256; static constexpr int items = 5; - static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_TRANSPOSE; + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<400, 1090>; }; @@ -606,7 +605,7 @@ struct sm80_tuning<__uint128_t, flagged::no, keep_rejects::yes, offset_size::_4, static constexpr int threads = 256; static constexpr int items = 5; - static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_TRANSPOSE; + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<400, 1090>; }; @@ -630,7 +629,7 @@ struct sm80_tuning; }; @@ -652,7 +651,7 @@ struct sm80_tuning; }; @@ -664,7 +663,7 @@ struct sm80_tuning<__int128_t, flagged::yes, keep_rejects::yes, offset_size::_4, static constexpr int threads = 256; static constexpr int items = 5; - static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_TRANSPOSE; + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<400, 1090>; }; @@ -675,7 +674,7 @@ struct sm80_tuning<__uint128_t, flagged::yes, keep_rejects::yes, offset_size::_4 static constexpr int threads = 256; static constexpr int items = 5; - static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_TRANSPOSE; + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<400, 1090>; };