From f2cd25edd7821ab3affcfd65db76a258e47775ac Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko
Date: Mon, 12 Jun 2023 10:56:26 +0400
Subject: [PATCH] Tune 8-byte types

---
Review notes (free text between the "---" line and the first "diff --git" line
is not part of the change and is skipped when the patch is applied):

* What the change adds
    - detail::select::sm90_tuning: a struct template exposing `threads`,
      `items`, `load_algorithm` and a `delay_constructor` type.
      Primary template: threads = 128,
      items = min(10, max(1, 10 * 4 / InputSize)), BLOCK_LOAD_DIRECT,
      fixed_delay_constructor_t<350, 450>.
      Specialization: threads = 384, items = 22, BLOCK_LOAD_DIRECT,
      no_delay_constructor_t<1120>.
    - device_select_policy_hub::Policy900, chained after Policy350, whose
      SelectIfPolicyT is assembled from the sm90_tuning members.

* Restoration of this copy
  This copy had lost its line breaks, its indentation and every
  angle-bracket list that begins with an identifier. Line breaks were put
  back so that the hunks keep their recorded sizes (+167,40 and +220,18,
  46 insertions in total). The following pieces are reconstructed and are
  presumed rather than recovered -- TODO confirm against the original commit:
    - the primary template header up to "::PRIMITIVE" (the parameter name
      `Primitive` and the `Traits<InputT>` spelling are presumed);
    - the specialization arguments `<InputT, true, 8>` (presumed from the
      subject line "Tune 8-byte types");
    - the context line `template <class InputT, bool MayAlias>`; if it does
      not match the target file byte for byte the first hunk will not apply;
    - `sm90_tuning<InputT>` inside Policy900 and the six AgentSelectIfPolicy
      arguments (the `tuning::` member names come from the struct above;
      LOAD_DEFAULT and BLOCK_SCAN_WARP_SCANS are presumed);
    - all indentation (two-space steps presumed).
  The sender e-mail address on the "From:" line was also lost and has not
  been restored.

* NOTE(review): the trailing context still reads
  `using MaxPolicy = Policy350;` and this patch contains no deletions, so the
  patch itself leaves MaxPolicy untouched. Confirm whether Policy900 is meant
  to be selected through MaxPolicy.

 cub/device/dispatch/dispatch_select_if.cuh | 46 ++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/cub/device/dispatch/dispatch_select_if.cuh b/cub/device/dispatch/dispatch_select_if.cuh
index c64885dc5..2367e4848 100644
--- a/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/device/dispatch/dispatch_select_if.cuh
@@ -167,6 +167,40 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SelectIfPolicyT::BLOCK_THREA
 namespace detail
 {
 
+namespace select
+{
+
+template <class InputT,
+          bool Primitive = Traits<InputT>::PRIMITIVE,
+          std::size_t InputSize = sizeof(InputT)>
+struct sm90_tuning
+{
+  static constexpr int threads = 128;
+
+  static constexpr int nominal_4b_items_per_thread = 10;
+
+  static constexpr int items = CUB_MIN(nominal_4b_items_per_thread,
+                                       CUB_MAX(1, (nominal_4b_items_per_thread * 4 / InputSize)));
+
+  static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
+
+  using delay_constructor = detail::fixed_delay_constructor_t<350, 450>;
+};
+
+template <class InputT>
+struct sm90_tuning<InputT, true, 8>
+{
+  static constexpr int threads = 384;
+
+  static constexpr int items = 22;
+
+  static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
+
+  using delay_constructor = detail::no_delay_constructor_t<1120>;
+};
+
+}
+
 template <class InputT, bool MayAlias>
 struct device_select_policy_hub
 {
@@ -186,6 +220,18 @@ struct device_select_policy_hub
                                                 detail::fixed_delay_constructor_t<350, 450>>;
   };
 
+  struct Policy900 : ChainedPolicy<900, Policy900, Policy350>
+  {
+    using tuning = detail::select::sm90_tuning<InputT>;
+
+    using SelectIfPolicyT = AgentSelectIfPolicy<tuning::threads,
+                                                tuning::items,
+                                                tuning::load_algorithm,
+                                                LOAD_DEFAULT,
+                                                BLOCK_SCAN_WARP_SCANS,
+                                                typename tuning::delay_constructor>;
+  };
+
   using MaxPolicy = Policy350;
 };
 