From 8a6b822726735288abb97f8de0f54f409f1db0eb Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Tue, 6 Jun 2023 09:27:43 +0400 Subject: [PATCH] Introduce SM90 tuning policy into scan --- benchmarks/scripts/analyze.py | 2 +- cub/device/dispatch/dispatch_scan.cuh | 65 +++++++++++++++++++++++++-- cub/thread/thread_operators.cuh | 27 +++++++++++ 3 files changed, 90 insertions(+), 4 deletions(-) diff --git a/benchmarks/scripts/analyze.py b/benchmarks/scripts/analyze.py index ee237b5338..50f28dd8b2 100755 --- a/benchmarks/scripts/analyze.py +++ b/benchmarks/scripts/analyze.py @@ -221,7 +221,7 @@ def iterate_case_dfs(args, callable): for gpu in ctk_cub_df['gpu'].unique(): target_df = ctk_cub_df[ctk_cub_df['gpu'] == gpu] - target_df.drop(columns=['ctk', 'cub', 'gpu'], inplace=True) + target_df = target_df.drop(columns=['ctk', 'cub', 'gpu']) target_df = compute_speedup(target_df) for ct_point in ct_space(target_df): diff --git a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh index 0ba4f5b2c8..1529e21bbb 100644 --- a/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/device/dispatch/dispatch_scan.cuh @@ -199,7 +199,51 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS)) * Policy ******************************************************************************/ -template ///< Data type +namespace detail +{ +namespace scan +{ + +template +struct tuning +{ + static constexpr int threads = Threads; + static constexpr int items = Items; + + using delay_constructor = detail::fixed_delay_constructor_t; +}; + +template ::PRIMITIVE, + std::size_t AccumSize = sizeof(AccumT)> +struct sm90_tuning +{ + static constexpr int threads = 128; + static constexpr int items = 15; + + using delay_constructor = detail::default_delay_constructor_t; +}; + +// clang-format off +template struct sm90_tuning : tuning<192, 22, 168, 1140> {}; +template struct sm90_tuning : tuning<512, 12, 376, 1125> {}; +template struct sm90_tuning : tuning<128, 24, 648, 1245> {}; +template struct sm90_tuning : tuning<224, 24, 632, 1290> {}; + +template <> struct sm90_tuning : tuning<128, 24, 688, 1140> {}; +template <> struct sm90_tuning : tuning<224, 24, 576, 1215> {}; + +#if CUB_IS_INT128_ENABLED +template <> struct sm90_tuning< __int128_t, true, false, sizeof(__int128_t)> : tuning<576, 21, 860, 630> {}; +template <> struct sm90_tuning<__uint128_t, true, false, sizeof(__uint128_t)> : tuning<576, 21, 860, 630> {}; +#endif +// clang-format on + +} // namespace scan +} // namespace detail + +template struct DeviceScanPolicy { // For large values, use timesliced loads/stores to fit shared memory. @@ -271,7 +315,22 @@ struct DeviceScanPolicy detail::default_delay_constructor_t>; }; - using MaxPolicy = Policy600; + /// SM900 + struct Policy900 : ChainedPolicy<900, Policy900, Policy600> + { + using tuning = detail::scan::sm90_tuning::value>; + + using ScanPolicyT = policy_t; + }; + + using MaxPolicy = Policy900; }; /****************************************************************************** @@ -312,7 +371,7 @@ template , typename InitValueT::value_type>, cub::detail::value_t>, - typename SelectedPolicy = DeviceScanPolicy> + typename SelectedPolicy = DeviceScanPolicy> struct DispatchScan : SelectedPolicy { //--------------------------------------------------------------------- diff --git a/cub/thread/thread_operators.cuh b/cub/thread/thread_operators.cuh index 60798dba8d..c6b097ea35 100644 --- a/cub/thread/thread_operators.cuh +++ b/cub/thread/thread_operators.cuh @@ -215,6 +215,33 @@ struct ArgMin } }; +namespace detail +{ +template +struct basic_binary_op_t +{ + static constexpr bool value = false; +}; + +template <> +struct basic_binary_op_t +{ + static constexpr bool value = true; +}; + +template <> +struct basic_binary_op_t +{ + static constexpr bool value = true; +}; + +template <> +struct basic_binary_op_t +{ + static constexpr bool value = true; +}; +} // namespace detail + /// @brief Default cast functor template struct CastOp