diff --git a/benchmarks/bench/run_length_encode/encode.cu b/benchmarks/bench/run_length_encode/encode.cu index 286c1da8c..2dc94309b 100644 --- a/benchmarks/bench/run_length_encode/encode.cu +++ b/benchmarks/bench/run_length_encode/encode.cu @@ -91,6 +91,7 @@ static void rle(nvbench::state &state, nvbench::type_list) accum_t, device_reduce_by_key_policy_hub>; #else + using policy_t = cub::detail::device_run_length_encode_policy_hub; using dispatch_t = cub::DispatchReduceByKey) equality_op_t, reduction_op_t, offset_t, - accum_t>; + accum_t, + policy_t>; #endif const auto elements = static_cast(state.get_int64("Elements{io}")); diff --git a/cub/device/device_run_length_encode.cuh b/cub/device/device_run_length_encode.cuh index 813343d65..2a8a546bd 100644 --- a/cub/device/device_run_length_encode.cuh +++ b/cub/device/device_run_length_encode.cuh @@ -34,14 +34,16 @@ #pragma once -#include -#include - #include #include #include +#include #include +#include + +#include + CUB_NAMESPACE_BEGIN @@ -204,36 +206,44 @@ struct DeviceRunLengthEncode int num_items, cudaStream_t stream = 0) { - using OffsetT = int; // Signed integer type for global offsets - using EqualityOp = Equality; // Default == operator - using ReductionOp = cub::Sum; // Value reduction operator + using offset_t = int; // Signed integer type for global offsets + using equality_op = Equality; // Default == operator + using reduction_op = cub::Sum; // Value reduction operator // The lengths output value type - using LengthT = - cub::detail::non_void_value_t; + using length_t = + cub::detail::non_void_value_t; // Generator type for providing 1s values for run-length reduction - using LengthsInputIteratorT = ConstantInputIterator; + using lengths_input_iterator_t = ConstantInputIterator; + + using accum_t = detail::accumulator_t; + + using key_t = + cub::detail::non_void_value_t>; + + using policy_t = detail::device_run_length_encode_policy_hub; return DispatchReduceByKey::Dispatch(d_temp_storage, - temp_storage_bytes, - d_in, - d_unique_out, - LengthsInputIteratorT( - (LengthT)1), - d_counts_out, - d_num_runs_out, - EqualityOp(), - ReductionOp(), - num_items, - stream); + equality_op, + reduction_op, + offset_t, + accum_t, + policy_t>::Dispatch(d_temp_storage, + temp_storage_bytes, + d_in, + d_unique_out, + lengths_input_iterator_t((length_t)1), + d_counts_out, + d_num_runs_out, + equality_op(), + reduction_op(), + num_items, + stream); } template #include #include +#include #include #include #include @@ -150,38 +151,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREA * Dispatch ******************************************************************************/ -namespace detail -{ - -template -struct device_rle_policy_hub -{ - /// SM35 - struct Policy350 : ChainedPolicy<350, Policy350, Policy350> - { - enum - { - NOMINAL_4B_ITEMS_PER_THREAD = 15, - - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, - CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - using RleSweepPolicyT = - AgentRlePolicy<96, - ITEMS_PER_THREAD, - BLOCK_LOAD_DIRECT, - LOAD_LDG, - true, - BLOCK_SCAN_WARP_SCANS, - detail::default_reduce_by_key_delay_constructor_t>; - }; - - using MaxPolicy = Policy350; -}; - -} // namespace detail - /** * Utility class for dispatching the appropriately-tuned kernels for DeviceRle * @@ -213,8 +182,9 @@ template >> + typename SelectedPolicy = detail::device_non_trivial_runs_policy_hub< + cub::detail::non_void_value_t, + cub::detail::value_t>> struct DeviceRleDispatch { /****************************************************************************** diff --git a/cub/device/dispatch/tuning/tuning_run_length_encode.cuh b/cub/device/dispatch/tuning/tuning_run_length_encode.cuh new file mode 100644 index 000000000..ba3780f2a --- /dev/null +++ b/cub/device/dispatch/tuning/tuning_run_length_encode.cuh @@ -0,0 +1,392 @@ +/****************************************************************************** + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +CUB_NAMESPACE_BEGIN + +namespace detail +{ + +namespace rle +{ + +enum class primitive_key { no, yes }; +enum class primitive_length { no, yes }; +enum class key_size { _1, _2, _4, _8, _16, unknown }; +enum class length_size { _4, unknown }; + +template +constexpr primitive_key is_primitive_key() +{ + return Traits::PRIMITIVE ? primitive_key::yes : primitive_key::no; +} + +template +constexpr primitive_length is_primitive_length() +{ + return Traits::PRIMITIVE ? primitive_length::yes : primitive_length::no; +} + +template +constexpr key_size classify_key_size() +{ + return sizeof(KeyT) == 1 ? key_size::_1 + : sizeof(KeyT) == 2 ? key_size::_2 + : sizeof(KeyT) == 4 ? key_size::_4 + : sizeof(KeyT) == 8 ? key_size::_8 + : sizeof(KeyT) == 16 ? key_size::_16 + : key_size::unknown; +} + +template +constexpr length_size classify_length_size() +{ + return sizeof(LengthT) == 4 ? length_size::_4 : length_size::unknown; +} + +namespace encode { + +template (), + primitive_key PrimitiveKey = is_primitive_key(), + length_size LengthSize = classify_length_size(), + key_size KeySize = classify_key_size()> +struct sm90_tuning +{ + static constexpr int max_input_bytes = CUB_MAX(sizeof(KeyT), sizeof(LengthT)); + static constexpr int combined_input_bytes = sizeof(KeyT) + sizeof(LengthT); + + static constexpr int threads = 128; + + static constexpr int nominal_4b_items_per_thread = 6; + + static constexpr int items = + (max_input_bytes <= 8) + ? 6 + : CUB_MIN(nominal_4b_items_per_thread, + CUB_MAX(1, + ((nominal_4b_items_per_thread * 8) + combined_input_bytes - 1) / + combined_input_bytes)); + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + + using delay_constructor = detail::default_reduce_by_key_delay_constructor_t; +}; + +template +struct sm90_tuning +{ + static constexpr int threads = 256; + + static constexpr int items = 13; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + + using delay_constructor = detail::no_delay_constructor_t<620>; +}; + +template +struct sm90_tuning +{ + static constexpr int threads = 128; + + static constexpr int items = 22; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + + using delay_constructor = detail::no_delay_constructor_t<775>; +}; + +template +struct sm90_tuning +{ + static constexpr int threads = 192; + + static constexpr int items = 14; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; + + using delay_constructor = detail::fixed_delay_constructor_t<284, 480>; +}; + +template +struct sm90_tuning +{ + static constexpr int threads = 128; + + static constexpr int items = 19; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; + + using delay_constructor = detail::no_delay_constructor_t<515>; +}; + +#if CUB_IS_INT128_ENABLED +template +struct sm90_tuning +{ + static constexpr int threads = 128; + + static constexpr int items = 11; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; + + using delay_constructor = detail::fixed_delay_constructor_t<428, 930>; +}; + +template +struct sm90_tuning +{ + static constexpr int threads = 128; + + static constexpr int items = 11; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; + + using delay_constructor = detail::fixed_delay_constructor_t<428, 930>; +}; +#endif + +} // namespace encode + +namespace non_trivial_runs +{ + +template (), + primitive_key PrimitiveKey = is_primitive_key(), + length_size LengthSize = classify_length_size(), + key_size KeySize = classify_key_size()> +struct sm90_tuning +{ + static constexpr int threads = 96; + + static constexpr int nominal_4b_items_per_thread = 15; + + static constexpr int items = CUB_MIN(nominal_4b_items_per_thread, + CUB_MAX(1, (nominal_4b_items_per_thread * 4 / sizeof(KeyT)))); + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; + + static constexpr bool store_with_time_slicing = true; + + using delay_constructor = detail::default_reduce_by_key_delay_constructor_t; +}; + +template +struct sm90_tuning +{ + static constexpr int threads = 256; + + static constexpr int items = 18; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + + static constexpr bool store_with_time_slicing = false; + + using delay_constructor = detail::no_delay_constructor_t<385>; +}; + +template +struct sm90_tuning +{ + static constexpr int threads = 224; + + static constexpr int items = 20; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + + static constexpr bool store_with_time_slicing = false; + + using delay_constructor = detail::no_delay_constructor_t<675>; +}; + +template +struct sm90_tuning +{ + static constexpr int threads = 256; + + static constexpr int items = 18; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + + static constexpr bool store_with_time_slicing = false; + + using delay_constructor = detail::no_delay_constructor_t<695>; +}; + +template +struct sm90_tuning +{ + static constexpr int threads = 224; + + static constexpr int items = 14; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; + + static constexpr bool store_with_time_slicing = false; + + using delay_constructor = detail::no_delay_constructor_t<840>; +}; + +#if CUB_IS_INT128_ENABLED +template +struct sm90_tuning +{ + static constexpr int threads = 288; + + static constexpr int items = 9; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; + + static constexpr bool store_with_time_slicing = false; + + using delay_constructor = detail::fixed_delay_constructor_t<484, 1150>; +}; + +template +struct sm90_tuning +{ + static constexpr int threads = 288; + + static constexpr int items = 9; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; + + static constexpr bool store_with_time_slicing = false; + + using delay_constructor = detail::fixed_delay_constructor_t<484, 1150>; +}; +#endif + +} // namespace non_trivial_runs + + +} // namespace rle + +template +struct device_run_length_encode_policy_hub +{ + static constexpr int MAX_INPUT_BYTES = CUB_MAX(sizeof(KeyT), sizeof(LengthT)); + static constexpr int COMBINED_INPUT_BYTES = sizeof(KeyT) + sizeof(LengthT); + + /// SM35 + struct Policy350 : ChainedPolicy<350, Policy350, Policy350> + { + static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = 6; + static constexpr int ITEMS_PER_THREAD = + (MAX_INPUT_BYTES <= 8) + ? 6 + : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, + CUB_MAX(1, + ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / + COMBINED_INPUT_BYTES)); + + using ReduceByKeyPolicyT = + AgentReduceByKeyPolicy<128, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS, + detail::default_reduce_by_key_delay_constructor_t>; + }; + + /// SM90 + struct Policy900 : ChainedPolicy<900, Policy900, Policy350> + { + using tuning = detail::rle::encode::sm90_tuning; + + using ReduceByKeyPolicyT = + AgentReduceByKeyPolicy; + }; + + using MaxPolicy = Policy900; +}; + +template +struct device_non_trivial_runs_policy_hub +{ + /// SM35 + struct Policy350 : ChainedPolicy<350, Policy350, Policy350> + { + enum + { + NOMINAL_4B_ITEMS_PER_THREAD = 15, + + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, + CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(KeyT)))), + }; + + using RleSweepPolicyT = + AgentRlePolicy<96, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + true, + BLOCK_SCAN_WARP_SCANS, + detail::default_reduce_by_key_delay_constructor_t>; + }; + + // SM90 + struct Policy900 : ChainedPolicy<900, Policy900, Policy350> + { + using tuning = detail::rle::non_trivial_runs::sm90_tuning; + + using RleSweepPolicyT = + AgentRlePolicy; + }; + + using MaxPolicy = Policy900; +}; + +} // namespace detail + + +CUB_NAMESPACE_END