From 6aab5c908ea5016f0dbde2351c016fa70d4b70de Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Sat, 6 May 2023 18:45:21 +0400 Subject: [PATCH] Add partition --- benchmarks/README.md | 9 +- benchmarks/bench/partition/flagged.cu | 141 ++++++++++++++++++++++ benchmarks/bench/partition/if.cu | 165 ++++++++++++++++++++++++++ 3 files changed, 309 insertions(+), 6 deletions(-) create mode 100644 benchmarks/bench/partition/flagged.cu create mode 100644 benchmarks/bench/partition/if.cu diff --git a/benchmarks/README.md b/benchmarks/README.md index 257c666db..d6555e6ad 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -14,12 +14,12 @@ - scan - sum - max - -### In Progress - - select - flagged - if +- partition + - flagged + - if ### TODO @@ -29,9 +29,6 @@ - histogram - even - range -- partition - - flagged - - if - rle - encode - non trivial runs diff --git a/benchmarks/bench/partition/flagged.cu b/benchmarks/bench/partition/flagged.cu new file mode 100644 index 000000000..6f59df8a9 --- /dev/null +++ b/benchmarks/bench/partition/flagged.cu @@ -0,0 +1,141 @@ +#include +#include + +// %RANGE% TUNE_TRANSPOSE trp 0:1:1 +// %RANGE% TUNE_LOAD ld 0:1:1 +// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 +// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 +// %RANGE% CUB_DETAIL_L2_BACKOFF_NS l2b 0:1200:5 +// %RANGE% CUB_DETAIL_L2_WRITE_LATENCY_NS l2w 0:1200:5 + +constexpr bool keep_rejects = true; +constexpr bool may_alias = false; + +#if !TUNE_BASE +#if TUNE_TRANSPOSE == 0 +#define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT +#else +#define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE +#endif + +#if TUNE_LOAD == 0 +#define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT +#else +#define TUNE_LOAD_MODIFIER cub::LOAD_CA +#endif + +template +struct policy_hub_t +{ + struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> + { + static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD; + + static constexpr int ITEMS_PER_THREAD = + CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, + CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); + + using SelectIfPolicyT = cub::AgentSelectIfPolicy; + }; + + using MaxPolicy = policy_t; +}; +#endif + +template +void partition(nvbench::state &state, nvbench::type_list) +{ + using input_it_t = const T*; + using flag_it_t = const bool*; + using output_it_t = T*; + using num_selected_it_t = OffsetT*; + using select_op_t = cub::NullType; + using equality_op_t = cub::NullType; + using offset_t = OffsetT; + +#if !TUNE_BASE + using policy_t = policy_hub_t; + using dispatch_t = cub::DispatchSelectIf; +#else + using dispatch_t = cub::DispatchSelectIf; +#endif + + // Retrieve axis parameters + const auto elements = static_cast(state.get_int64("Elements{io}")); + const bit_entropy entropy = str_to_entropy(state.get_string("Entropy")); + + thrust::device_vector in(elements); + thrust::device_vector flags(elements); + thrust::device_vector num_selected(1); + + gen(seed_t{}, in); + gen(seed_t{1}, flags, entropy); + + thrust::device_vector out(elements); + + input_it_t d_in = thrust::raw_pointer_cast(in.data()); + flag_it_t d_flags = thrust::raw_pointer_cast(flags.data()); + output_it_t d_out = thrust::raw_pointer_cast(out.data()); + num_selected_it_t d_num_selected = thrust::raw_pointer_cast(num_selected.data()); + + state.add_element_count(elements); + state.add_global_memory_reads(elements); + state.add_global_memory_reads(elements); + state.add_global_memory_writes(elements); + state.add_global_memory_writes(1); + + std::size_t temp_size{}; + dispatch_t::Dispatch(nullptr, + temp_size, + d_in, + d_flags, + d_out, + d_num_selected, + select_op_t{}, + equality_op_t{}, + elements, + 0); + + thrust::device_vector temp(temp_size); + auto *temp_storage = thrust::raw_pointer_cast(temp.data()); + + state.exec([&](nvbench::launch &launch) { + dispatch_t::Dispatch(temp_storage, + temp_size, + d_in, + d_flags, + d_out, + d_num_selected, + select_op_t{}, + equality_op_t{}, + elements, + launch.get_stream()); + }); +} + +NVBENCH_BENCH_TYPES(partition, NVBENCH_TYPE_AXES(fundamental_types, offset_types)) + .set_name("cub::DevicePartition::Flagged") + .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) + .add_string_axis("Entropy", {"1.000", "0.811", "0.544", "0.337", "0.201", "0.000"}); diff --git a/benchmarks/bench/partition/if.cu b/benchmarks/bench/partition/if.cu new file mode 100644 index 000000000..8a2b9fae7 --- /dev/null +++ b/benchmarks/bench/partition/if.cu @@ -0,0 +1,165 @@ +#include +#include + +// %RANGE% TUNE_TRANSPOSE trp 0:1:1 +// %RANGE% TUNE_LOAD ld 0:1:1 +// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 +// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 +// %RANGE% CUB_DETAIL_L2_BACKOFF_NS l2b 0:1200:5 +// %RANGE% CUB_DETAIL_L2_WRITE_LATENCY_NS l2w 0:1200:5 + +constexpr bool keep_rejects = true; +constexpr bool may_alias = false; + +#if !TUNE_BASE +#if TUNE_TRANSPOSE == 0 +#define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT +#else +#define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE +#endif + +#if TUNE_LOAD == 0 +#define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT +#else +#define TUNE_LOAD_MODIFIER cub::LOAD_CA +#endif + +template +struct policy_hub_t +{ + struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> + { + static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD; + + static constexpr int ITEMS_PER_THREAD = + CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, + CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); + + using SelectIfPolicyT = cub::AgentSelectIfPolicy; + }; + + using MaxPolicy = policy_t; +}; +#endif + +template +struct less_then_t +{ + T m_val; + + __device__ bool operator()(const T &val) const { return val < m_val; } +}; + +template +T value_from_entropy(double percentage) +{ + if (percentage == 1) + { + return std::numeric_limits::max(); + } + + const auto max_val = static_cast(std::numeric_limits::max()); + const auto min_val = static_cast(std::numeric_limits::lowest()); + const auto result = min_val + percentage * max_val - percentage * min_val; + return static_cast(result); +} + +template +void partition(nvbench::state &state, nvbench::type_list) +{ + using input_it_t = const T*; + using flag_it_t = cub::NullType*; + using output_it_t = T*; + using num_selected_it_t = OffsetT*; + using select_op_t = less_then_t; + using equality_op_t = cub::NullType; + using offset_t = OffsetT; + +#if !TUNE_BASE + using policy_t = policy_hub_t; + using dispatch_t = cub::DispatchSelectIf; +#else + using dispatch_t = cub::DispatchSelectIf; +#endif + + // Retrieve axis parameters + const auto elements = static_cast(state.get_int64("Elements{io}")); + const bit_entropy entropy = str_to_entropy(state.get_string("Entropy")); + + T min_val = std::numeric_limits::lowest(); + T max_val = std::numeric_limits::max(); + T val = value_from_entropy(entropy_to_probability(entropy)); + select_op_t select_op{val}; + + thrust::device_vector in(elements); + thrust::device_vector num_selected(1); + + gen(seed_t{}, in); + + thrust::device_vector out(elements); + + input_it_t d_in = thrust::raw_pointer_cast(in.data()); + flag_it_t d_flags = nullptr; + output_it_t d_out = thrust::raw_pointer_cast(out.data()); + num_selected_it_t d_num_selected = thrust::raw_pointer_cast(num_selected.data()); + + state.add_element_count(elements); + state.add_global_memory_reads(elements); + state.add_global_memory_writes(elements); + state.add_global_memory_writes(1); + + std::size_t temp_size{}; + dispatch_t::Dispatch(nullptr, + temp_size, + d_in, + d_flags, + d_out, + d_num_selected, + select_op, + equality_op_t{}, + elements, + 0); + + thrust::device_vector temp(temp_size); + auto *temp_storage = thrust::raw_pointer_cast(temp.data()); + + state.exec([&](nvbench::launch &launch) { + dispatch_t::Dispatch(temp_storage, + temp_size, + d_in, + d_flags, + d_out, + d_num_selected, + select_op, + equality_op_t{}, + elements, + launch.get_stream()); + }); +} + +NVBENCH_BENCH_TYPES(partition, NVBENCH_TYPE_AXES(fundamental_types, offset_types)) + .set_name("cub::DevicePartition::If") + .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) + .add_string_axis("Entropy", {"1.000", "0.811", "0.544", "0.337", "0.201", "0.000"});