From c4ed14d81ff31bf4d33a3fcfbe46234572676892 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Sun, 7 May 2023 00:01:13 +0400 Subject: [PATCH] Add scan by key --- benchmarks/README.md | 10 +- benchmarks/bench/scan/exclusive/base.cuh | 6 +- benchmarks/bench/scan/exclusive/by_key.cu | 129 ++++++++++++++++++++++ 3 files changed, 136 insertions(+), 9 deletions(-) create mode 100644 benchmarks/bench/scan/exclusive/by_key.cu diff --git a/benchmarks/README.md b/benchmarks/README.md index d6555e6ad..d3e514b99 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -20,20 +20,20 @@ - partition - flagged - if +- scan + - by key ### TODO - adjacent difference - left - right -- histogram +- histogram : needs policy - even - range -- rle +- rle : needs policy - encode - non trivial runs -- reduce - - by key -- scan +- reduce : needs policy - by key - segmented diff --git a/benchmarks/bench/scan/exclusive/base.cuh b/benchmarks/bench/scan/exclusive/base.cuh index 0f2313eae..a18b733c3 100644 --- a/benchmarks/bench/scan/exclusive/base.cuh +++ b/benchmarks/bench/scan/exclusive/base.cuh @@ -27,7 +27,6 @@ constexpr std::size_t max_temp_storage_size() using input_it_t = const T *; using output_it_t = T *; using offset_t = OffsetT; - using output_t = T; using init_t = cub::detail::InputValue; using policy_t = typename policy_hub_t::policy_t; using real_init_t = typename init_t::value_type; @@ -61,12 +60,11 @@ static void basic(std::integral_constant, nvbench::state &state, nvbench::type_list) { - using accum_t = T; + using init_t = T; + using accum_t = cub::detail::accumulator_t; using input_it_t = const T *; using output_it_t = T *; using offset_t = OffsetT; - using output_t = T; - using init_t = cub::detail::InputValue; #if !TUNE_BASE using policy_t = policy_hub_t; diff --git a/benchmarks/bench/scan/exclusive/by_key.cu b/benchmarks/bench/scan/exclusive/by_key.cu new file mode 100644 index 000000000..0b0e39331 --- /dev/null +++ b/benchmarks/bench/scan/exclusive/by_key.cu @@ -0,0 +1,129 @@ +#include + +// %RANGE% TUNE_ITEMS ipt 7:24:1 +// %RANGE% TUNE_THREADS tpb 128:1024:32 +// %RANGE% CUB_DETAIL_L2_BACKOFF_NS l2b 0:1200:5 +// %RANGE% CUB_DETAIL_L2_WRITE_LATENCY_NS l2w 0:1200:5 + +#include + +#include + +#if !TUNE_BASE +struct policy_hub_t +{ + struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> + { + using ScanByKeyPolicyT = AgentScanByKeyPolicy; + }; + + using MaxPolicy = policy_t; +}; +#endif + +template +static void scan(nvbench::state &state, nvbench::type_list) +{ + using init_value_t = ValueT; + using op_t = cub::Sum; + using accum_t = cub::detail::accumulator_t; + using key_input_it_t = const KeyT *; + using val_input_it_t = const ValueT *; + using val_output_it_t = ValueT *; + using equality_op_t = cub::Equality; + using offset_t = OffsetT; + +#if !TUNE_BASE + using policy_t = policy_hub_t; + using dispatch_t = cub::DispatchScanByKey; +#else + using dispatch_t = cub::DispatchScanByKey; +#endif + + const auto elements = static_cast(state.get_int64("Elements{io}")); + + thrust::device_vector keys(elements); + thrust::device_vector in_vals(elements); + thrust::device_vector out_vals(elements); + + const bit_entropy entropy = str_to_entropy(state.get_string("Entropy")); + gen(seed_t{}, keys, entropy); + + KeyT *d_keys = thrust::raw_pointer_cast(keys.data()); + ValueT *d_in_vals = thrust::raw_pointer_cast(in_vals.data()); + ValueT *d_out_vals = thrust::raw_pointer_cast(out_vals.data()); + + state.add_element_count(elements); + state.add_global_memory_reads(elements); + state.add_global_memory_reads(elements); + state.add_global_memory_writes(elements); + + size_t tmp_size; + dispatch_t::Dispatch(nullptr, + tmp_size, + d_keys, + d_in_vals, + d_out_vals, + equality_op_t{}, + op_t{}, + init_value_t{}, + static_cast(elements), + 0 /* stream */); + + thrust::device_vector tmp(tmp_size); + nvbench::uint8_t *d_tmp = thrust::raw_pointer_cast(tmp.data()); + + state.exec([&](nvbench::launch &launch) { + dispatch_t::Dispatch(nullptr, + tmp_size, + d_keys, + d_in_vals, + d_out_vals, + equality_op_t{}, + op_t{}, + init_value_t{}, + static_cast(elements), + launch.get_stream()); + }); +} + +using some_offset_types = nvbench::type_list; + +#ifdef TUNE_KeyT +using key_types = nvbench::type_list; +#else +using key_types = all_types; +#endif + +#ifdef TUNE_ValueT +using value_types = nvbench::type_list; +#else +using value_types = nvbench::type_list; +#endif + +NVBENCH_BENCH_TYPES(scan, NVBENCH_TYPE_AXES(key_types, value_types, some_offset_types)) + .set_name("cub::DeviceScan::ExclusiveSumByKey") + .set_type_axes_names({"KeyT{ct}", "ValueT{ct}", "OffsetT{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) + .add_string_axis("Entropy", {"1.000", "0.544", "0.000"});