Skip to content

Commit

Permalink
Add scan by key
Browse files Browse the repository at this point in the history
  • Loading branch information
gevtushenko committed May 6, 2023
1 parent 6aab5c9 commit c4ed14d
Show file tree
Hide file tree
Showing 3 changed files with 136 additions and 9 deletions.
10 changes: 5 additions & 5 deletions benchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,20 @@
- partition
- flagged
- if
- scan
- by key

### TODO

- adjacent difference
- left
- right
- histogram
- histogram : needs policy
- even
- range
- rle
- rle : needs policy
- encode
- non trivial runs
- reduce
- by key
- scan
- reduce : needs policy
- by key
- segmented
6 changes: 2 additions & 4 deletions benchmarks/bench/scan/exclusive/base.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ constexpr std::size_t max_temp_storage_size()
using input_it_t = const T *;
using output_it_t = T *;
using offset_t = OffsetT;
using output_t = T;
using init_t = cub::detail::InputValue<T>;
using policy_t = typename policy_hub_t<accum_t>::policy_t;
using real_init_t = typename init_t::value_type;
Expand Down Expand Up @@ -61,12 +60,11 @@ static void basic(std::integral_constant<bool, true>,
nvbench::state &state,
nvbench::type_list<T, OffsetT>)
{
using accum_t = T;
using init_t = T;
using accum_t = cub::detail::accumulator_t<op_t, init_t, T>;
using input_it_t = const T *;
using output_it_t = T *;
using offset_t = OffsetT;
using output_t = T;
using init_t = cub::detail::InputValue<T>;

#if !TUNE_BASE
using policy_t = policy_hub_t<accum_t>;
Expand Down
129 changes: 129 additions & 0 deletions benchmarks/bench/scan/exclusive/by_key.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
#include <nvbench_helper.cuh>

// %RANGE% TUNE_ITEMS ipt 7:24:1
// %RANGE% TUNE_THREADS tpb 128:1024:32
// %RANGE% CUB_DETAIL_L2_BACKOFF_NS l2b 0:1200:5
// %RANGE% CUB_DETAIL_L2_WRITE_LATENCY_NS l2w 0:1200:5

#include <cub/device/device_scan.cuh>

#include <type_traits>

#if !TUNE_BASE
struct policy_hub_t
{
struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
{
using ScanByKeyPolicyT = AgentScanByKeyPolicy<TUNE_THREADS,
TUNE_ITEMS,
// TODO Tune
cub::BLOCK_LOAD_WARP_TRANSPOSE,
cub::LOAD_CA,
cub::BLOCK_SCAN_WARP_SCANS,
cub::BLOCK_STORE_WARP_TRANSPOSE>;
};

using MaxPolicy = policy_t;
};
#endif

template <typename KeyT, typename ValueT, typename OffsetT>
static void scan(nvbench::state &state, nvbench::type_list<KeyT, ValueT, OffsetT>)
{
using init_value_t = ValueT;
using op_t = cub::Sum;
using accum_t = cub::detail::accumulator_t<op_t, init_value_t, ValueT>;
using key_input_it_t = const KeyT *;
using val_input_it_t = const ValueT *;
using val_output_it_t = ValueT *;
using equality_op_t = cub::Equality;
using offset_t = OffsetT;

#if !TUNE_BASE
using policy_t = policy_hub_t;
using dispatch_t = cub::DispatchScanByKey<key_input_it_t,
val_input_it_t,
val_output_it_t,
equality_op_t,
op_t,
init_value_t,
offset_t,
accum_t,
policy_t>;
#else
using dispatch_t = cub::DispatchScanByKey<key_input_it_t,
val_input_it_t,
val_output_it_t,
equality_op_t,
op_t,
init_value_t,
offset_t,
accum_t>;
#endif

const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));

thrust::device_vector<KeyT> keys(elements);
thrust::device_vector<ValueT> in_vals(elements);
thrust::device_vector<ValueT> out_vals(elements);

const bit_entropy entropy = str_to_entropy(state.get_string("Entropy"));
gen(seed_t{}, keys, entropy);

KeyT *d_keys = thrust::raw_pointer_cast(keys.data());
ValueT *d_in_vals = thrust::raw_pointer_cast(in_vals.data());
ValueT *d_out_vals = thrust::raw_pointer_cast(out_vals.data());

state.add_element_count(elements);
state.add_global_memory_reads<KeyT>(elements);
state.add_global_memory_reads<ValueT>(elements);
state.add_global_memory_writes<ValueT>(elements);

size_t tmp_size;
dispatch_t::Dispatch(nullptr,
tmp_size,
d_keys,
d_in_vals,
d_out_vals,
equality_op_t{},
op_t{},
init_value_t{},
static_cast<int>(elements),
0 /* stream */);

thrust::device_vector<nvbench::uint8_t> tmp(tmp_size);
nvbench::uint8_t *d_tmp = thrust::raw_pointer_cast(tmp.data());

state.exec([&](nvbench::launch &launch) {
dispatch_t::Dispatch(nullptr,
tmp_size,
d_keys,
d_in_vals,
d_out_vals,
equality_op_t{},
op_t{},
init_value_t{},
static_cast<int>(elements),
launch.get_stream());
});
}

using some_offset_types = nvbench::type_list<nvbench::int32_t>;

#ifdef TUNE_KeyT
using key_types = nvbench::type_list<TUNE_KeyT>;
#else
using key_types = all_types;
#endif

#ifdef TUNE_ValueT
using value_types = nvbench::type_list<TUNE_ValueT>;
#else
using value_types = nvbench::type_list<int8_t, int16_t, int32_t, int64_t, int128_t>;
#endif

NVBENCH_BENCH_TYPES(scan, NVBENCH_TYPE_AXES(key_types, value_types, some_offset_types))
.set_name("cub::DeviceScan::ExclusiveSumByKey")
.set_type_axes_names({"KeyT{ct}", "ValueT{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_string_axis("Entropy", {"1.000", "0.544", "0.000"});

0 comments on commit c4ed14d

Please sign in to comment.