forked from NVIDIA/cub
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
a68fab7
commit 2a8c408
Showing
3 changed files
with
152 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
#include <nvbench_helper.cuh> | ||
|
||
// %RANGE% TUNE_ITEMS ipt 7:24:1 | ||
// %RANGE% TUNE_THREADS tpb 128:1024:32 | ||
// %RANGE% TUNE_TRANSPOSE trp 0:1:1 | ||
// %RANGE% TUNE_LOAD ld 0:1:1 | ||
// %RANGE% CUB_DETAIL_L2_BACKOFF_NS l2b 0:1200:5 | ||
// %RANGE% CUB_DETAIL_L2_WRITE_LATENCY_NS l2w 0:1200:5 | ||
|
||
#if !TUNE_BASE

// Map the integer tuning knobs injected by the %RANGE% directives above onto
// the corresponding CUB enumerators.
#  if TUNE_TRANSPOSE == 0
#    define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT
#  else // TUNE_TRANSPOSE == 1
#    define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE
#  endif // TUNE_TRANSPOSE

#  if TUNE_LOAD == 0
#    define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT
#  else // TUNE_LOAD == 1
#    define TUNE_LOAD_MODIFIER cub::LOAD_CA
#  endif // TUNE_LOAD

// Replacement policy hub handed to DispatchReduceByKey so that the tuning
// harness controls the agent configuration (threads per block, items per
// thread, load algorithm/modifier) instead of CUB's built-in defaults.
// A single Policy350 entry chained to itself covers all architectures.
struct device_reduce_by_key_policy_hub
{
  struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350>
  {
    using ReduceByKeyPolicyT = cub::AgentReduceByKeyPolicy<TUNE_THREADS,
                                                           TUNE_ITEMS,
                                                           TUNE_LOAD_ALGORITHM,
                                                           TUNE_LOAD_MODIFIER,
                                                           cub::BLOCK_SCAN_WARP_SCANS>;
  };

  using MaxPolicy = Policy350;
};

#endif // !TUNE_BASE
|
||
#include <cub/device/device_run_length_encode.cuh> | ||
|
||
/**
 * Benchmark for cub::DeviceRunLengthEncode::Encode, driven through the
 * underlying cub::DispatchReduceByKey so a custom tuning policy hub can be
 * injected when !TUNE_BASE.  Run-length encoding is expressed as a
 * reduce-by-key over the input keys with a constant stream of `1`s as values;
 * each run's summed values yield its length.
 *
 * Axes:
 *   Elements{io} - number of input keys
 *   MaxSegSize   - upper bound on generated run length (min is fixed at 1)
 *
 * @tparam T       key type under test
 * @tparam OffsetT offset/count type (also used as the run-length accumulator)
 */
template <class T, class OffsetT>
static void rle(nvbench::state &state, nvbench::type_list<T, OffsetT>)
{
  using offset_t                   = OffsetT;
  using keys_input_it_t            = const T *;
  using unique_output_it_t         = T *;
  // Constant "1" per key: summing it per run produces the run length.
  using vals_input_it_t            = cub::ConstantInputIterator<offset_t, OffsetT>;
  using aggregate_output_it_t      = offset_t *;
  using num_runs_output_iterator_t = offset_t *;
  using equality_op_t              = cub::Equality;
  using reduction_op_t             = cub::Sum;
  using accum_t                    = offset_t;

#if !TUNE_BASE
  // Tuned variant: override the agent policy via the custom hub above.
  using dispatch_t = cub::DispatchReduceByKey<keys_input_it_t,
                                              unique_output_it_t,
                                              vals_input_it_t,
                                              aggregate_output_it_t,
                                              num_runs_output_iterator_t,
                                              equality_op_t,
                                              reduction_op_t,
                                              offset_t,
                                              accum_t,
                                              device_reduce_by_key_policy_hub>;
#else
  // Baseline variant: CUB's default policy hub.
  using dispatch_t = cub::DispatchReduceByKey<keys_input_it_t,
                                              unique_output_it_t,
                                              vals_input_it_t,
                                              aggregate_output_it_t,
                                              num_runs_output_iterator_t,
                                              equality_op_t,
                                              reduction_op_t,
                                              offset_t,
                                              accum_t>;
#endif

  const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
  const std::size_t min_segment_size = 1;
  const std::size_t max_segment_size = static_cast<std::size_t>(state.get_int64("MaxSegSize"));

  // Worst case every key starts a new run, so outputs are sized to `elements`.
  thrust::device_vector<offset_t> num_runs_out(1);
  thrust::device_vector<offset_t> out_vals(elements);
  thrust::device_vector<T> out_keys(elements);
  thrust::device_vector<T> in_keys =
    gen_uniform_key_segments<T>(seed_t{}, elements, min_segment_size, max_segment_size);

  T *d_in_keys              = thrust::raw_pointer_cast(in_keys.data());
  T *d_out_keys             = thrust::raw_pointer_cast(out_keys.data());
  offset_t *d_out_vals      = thrust::raw_pointer_cast(out_vals.data());
  offset_t *d_num_runs_out  = thrust::raw_pointer_cast(num_runs_out.data());
  vals_input_it_t d_in_vals(offset_t{1});

  std::uint8_t *d_temp_storage{};
  std::size_t temp_storage_bytes{};

  // First call with a null temp-storage pointer only queries the required
  // temporary-storage size.  Check the result: a failure here would leave
  // temp_storage_bytes meaningless.
  cudaError_t error = dispatch_t::Dispatch(d_temp_storage,
                                           temp_storage_bytes,
                                           d_in_keys,
                                           d_out_keys,
                                           d_in_vals,
                                           d_out_vals,
                                           d_num_runs_out,
                                           equality_op_t{},
                                           reduction_op_t{},
                                           elements,
                                           0);
  if (error != cudaSuccess)
  {
    state.skip("Temp-storage size query failed");
    return;
  }

  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
  d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

  // Warm-up run on the default stream; its only purpose is to obtain the
  // actual number of runs so the memory-write estimates below are accurate.
  error = dispatch_t::Dispatch(d_temp_storage,
                               temp_storage_bytes,
                               d_in_keys,
                               d_out_keys,
                               d_in_vals,
                               d_out_vals,
                               d_num_runs_out,
                               equality_op_t{},
                               reduction_op_t{},
                               elements,
                               0);
  if (error != cudaSuccess)
  {
    state.skip("Warm-up dispatch failed");
    return;
  }

  // Dispatch is asynchronous; synchronize (and check for deferred kernel
  // errors) before reading num_runs_out back on the host.
  error = cudaDeviceSynchronize();
  if (error != cudaSuccess)
  {
    state.skip("Warm-up dispatch execution failed");
    return;
  }
  const OffsetT num_runs = num_runs_out[0];

  state.add_element_count(elements);
  state.add_global_memory_reads<T>(elements);        // input keys
  state.add_global_memory_writes<T>(num_runs);       // unique keys out
  state.add_global_memory_writes<OffsetT>(num_runs); // run lengths out
  state.add_global_memory_writes<OffsetT>(1);        // num_runs_out

  state.exec([&](nvbench::launch &launch) {
    dispatch_t::Dispatch(d_temp_storage,
                         temp_storage_bytes,
                         d_in_keys,
                         d_out_keys,
                         d_in_vals,
                         d_out_vals,
                         d_num_runs_out,
                         equality_op_t{},
                         reduction_op_t{},
                         elements,
                         launch.get_stream());
  });
}
|
||
// Only benchmark 32-bit offsets; wider offsets are not tuned here.
using some_offset_types = nvbench::type_list<nvbench::int32_t>;

// Register the benchmark over the cross product of all key types and the
// offset types above.  Element counts sweep 2^16..2^28 in steps of 2^4;
// maximum segment sizes are 2^1, 2^4, and 2^8.
NVBENCH_BENCH_TYPES(rle, NVBENCH_TYPE_AXES(all_types, some_offset_types))
  .set_name("cub::DeviceRunLengthEncode::Encode")
  .set_type_axes_names({"T{ct}", "OffsetT{ct}"})
  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
  .add_int64_power_of_two_axis("MaxSegSize", {1, 4, 8});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters