diff --git a/benchmarks/bench/run_length_encode/non_trivial_runs.cu b/benchmarks/bench/run_length_encode/non_trivial_runs.cu new file mode 100644 index 000000000..993e7a775 --- /dev/null +++ b/benchmarks/bench/run_length_encode/non_trivial_runs.cu @@ -0,0 +1,138 @@ +#include + +// %RANGE% TUNE_ITEMS ipt 7:24:1 +// %RANGE% TUNE_THREADS tpb 128:1024:32 +// %RANGE% TUNE_TRANSPOSE trp 0:1:1 +// %RANGE% TUNE_TIME_SLICING ts 0:1:1 +// %RANGE% TUNE_LOAD ld 0:1:1 +// %RANGE% CUB_DETAIL_L2_BACKOFF_NS l2b 0:1200:5 +// %RANGE% CUB_DETAIL_L2_WRITE_LATENCY_NS l2w 0:1200:5 + +#include + +#if !TUNE_BASE +#if TUNE_TRANSPOSE == 0 +#define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT +#else // TUNE_TRANSPOSE == 1 +#define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE +#endif // TUNE_TRANSPOSE + +#if TUNE_LOAD == 0 +#define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT +#else // TUNE_LOAD == 1 +#define TUNE_LOAD_MODIFIER cub::LOAD_CA +#endif // TUNE_LOAD + +struct device_rle_policy_hub +{ + struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350> + { + using RleSweepPolicyT = cub::AgentRlePolicy; + }; + + using MaxPolicy = Policy350; +}; +#endif // !TUNE_BASE + +template +static void rle(nvbench::state &state, nvbench::type_list) +{ + using offset_t = OffsetT; + using keys_input_it_t = const T*; + using offset_output_it_t = offset_t*; + using length_output_it_t = offset_t*; + using num_runs_output_iterator_t = offset_t*; + using equality_op_t = cub::Equality; + using accum_t = offset_t; + + #if !TUNE_BASE + using dispatch_t = cub::DeviceRleDispatch; + #else + using dispatch_t = cub::DeviceRleDispatch; + #endif + + const auto elements = static_cast(state.get_int64("Elements{io}")); + const std::size_t min_segment_size = 1; + const std::size_t max_segment_size = static_cast(state.get_int64("MaxSegSize")); + + thrust::device_vector num_runs_out(1); + thrust::device_vector out_offsets(elements); + thrust::device_vector out_lengths(elements); + thrust::device_vector in_keys = + gen_uniform_key_segments(seed_t{}, elements, min_segment_size, max_segment_size); + + T *d_in_keys = thrust::raw_pointer_cast(in_keys.data()); + offset_t *d_out_offsets = thrust::raw_pointer_cast(out_offsets.data()); + offset_t *d_out_lengths = thrust::raw_pointer_cast(out_lengths.data()); + offset_t *d_num_runs_out = thrust::raw_pointer_cast(num_runs_out.data()); + + std::uint8_t *d_temp_storage{}; + std::size_t temp_storage_bytes{}; + + dispatch_t::Dispatch(d_temp_storage, + temp_storage_bytes, + d_in_keys, + d_out_offsets, + d_out_lengths, + d_num_runs_out, + equality_op_t{}, + elements, + 0); + + thrust::device_vector temp_storage(temp_storage_bytes); + d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); + + dispatch_t::Dispatch(d_temp_storage, + temp_storage_bytes, + d_in_keys, + d_out_offsets, + d_out_lengths, + d_num_runs_out, + equality_op_t{}, + elements, + 0); + cudaDeviceSynchronize(); + const OffsetT num_runs = num_runs_out[0]; + + state.add_element_count(elements); + state.add_global_memory_reads(elements); + state.add_global_memory_writes(num_runs); + state.add_global_memory_writes(num_runs); + state.add_global_memory_writes(1); + + state.exec([&](nvbench::launch &launch) { + dispatch_t::Dispatch(d_temp_storage, + temp_storage_bytes, + d_in_keys, + d_out_offsets, + d_out_lengths, + d_num_runs_out, + equality_op_t{}, + elements, + launch.get_stream()); + }); +} + +using some_offset_types = nvbench::type_list; + +NVBENCH_BENCH_TYPES(rle, NVBENCH_TYPE_AXES(all_types, some_offset_types)) + .set_name("cub::DeviceRunLengthEncode::Encode") + .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) + .add_int64_power_of_two_axis("MaxSegSize", {1, 4, 8});