Encode bench
gevtushenko committed May 17, 2023
1 parent a68fab7 commit 2a8c408
Showing 3 changed files with 152 additions and 5 deletions.
4 changes: 1 addition & 3 deletions benchmarks/bench/reduce/by_key.cu
@@ -1,4 +1,3 @@
-#include "thrust/device_vector.h"
 #include <nvbench_helper.cuh>
 
 // %RANGE% TUNE_ITEMS ipt 7:24:1
@@ -127,10 +126,9 @@ static void reduce(nvbench::state &state, nvbench::type_list<KeyT, ValueT, OffsetT>)
   state.add_global_memory_reads<KeyT>(elements);
   state.add_global_memory_reads<ValueT>(elements);
   state.add_global_memory_writes<ValueT>(num_runs);
+  state.add_global_memory_writes<KeyT>(num_runs);
   state.add_global_memory_writes<OffsetT>(1);
 
-  std::cout << "runs: " << num_runs << std::endl;
-
   state.exec([&](nvbench::launch &launch) {
     dispatch_t::Dispatch(d_temp_storage,
                          temp_storage_bytes,
151 changes: 151 additions & 0 deletions benchmarks/bench/run_length_encode/encode.cu
@@ -0,0 +1,151 @@
#include <nvbench_helper.cuh>

// %RANGE% TUNE_ITEMS ipt 7:24:1
// %RANGE% TUNE_THREADS tpb 128:1024:32
// %RANGE% TUNE_TRANSPOSE trp 0:1:1
// %RANGE% TUNE_LOAD ld 0:1:1
// %RANGE% CUB_DETAIL_L2_BACKOFF_NS l2b 0:1200:5
// %RANGE% CUB_DETAIL_L2_WRITE_LATENCY_NS l2w 0:1200:5
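// Each %RANGE% annotation declares one axis of the tuning search space swept
// by the benchmark tuning harness: target macro, short label, and min:max:step.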

#if !TUNE_BASE
#if TUNE_TRANSPOSE == 0
#define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT
#else // TUNE_TRANSPOSE == 1
#define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE
#endif // TUNE_TRANSPOSE

#if TUNE_LOAD == 0
#define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT
#else // TUNE_LOAD == 1
#define TUNE_LOAD_MODIFIER cub::LOAD_CA
#endif // TUNE_LOAD

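// When tuning (TUNE_BASE == 0), the benchmark supplies its own policy hub so
// the swept parameters above replace the default tuning policy; the
// single-entry ChainedPolicy applies it to every architecture (SM 3.5+).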
struct device_reduce_by_key_policy_hub
{
  struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350>
  {
    using ReduceByKeyPolicyT = cub::AgentReduceByKeyPolicy<TUNE_THREADS,
                                                           TUNE_ITEMS,
                                                           TUNE_LOAD_ALGORITHM,
                                                           TUNE_LOAD_MODIFIER,
                                                           cub::BLOCK_SCAN_WARP_SCANS>;
  };

  using MaxPolicy = Policy350;
};
#endif // !TUNE_BASE

#include <cub/device/device_run_length_encode.cuh>

template <class T, class OffsetT>
static void rle(nvbench::state &state, nvbench::type_list<T, OffsetT>)
{
  using offset_t                   = OffsetT;
  using keys_input_it_t            = const T *;
  using unique_output_it_t         = T *;
  using vals_input_it_t            = cub::ConstantInputIterator<offset_t, OffsetT>;
  using aggregate_output_it_t      = offset_t *;
  using num_runs_output_iterator_t = offset_t *;
  using equality_op_t              = cub::Equality;
  using reduction_op_t             = cub::Sum;
  using accum_t                    = offset_t;
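
  // Run-length encoding is expressed as reduce-by-key: adjacent equal keys
  // form a run, and summing a constant 1 per element yields the run length.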

#if !TUNE_BASE
  using dispatch_t = cub::DispatchReduceByKey<keys_input_it_t,
                                              unique_output_it_t,
                                              vals_input_it_t,
                                              aggregate_output_it_t,
                                              num_runs_output_iterator_t,
                                              equality_op_t,
                                              reduction_op_t,
                                              offset_t,
                                              accum_t,
                                              device_reduce_by_key_policy_hub>;
#else // TUNE_BASE
  using dispatch_t = cub::DispatchReduceByKey<keys_input_it_t,
                                              unique_output_it_t,
                                              vals_input_it_t,
                                              aggregate_output_it_t,
                                              num_runs_output_iterator_t,
                                              equality_op_t,
                                              reduction_op_t,
                                              offset_t,
                                              accum_t>;
#endif // TUNE_BASE

  const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
  const std::size_t min_segment_size = 1;
  const std::size_t max_segment_size = static_cast<std::size_t>(state.get_int64("MaxSegSize"));

  thrust::device_vector<offset_t> num_runs_out(1);
  thrust::device_vector<offset_t> out_vals(elements);
  thrust::device_vector<T> out_keys(elements);
  thrust::device_vector<T> in_keys =
    gen_uniform_key_segments<T>(seed_t{}, elements, min_segment_size, max_segment_size);

  T *d_in_keys             = thrust::raw_pointer_cast(in_keys.data());
  T *d_out_keys            = thrust::raw_pointer_cast(out_keys.data());
  offset_t *d_out_vals     = thrust::raw_pointer_cast(out_vals.data());
  offset_t *d_num_runs_out = thrust::raw_pointer_cast(num_runs_out.data());
  vals_input_it_t d_in_vals(offset_t{1});

  std::uint8_t *d_temp_storage{};
  std::size_t temp_storage_bytes{};

  dispatch_t::Dispatch(d_temp_storage,
                       temp_storage_bytes,
                       d_in_keys,
                       d_out_keys,
                       d_in_vals,
                       d_out_vals,
                       d_num_runs_out,
                       equality_op_t{},
                       reduction_op_t{},
                       elements,
                       0);
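  // The Dispatch call above, made with a null d_temp_storage, only computes
  // temp_storage_bytes; no kernels are launched until storage is provided.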

  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
  d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

  dispatch_t::Dispatch(d_temp_storage,
                       temp_storage_bytes,
                       d_in_keys,
                       d_out_keys,
                       d_in_vals,
                       d_out_vals,
                       d_num_runs_out,
                       equality_op_t{},
                       reduction_op_t{},
                       elements,
                       0);
  cudaDeviceSynchronize();
  const OffsetT num_runs = num_runs_out[0];
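  // The preceding synchronization makes the device-side run count safe to
  // read on the host; num_runs sizes the memory traffic accounting below.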

  state.add_element_count(elements);
  state.add_global_memory_reads<T>(elements);
  state.add_global_memory_writes<T>(num_runs);
  state.add_global_memory_writes<OffsetT>(num_runs);
  state.add_global_memory_writes<OffsetT>(1);
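  // Every input key is read once; each of the num_runs runs writes one unique
  // key and one run length, plus a single value for the run count itself.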

  state.exec([&](nvbench::launch &launch) {
    dispatch_t::Dispatch(d_temp_storage,
                         temp_storage_bytes,
                         d_in_keys,
                         d_out_keys,
                         d_in_vals,
                         d_out_vals,
                         d_num_runs_out,
                         equality_op_t{},
                         reduction_op_t{},
                         elements,
                         launch.get_stream());
  });
}

using some_offset_types = nvbench::type_list<nvbench::int32_t>;

NVBENCH_BENCH_TYPES(rle, NVBENCH_TYPE_AXES(all_types, some_offset_types))
  .set_name("cub::DeviceRunLengthEncode::Encode")
  .set_type_axes_names({"T{ct}", "OffsetT{ct}"})
  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
  .add_int64_power_of_two_axis("MaxSegSize", {1, 4, 8});
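
For context, the benchmark above drives DispatchReduceByKey directly so the tuned policy hub can be injected; end users reach the same functionality through the public cub::DeviceRunLengthEncode::Encode entry point. A minimal usage sketch of that entry point follows; the encode_example wrapper and the sample input values are illustrative assumptions, not part of this commit:

#include <cub/device/device_run_length_encode.cuh>
#include <thrust/device_vector.h>

#include <cstddef>
#include <cstdint>
#include <vector>

void encode_example()
{
  const std::vector<int> h_in{1, 1, 2, 3, 3, 3}; // runs: 1x2, 2x1, 3x3
  thrust::device_vector<int> in(h_in.begin(), h_in.end());
  thrust::device_vector<int> unique_out(in.size());
  thrust::device_vector<int> counts_out(in.size());
  thrust::device_vector<int> num_runs_out(1);

  // First call with a null d_temp_storage only queries the required size.
  void *d_temp_storage{};
  std::size_t temp_storage_bytes{};
  cub::DeviceRunLengthEncode::Encode(d_temp_storage,
                                     temp_storage_bytes,
                                     thrust::raw_pointer_cast(in.data()),
                                     thrust::raw_pointer_cast(unique_out.data()),
                                     thrust::raw_pointer_cast(counts_out.data()),
                                     thrust::raw_pointer_cast(num_runs_out.data()),
                                     static_cast<int>(in.size()));

  // Second call performs the encode with the allocated temporary storage.
  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
  d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
  cub::DeviceRunLengthEncode::Encode(d_temp_storage,
                                     temp_storage_bytes,
                                     thrust::raw_pointer_cast(in.data()),
                                     thrust::raw_pointer_cast(unique_out.data()),
                                     thrust::raw_pointer_cast(counts_out.data()),
                                     thrust::raw_pointer_cast(num_runs_out.data()),
                                     static_cast<int>(in.size()));

  // Expected: num_runs_out[0] == 3, unique_out == {1, 2, 3}, counts_out == {2, 1, 3}
}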
2 changes: 0 additions & 2 deletions cub/device/device_run_length_encode.cuh
@@ -205,8 +205,6 @@ struct DeviceRunLengthEncode
                      cudaStream_t stream = 0)
   {
     using OffsetT      = int;        // Signed integer type for global offsets
-    using FlagIterator = NullType *; // FlagT iterator type (not used)
-    using SelectOp     = NullType;   // Selection op (not used)
     using EqualityOp   = Equality;   // Default == operator
     using ReductionOp  = cub::Sum;   // Value reduction operator
 
