Skip to content

Commit

Permalink
Tune RLE for SM90
Browse files Browse the repository at this point in the history
  • Loading branch information
gevtushenko committed Jun 20, 2023
1 parent 81dd8c8 commit b0ae400
Show file tree
Hide file tree
Showing 4 changed files with 433 additions and 59 deletions.
4 changes: 3 additions & 1 deletion benchmarks/bench/run_length_encode/encode.cu
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ static void rle(nvbench::state &state, nvbench::type_list<T, OffsetT>)
accum_t,
device_reduce_by_key_policy_hub>;
#else
using policy_t = cub::detail::device_run_length_encode_policy_hub<accum_t, T>;
using dispatch_t = cub::DispatchReduceByKey<keys_input_it_t,
unique_output_it_t,
vals_input_it_t,
Expand All @@ -99,7 +100,8 @@ static void rle(nvbench::state &state, nvbench::type_list<T, OffsetT>)
equality_op_t,
reduction_op_t,
offset_t,
accum_t>;
accum_t,
policy_t>;
#endif

const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
Expand Down
58 changes: 34 additions & 24 deletions cub/device/device_run_length_encode.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,16 @@

#pragma once

#include <iterator>
#include <stdio.h>

#include <cub/config.cuh>
#include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
#include <cub/device/dispatch/dispatch_rle.cuh>
#include <cub/device/dispatch/tuning/tuning_run_length_encode.cuh>
#include <cub/util_deprecated.cuh>

#include <iterator>

#include <stdio.h>

CUB_NAMESPACE_BEGIN


Expand Down Expand Up @@ -204,36 +206,44 @@ struct DeviceRunLengthEncode
int num_items,
cudaStream_t stream = 0)
{
using OffsetT = int; // Signed integer type for global offsets
using EqualityOp = Equality; // Default == operator
using ReductionOp = cub::Sum; // Value reduction operator
using offset_t = int; // Signed integer type for global offsets
using equality_op = Equality; // Default == operator
using reduction_op = cub::Sum; // Value reduction operator

// The lengths output value type
using LengthT =
cub::detail::non_void_value_t<LengthsOutputIteratorT, OffsetT>;
using length_t =
cub::detail::non_void_value_t<LengthsOutputIteratorT, offset_t>;

// Generator type for providing 1s values for run-length reduction
using LengthsInputIteratorT = ConstantInputIterator<LengthT, OffsetT>;
using lengths_input_iterator_t = ConstantInputIterator<length_t, offset_t>;

using accum_t = detail::accumulator_t<reduction_op, length_t, length_t>;

using key_t =
cub::detail::non_void_value_t<UniqueOutputIteratorT, cub::detail::value_t<InputIteratorT>>;

using policy_t = detail::device_run_length_encode_policy_hub<accum_t, key_t>;

return DispatchReduceByKey<InputIteratorT,
UniqueOutputIteratorT,
LengthsInputIteratorT,
lengths_input_iterator_t,
LengthsOutputIteratorT,
NumRunsOutputIteratorT,
EqualityOp,
ReductionOp,
OffsetT>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in,
d_unique_out,
LengthsInputIteratorT(
(LengthT)1),
d_counts_out,
d_num_runs_out,
EqualityOp(),
ReductionOp(),
num_items,
stream);
equality_op,
reduction_op,
offset_t,
accum_t,
policy_t>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in,
d_unique_out,
lengths_input_iterator_t((length_t)1),
d_counts_out,
d_num_runs_out,
equality_op(),
reduction_op(),
num_items,
stream);
}

template <typename InputIteratorT,
Expand Down
38 changes: 4 additions & 34 deletions cub/device/dispatch/dispatch_rle.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#include <cub/agent/agent_rle.cuh>
#include <cub/config.cuh>
#include <cub/device/dispatch/dispatch_scan.cuh>
#include <cub/device/dispatch/tuning/tuning_run_length_encode.cuh>
#include <cub/grid/grid_queue.cuh>
#include <cub/thread/thread_operators.cuh>
#include <cub/util_device.cuh>
Expand Down Expand Up @@ -150,38 +151,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREA
* Dispatch
******************************************************************************/

namespace detail
{

template <class T>
struct device_rle_policy_hub
{
/// SM35
struct Policy350 : ChainedPolicy<350, Policy350, Policy350>
{
enum
{
NOMINAL_4B_ITEMS_PER_THREAD = 15,

ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD,
CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
};

using RleSweepPolicyT =
AgentRlePolicy<96,
ITEMS_PER_THREAD,
BLOCK_LOAD_DIRECT,
LOAD_LDG,
true,
BLOCK_SCAN_WARP_SCANS,
detail::default_reduce_by_key_delay_constructor_t<int, int>>;
};

using MaxPolicy = Policy350;
};

} // namespace detail

/**
* Utility class for dispatching the appropriately-tuned kernels for DeviceRle
*
Expand Down Expand Up @@ -213,8 +182,9 @@ template <typename InputIteratorT,
typename NumRunsOutputIteratorT,
typename EqualityOpT,
typename OffsetT,
typename SelectedPolicy =
detail::device_rle_policy_hub<cub::detail::value_t<InputIteratorT>>>
typename SelectedPolicy = detail::device_non_trivial_runs_policy_hub<
cub::detail::non_void_value_t<LengthsOutputIteratorT, OffsetT>,
cub::detail::value_t<InputIteratorT>>>
struct DeviceRleDispatch
{
/******************************************************************************
Expand Down
Loading

0 comments on commit b0ae400

Please sign in to comment.