Skip to content

Commit

Permalink
Add thrust::inclusive_scan with init_value support (#1940)
Browse files Browse the repository at this point in the history
* Add thrust::inclusive_scan with init value sequential

* Add thrust::inclusive_scan cuda par with init value

* Add thrust::async::incluisve_scan with init value

* Add thrust::inclusive_scan tbb with init value

* Handle reviews

* Consolidate init overloads into a single overload that accepts both init and binary_op

* Fix formatting issues

* Add cuda::std::accumulator_t and use it for value_type in scan algorithms

* Redo Bernhard's work and consolidate the two tbb::inclusive_scan bodies

* Handle final reviews

* Replace cub::accumulator_t with cuda::std::__accumulator_t
  • Loading branch information
gonidelis committed Aug 28, 2024
1 parent ec5bd08 commit e311e89
Show file tree
Hide file tree
Showing 31 changed files with 603 additions and 80 deletions.
4 changes: 3 additions & 1 deletion cub/benchmarks/bench/scan/exclusive/base.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@

#include <cub/device/device_scan.cuh>

#include <cuda/std/__functional/invoke.h>

#include <look_back_helper.cuh>

#if !TUNE_BASE
Expand Down Expand Up @@ -85,7 +87,7 @@ template <typename T, typename OffsetT>
static void basic(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
using init_t = cub::detail::InputValue<T>;
using accum_t = cub::detail::accumulator_t<op_t, T, T>;
using accum_t = ::cuda::std::__accumulator_t<op_t, T, T>;
using input_it_t = const T*;
using output_it_t = T*;
using offset_t = OffsetT;
Expand Down
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/scan/exclusive/by_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ static void scan(nvbench::state& state, nvbench::type_list<KeyT, ValueT, OffsetT
{
using init_value_t = ValueT;
using op_t = cub::Sum;
using accum_t = cub::detail::accumulator_t<op_t, init_value_t, ValueT>;
using accum_t = ::cuda::std::__accumulator_t<op_t, ValueT, init_value_t>;
using key_input_it_t = const KeyT*;
using val_input_it_t = const ValueT*;
using val_output_it_t = ValueT*;
Expand Down
4 changes: 0 additions & 4 deletions cub/cub/detail/type_traits.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,5 @@ using invoke_result_t =
::cuda::std::invoke_result_t<Invokable, Args...>;
#endif

/// The type of intermediate accumulator (according to P2322R6)
template <typename Invokable, typename InitT, typename InputT>
using accumulator_t = typename ::cuda::std::decay<invoke_result_t<Invokable, InitT, InputT>>::type;

} // namespace detail
CUB_NAMESPACE_END
4 changes: 3 additions & 1 deletion cub/cub/device/device_run_length_encode.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@

#include <cub/config.cuh>

#include <cuda/std/__functional/invoke.h>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
Expand Down Expand Up @@ -200,7 +202,7 @@ struct DeviceRunLengthEncode
// Generator type for providing 1s values for run-length reduction
using lengths_input_iterator_t = ConstantInputIterator<length_t, offset_t>;

using accum_t = detail::accumulator_t<reduction_op, length_t, length_t>;
using accum_t = ::cuda::std::__accumulator_t<reduction_op, length_t, length_t>;

using key_t = cub::detail::non_void_value_t<UniqueOutputIteratorT, cub::detail::value_t<InputIteratorT>>;

Expand Down
4 changes: 3 additions & 1 deletion cub/cub/device/device_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@
#include <cub/thread/thread_operators.cuh>
#include <cub/util_deprecated.cuh>

#include <cuda/std/__functional/invoke.h>

CUB_NAMESPACE_BEGIN

//! @rst
Expand Down Expand Up @@ -1303,7 +1305,7 @@ struct DeviceScan

// Unsigned integer type for global offsets
using OffsetT = detail::choose_offset_t<NumItemsT>;
using AccumT = cub::detail::accumulator_t<ScanOpT, InitValueT, cub::detail::value_t<InputIteratorT>>;
using AccumT = ::cuda::std::__accumulator_t<ScanOpT, cub::detail::value_t<InputIteratorT>, InitValueT>;
constexpr bool ForceInclusive = true;

return DispatchScan<
Expand Down
25 changes: 12 additions & 13 deletions cub/cub/device/dispatch/dispatch_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ template <typename InputIteratorT,
typename OffsetT,
typename ReductionOpT,
typename InitT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::value_t<InputIteratorT>>,
typename AccumT = detail::accumulator_t<ReductionOpT, InitT, cub::detail::value_t<InputIteratorT>>,
typename AccumT = ::cuda::std::__accumulator_t<ReductionOpT, cub::detail::value_t<InputIteratorT>, InitT>,
typename SelectedPolicy = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>,
typename TransformOpT = ::cuda::std::__identity>
struct DispatchReduce : SelectedPolicy
Expand Down Expand Up @@ -797,17 +797,16 @@ struct DispatchReduce : SelectedPolicy
* @tparam InitT
* Initial value type
*/
template <typename InputIteratorT,
typename OutputIteratorT,
typename OffsetT,
typename ReductionOpT,
typename TransformOpT,
typename InitT,
typename AccumT =
detail::accumulator_t<ReductionOpT, //
InitT,
cub::detail::invoke_result_t<TransformOpT, cub::detail::value_t<InputIteratorT>>>,
typename SelectedPolicyT = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>>
template <
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetT,
typename ReductionOpT,
typename TransformOpT,
typename InitT,
typename AccumT = ::cuda::std::
__accumulator_t<ReductionOpT, cub::detail::invoke_result_t<TransformOpT, cub::detail::value_t<InputIteratorT>>, InitT>,
typename SelectedPolicyT = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>>
using DispatchTransformReduce =
DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, InitT, AccumT, SelectedPolicyT, TransformOpT>;

Expand Down Expand Up @@ -850,7 +849,7 @@ template <typename InputIteratorT,
typename OffsetT,
typename ReductionOpT,
typename InitT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::value_t<InputIteratorT>>,
typename AccumT = detail::accumulator_t<ReductionOpT, InitT, cub::detail::value_t<InputIteratorT>>,
typename AccumT = ::cuda::std::__accumulator_t<ReductionOpT, cub::detail::value_t<InputIteratorT>, InitT>,
typename SelectedPolicy = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>>
struct DispatchSegmentedReduce : SelectedPolicy
{
Expand Down
38 changes: 19 additions & 19 deletions cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -213,25 +213,25 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_TH
* Implementation detail, do not specify directly, requirements on the
* content of this type are subject to breaking change.
*/
template <
typename KeysInputIteratorT,
typename UniqueOutputIteratorT,
typename ValuesInputIteratorT,
typename AggregatesOutputIteratorT,
typename NumRunsOutputIteratorT,
typename EqualityOpT,
typename ReductionOpT,
typename OffsetT,
typename AccumT = //
detail::
accumulator_t<ReductionOpT, cub::detail::value_t<ValuesInputIteratorT>, cub::detail::value_t<ValuesInputIteratorT>>,
typename SelectedPolicy = //
detail::device_reduce_by_key_policy_hub< //
ReductionOpT, //
AccumT, //
cub::detail::non_void_value_t< //
UniqueOutputIteratorT, //
cub::detail::value_t<KeysInputIteratorT>>>>
template <typename KeysInputIteratorT,
typename UniqueOutputIteratorT,
typename ValuesInputIteratorT,
typename AggregatesOutputIteratorT,
typename NumRunsOutputIteratorT,
typename EqualityOpT,
typename ReductionOpT,
typename OffsetT,
typename AccumT = //
::cuda::std::__accumulator_t<ReductionOpT,
cub::detail::value_t<ValuesInputIteratorT>,
cub::detail::value_t<ValuesInputIteratorT>>,
typename SelectedPolicy = //
detail::device_reduce_by_key_policy_hub< //
ReductionOpT, //
AccumT, //
cub::detail::non_void_value_t< //
UniqueOutputIteratorT, //
cub::detail::value_t<KeysInputIteratorT>>>>
struct DispatchReduceByKey
{
//-------------------------------------------------------------------------
Expand Down
10 changes: 5 additions & 5 deletions cub/cub/device/dispatch/dispatch_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -234,11 +234,11 @@ template <typename InputIteratorT,
typename ScanOpT,
typename InitValueT,
typename OffsetT,
typename AccumT = detail::accumulator_t<ScanOpT,
::cuda::std::_If<std::is_same<InitValueT, NullType>::value,
cub::detail::value_t<InputIteratorT>,
typename InitValueT::value_type>,
cub::detail::value_t<InputIteratorT>>,
typename AccumT = ::cuda::std::__accumulator_t<ScanOpT,
cub::detail::value_t<InputIteratorT>,
::cuda::std::_If<std::is_same<InitValueT, NullType>::value,
cub::detail::value_t<InputIteratorT>,
typename InitValueT::value_type>>,
typename SelectedPolicy = DeviceScanPolicy<AccumT, ScanOpT>,
bool ForceInclusive = false>
struct DispatchScan : SelectedPolicy
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/device/dispatch/dispatch_scan_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -228,10 +228,10 @@ template <
typename ScanOpT,
typename InitValueT,
typename OffsetT,
typename AccumT = detail::accumulator_t<
typename AccumT = ::cuda::std::__accumulator_t<
ScanOpT,
::cuda::std::_If<std::is_same<InitValueT, NullType>::value, cub::detail::value_t<ValuesInputIteratorT>, InitValueT>,
cub::detail::value_t<ValuesInputIteratorT>>,
cub::detail::value_t<ValuesInputIteratorT>,
::cuda::std::_If<std::is_same<InitValueT, NullType>::value, cub::detail::value_t<ValuesInputIteratorT>, InitValueT>>,
typename SelectedPolicy =
DeviceScanByKeyPolicy<KeysInputIteratorT, AccumT, cub::detail::value_t<ValuesInputIteratorT>, ScanOpT>>
struct DispatchScanByKey : SelectedPolicy
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/thread/thread_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ template <int LENGTH,
typename T,
typename ReductionOp,
typename PrefixT,
typename AccumT = detail::accumulator_t<ReductionOp, PrefixT, T>>
typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, T, PrefixT>>
_CCCL_DEVICE _CCCL_FORCEINLINE AccumT
ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix, Int2Type<LENGTH> /*length*/)
{
Expand Down Expand Up @@ -110,7 +110,7 @@ template <int LENGTH,
typename T,
typename ReductionOp,
typename PrefixT,
typename AccumT = detail::accumulator_t<ReductionOp, PrefixT, T>>
typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, T, PrefixT>>
_CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix)
{
return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>());
Expand Down Expand Up @@ -170,7 +170,7 @@ template <int LENGTH,
typename T,
typename ReductionOp,
typename PrefixT,
typename AccumT = detail::accumulator_t<ReductionOp, PrefixT, T>>
typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, T, PrefixT>>
_CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T (&input)[LENGTH], ReductionOp reduction_op, PrefixT prefix)
{
return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>());
Expand Down
4 changes: 2 additions & 2 deletions cub/test/catch2_test_device_reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f
auto reduction_op = unwrap_op(reference_extended_fp(d_in_it), op_t{});

// Prepare verification data
using accum_t = cub::detail::accumulator_t<op_t, output_t, item_t>;
using accum_t = ::cuda::std::__accumulator_t<op_t, item_t, output_t>;
output_t expected_result =
static_cast<output_t>(compute_single_problem_reference(in_items, reduction_op, accum_t{}));

Expand All @@ -152,7 +152,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f
SECTION("sum")
{
using op_t = cub::Sum;
using accum_t = cub::detail::accumulator_t<op_t, output_t, item_t>;
using accum_t = ::cuda::std::__accumulator_t<op_t, item_t, output_t>;

// Prepare verification data
output_t expected_result = static_cast<output_t>(compute_single_problem_reference(in_items, op_t{}, accum_t{}));
Expand Down
2 changes: 1 addition & 1 deletion cub/test/catch2_test_device_reduce_by_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ CUB_TEST("Device reduce-by-key works", "[by_key][reduce][device]", full_type_lis
auto reduction_op = unwrap_op(reference_extended_fp(d_values_it), op_t{});

// Prepare verification data
using accum_t = cub::detail::accumulator_t<op_t, output_t, value_t>;
using accum_t = ::cuda::std::__accumulator_t<op_t, value_t, output_t>;
c2h::host_vector<output_t> expected_result(num_segments);
compute_segmented_problem_reference(in_values, segment_offsets, reduction_op, accum_t{}, expected_result.begin());
c2h::host_vector<key_t> expected_keys = compute_unique_keys_reference(segment_keys);
Expand Down
2 changes: 1 addition & 1 deletion cub/test/catch2_test_device_reduce_by_key_iterators.cu
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ CUB_TEST("Device reduce-by-key works with iterators", "[by_key][reduce][device]"
using op_t = cub::Sum;

// Prepare verification data
using accum_t = cub::detail::accumulator_t<op_t, output_t, value_t>;
using accum_t = ::cuda::std::__accumulator_t<op_t, value_t, output_t>;
c2h::host_vector<output_t> expected_result(num_segments);
compute_segmented_problem_reference(value_it, segment_offsets, op_t{}, accum_t{}, expected_result.begin());
c2h::host_vector<key_t> expected_keys = compute_unique_keys_reference(segment_keys);
Expand Down
2 changes: 1 addition & 1 deletion cub/test/catch2_test_device_reduce_iterators.cu
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ CUB_TEST("Device reduce works with fancy input iterators", "[reduce][device]", i
auto reduction_op = op_t{};

// Prepare verification data
using accum_t = cub::detail::accumulator_t<op_t, init_t, item_t>;
using accum_t = ::cuda::std::__accumulator_t<op_t, item_t, init_t>;
output_t expected_result = compute_single_problem_reference(in_it, in_it + num_items, reduction_op, accum_t{});

// Run test
Expand Down
12 changes: 6 additions & 6 deletions cub/test/catch2_test_device_scan.cu
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_
SECTION("inclusive sum")
{
using op_t = cub::Sum;
using accum_t = cub::detail::accumulator_t<op_t, input_t, input_t>;
using accum_t = ::cuda::std::__accumulator_t<op_t, input_t, input_t>;

// Prepare verification data
c2h::host_vector<input_t> host_items(in_items);
Expand Down Expand Up @@ -155,7 +155,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_
SECTION("exclusive sum")
{
using op_t = cub::Sum;
using accum_t = cub::detail::accumulator_t<op_t, input_t, input_t>;
using accum_t = ::cuda::std::__accumulator_t<op_t, input_t, input_t>;

// Prepare verification data
c2h::host_vector<input_t> host_items(in_items);
Expand Down Expand Up @@ -184,7 +184,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_
SECTION("inclusive scan")
{
using op_t = cub::Min;
using accum_t = cub::detail::accumulator_t<op_t, input_t, input_t>;
using accum_t = ::cuda::std::__accumulator_t<op_t, input_t, input_t>;

// Prepare verification data
c2h::host_vector<input_t> host_items(in_items);
Expand Down Expand Up @@ -213,7 +213,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_
SECTION("inclusive scan with init value")
{
using op_t = cub::Sum;
using accum_t = cub::detail::accumulator_t<op_t, input_t, input_t>;
using accum_t = ::cuda::std::__accumulator_t<op_t, input_t, input_t>;

// Scan operator
auto scan_op = unwrap_op(reference_extended_fp(d_in_it), op_t{});
Expand Down Expand Up @@ -248,7 +248,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_
SECTION("exclusive scan")
{
using op_t = cub::Sum;
using accum_t = cub::detail::accumulator_t<op_t, input_t, input_t>;
using accum_t = ::cuda::std::__accumulator_t<op_t, input_t, input_t>;

// Scan operator
auto scan_op = unwrap_op(reference_extended_fp(d_in_it), op_t{});
Expand Down Expand Up @@ -281,7 +281,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_
SECTION("exclusive scan with future-init value")
{
using op_t = cub::Sum;
using accum_t = cub::detail::accumulator_t<op_t, input_t, input_t>;
using accum_t = ::cuda::std::__accumulator_t<op_t, input_t, input_t>;

// Scan operator
auto scan_op = unwrap_op(reference_extended_fp(d_in_it), op_t{});
Expand Down
8 changes: 4 additions & 4 deletions cub/test/catch2_test_device_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ template <typename InputIt, typename OutputIt, typename InitT, typename BinaryOp
void compute_exclusive_scan_reference(InputIt first, InputIt last, OutputIt result, InitT init, BinaryOp op)
{
using value_t = cub::detail::value_t<InputIt>;
using accum_t = cub::detail::accumulator_t<BinaryOp, InitT, value_t>;
using accum_t = ::cuda::std::__accumulator_t<BinaryOp, value_t, InitT>;
using output_t = cub::detail::value_t<OutputIt>;
accum_t acc = static_cast<accum_t>(init);
for (; first != last; ++first)
Expand All @@ -75,7 +75,7 @@ template <typename InputIt, typename OutputIt, typename BinaryOp, typename InitT
void compute_inclusive_scan_reference(InputIt first, InputIt last, OutputIt result, BinaryOp op, InitT init)
{
using value_t = cub::detail::value_t<InputIt>;
using accum_t = cub::detail::accumulator_t<BinaryOp, InitT, value_t>;
using accum_t = ::cuda::std::__accumulator_t<BinaryOp, value_t, InitT>;
using output_t = cub::detail::value_t<OutputIt>;
accum_t acc = static_cast<accum_t>(init);
for (; first != last; ++first)
Expand All @@ -101,7 +101,7 @@ void compute_exclusive_scan_by_key_reference(
std::size_t num_items)
{
using value_t = cub::detail::value_t<ValueInItT>;
using accum_t = cub::detail::accumulator_t<ScanOpT, InitT, value_t>;
using accum_t = ::cuda::std::__accumulator_t<ScanOpT, value_t, InitT>;
using output_t = cub::detail::value_t<ValuesOutItT>;

if (num_items > 0)
Expand Down Expand Up @@ -152,7 +152,7 @@ void compute_inclusive_scan_by_key_reference(
std::size_t num_items)
{
using value_t = cub::detail::value_t<ValueInItT>;
using accum_t = cub::detail::accumulator_t<ScanOpT, value_t, value_t>;
using accum_t = ::cuda::std::__accumulator_t<ScanOpT, value_t, value_t>;
using output_t = cub::detail::value_t<ValuesOutItT>;

for (std::size_t i = 0; i < num_items;)
Expand Down
Loading

0 comments on commit e311e89

Please sign in to comment.