Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add thrust::inclusive_scan with init_value support #1940

Merged
merged 11 commits into from
Aug 28, 2024
4 changes: 3 additions & 1 deletion cub/benchmarks/bench/scan/exclusive/base.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@

#include <cub/device/device_scan.cuh>

#include <cuda/std/__functional/invoke.h>

#include <look_back_helper.cuh>

#if !TUNE_BASE
Expand Down Expand Up @@ -85,7 +87,7 @@ template <typename T, typename OffsetT>
static void basic(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
using init_t = cub::detail::InputValue<T>;
using accum_t = cub::detail::accumulator_t<op_t, T, T>;
using accum_t = ::cuda::std::__accumulator_t<op_t, T, T>;
using input_it_t = const T*;
using output_it_t = T*;
using offset_t = OffsetT;
Expand Down
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/scan/exclusive/by_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ static void scan(nvbench::state& state, nvbench::type_list<KeyT, ValueT, OffsetT
{
using init_value_t = ValueT;
using op_t = cub::Sum;
using accum_t = cub::detail::accumulator_t<op_t, init_value_t, ValueT>;
using accum_t = ::cuda::std::__accumulator_t<op_t, ValueT, init_value_t>;
using key_input_it_t = const KeyT*;
using val_input_it_t = const ValueT*;
using val_output_it_t = ValueT*;
Expand Down
4 changes: 0 additions & 4 deletions cub/cub/detail/type_traits.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,5 @@ using invoke_result_t =
::cuda::std::invoke_result_t<Invokable, Args...>;
#endif

/// The type of intermediate accumulator (according to P2322R6)
template <typename Invokable, typename InitT, typename InputT>
using accumulator_t = typename ::cuda::std::decay<invoke_result_t<Invokable, InitT, InputT>>::type;

} // namespace detail
CUB_NAMESPACE_END
4 changes: 3 additions & 1 deletion cub/cub/device/device_run_length_encode.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@

#include <cub/config.cuh>

#include <cuda/std/__functional/invoke.h>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
Expand Down Expand Up @@ -200,7 +202,7 @@ struct DeviceRunLengthEncode
// Generator type for providing 1s values for run-length reduction
using lengths_input_iterator_t = ConstantInputIterator<length_t, offset_t>;

using accum_t = detail::accumulator_t<reduction_op, length_t, length_t>;
using accum_t = ::cuda::std::__accumulator_t<reduction_op, length_t, length_t>;

using key_t = cub::detail::non_void_value_t<UniqueOutputIteratorT, cub::detail::value_t<InputIteratorT>>;

Expand Down
4 changes: 3 additions & 1 deletion cub/cub/device/device_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@
#include <cub/thread/thread_operators.cuh>
#include <cub/util_deprecated.cuh>

#include <cuda/std/__functional/invoke.h>

CUB_NAMESPACE_BEGIN

//! @rst
Expand Down Expand Up @@ -1303,7 +1305,7 @@ struct DeviceScan

// Unsigned integer type for global offsets
using OffsetT = detail::choose_offset_t<NumItemsT>;
using AccumT = cub::detail::accumulator_t<ScanOpT, InitValueT, cub::detail::value_t<InputIteratorT>>;
using AccumT = ::cuda::std::__accumulator_t<ScanOpT, cub::detail::value_t<InputIteratorT>, InitValueT>;
constexpr bool ForceInclusive = true;

return DispatchScan<
Expand Down
25 changes: 12 additions & 13 deletions cub/cub/device/dispatch/dispatch_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ template <typename InputIteratorT,
typename OffsetT,
typename ReductionOpT,
typename InitT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::value_t<InputIteratorT>>,
typename AccumT = detail::accumulator_t<ReductionOpT, InitT, cub::detail::value_t<InputIteratorT>>,
typename AccumT = ::cuda::std::__accumulator_t<ReductionOpT, cub::detail::value_t<InputIteratorT>, InitT>,
typename SelectedPolicy = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>,
typename TransformOpT = ::cuda::std::__identity>
struct DispatchReduce : SelectedPolicy
Expand Down Expand Up @@ -797,17 +797,16 @@ struct DispatchReduce : SelectedPolicy
* @tparam InitT
* Initial value type
*/
template <typename InputIteratorT,
typename OutputIteratorT,
typename OffsetT,
typename ReductionOpT,
typename TransformOpT,
typename InitT,
typename AccumT =
detail::accumulator_t<ReductionOpT, //
InitT,
cub::detail::invoke_result_t<TransformOpT, cub::detail::value_t<InputIteratorT>>>,
typename SelectedPolicyT = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>>
template <
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetT,
typename ReductionOpT,
typename TransformOpT,
typename InitT,
typename AccumT = ::cuda::std::
__accumulator_t<ReductionOpT, cub::detail::invoke_result_t<TransformOpT, cub::detail::value_t<InputIteratorT>>, InitT>,
typename SelectedPolicyT = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>>
using DispatchTransformReduce =
DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, InitT, AccumT, SelectedPolicyT, TransformOpT>;

Expand Down Expand Up @@ -850,7 +849,7 @@ template <typename InputIteratorT,
typename OffsetT,
typename ReductionOpT,
typename InitT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::value_t<InputIteratorT>>,
typename AccumT = detail::accumulator_t<ReductionOpT, InitT, cub::detail::value_t<InputIteratorT>>,
typename AccumT = ::cuda::std::__accumulator_t<ReductionOpT, cub::detail::value_t<InputIteratorT>, InitT>,
typename SelectedPolicy = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>>
struct DispatchSegmentedReduce : SelectedPolicy
{
Expand Down
38 changes: 19 additions & 19 deletions cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -213,25 +213,25 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_TH
* Implementation detail, do not specify directly, requirements on the
* content of this type are subject to breaking change.
*/
template <
typename KeysInputIteratorT,
typename UniqueOutputIteratorT,
typename ValuesInputIteratorT,
typename AggregatesOutputIteratorT,
typename NumRunsOutputIteratorT,
typename EqualityOpT,
typename ReductionOpT,
typename OffsetT,
typename AccumT = //
detail::
accumulator_t<ReductionOpT, cub::detail::value_t<ValuesInputIteratorT>, cub::detail::value_t<ValuesInputIteratorT>>,
typename SelectedPolicy = //
detail::device_reduce_by_key_policy_hub< //
ReductionOpT, //
AccumT, //
cub::detail::non_void_value_t< //
UniqueOutputIteratorT, //
cub::detail::value_t<KeysInputIteratorT>>>>
template <typename KeysInputIteratorT,
typename UniqueOutputIteratorT,
typename ValuesInputIteratorT,
typename AggregatesOutputIteratorT,
typename NumRunsOutputIteratorT,
typename EqualityOpT,
typename ReductionOpT,
typename OffsetT,
typename AccumT = //
::cuda::std::__accumulator_t<ReductionOpT,
cub::detail::value_t<ValuesInputIteratorT>,
cub::detail::value_t<ValuesInputIteratorT>>,
typename SelectedPolicy = //
detail::device_reduce_by_key_policy_hub< //
ReductionOpT, //
AccumT, //
cub::detail::non_void_value_t< //
UniqueOutputIteratorT, //
cub::detail::value_t<KeysInputIteratorT>>>>
struct DispatchReduceByKey
{
//-------------------------------------------------------------------------
Expand Down
10 changes: 5 additions & 5 deletions cub/cub/device/dispatch/dispatch_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -234,11 +234,11 @@ template <typename InputIteratorT,
typename ScanOpT,
typename InitValueT,
typename OffsetT,
typename AccumT = detail::accumulator_t<ScanOpT,
::cuda::std::_If<std::is_same<InitValueT, NullType>::value,
cub::detail::value_t<InputIteratorT>,
typename InitValueT::value_type>,
cub::detail::value_t<InputIteratorT>>,
typename AccumT = ::cuda::std::__accumulator_t<ScanOpT,
cub::detail::value_t<InputIteratorT>,
::cuda::std::_If<std::is_same<InitValueT, NullType>::value,
cub::detail::value_t<InputIteratorT>,
typename InitValueT::value_type>>,
typename SelectedPolicy = DeviceScanPolicy<AccumT, ScanOpT>,
bool ForceInclusive = false>
struct DispatchScan : SelectedPolicy
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/device/dispatch/dispatch_scan_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -228,10 +228,10 @@ template <
typename ScanOpT,
typename InitValueT,
typename OffsetT,
typename AccumT = detail::accumulator_t<
typename AccumT = ::cuda::std::__accumulator_t<
ScanOpT,
::cuda::std::_If<std::is_same<InitValueT, NullType>::value, cub::detail::value_t<ValuesInputIteratorT>, InitValueT>,
cub::detail::value_t<ValuesInputIteratorT>>,
cub::detail::value_t<ValuesInputIteratorT>,
::cuda::std::_If<std::is_same<InitValueT, NullType>::value, cub::detail::value_t<ValuesInputIteratorT>, InitValueT>>,
typename SelectedPolicy =
DeviceScanByKeyPolicy<KeysInputIteratorT, AccumT, cub::detail::value_t<ValuesInputIteratorT>, ScanOpT>>
struct DispatchScanByKey : SelectedPolicy
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/thread/thread_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ template <int LENGTH,
typename T,
typename ReductionOp,
typename PrefixT,
typename AccumT = detail::accumulator_t<ReductionOp, PrefixT, T>>
typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, T, PrefixT>>
_CCCL_DEVICE _CCCL_FORCEINLINE AccumT
ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix, Int2Type<LENGTH> /*length*/)
{
Expand Down Expand Up @@ -110,7 +110,7 @@ template <int LENGTH,
typename T,
typename ReductionOp,
typename PrefixT,
typename AccumT = detail::accumulator_t<ReductionOp, PrefixT, T>>
typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, T, PrefixT>>
_CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix)
{
return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>());
Expand Down Expand Up @@ -170,7 +170,7 @@ template <int LENGTH,
typename T,
typename ReductionOp,
typename PrefixT,
typename AccumT = detail::accumulator_t<ReductionOp, PrefixT, T>>
typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, T, PrefixT>>
_CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T (&input)[LENGTH], ReductionOp reduction_op, PrefixT prefix)
{
return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>());
Expand Down
4 changes: 2 additions & 2 deletions cub/test/catch2_test_device_reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f
auto reduction_op = unwrap_op(reference_extended_fp(d_in_it), op_t{});

// Prepare verification data
using accum_t = cub::detail::accumulator_t<op_t, output_t, item_t>;
using accum_t = ::cuda::std::__accumulator_t<op_t, item_t, output_t>;
output_t expected_result =
static_cast<output_t>(compute_single_problem_reference(in_items, reduction_op, accum_t{}));

Expand All @@ -152,7 +152,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f
SECTION("sum")
{
using op_t = cub::Sum;
using accum_t = cub::detail::accumulator_t<op_t, output_t, item_t>;
using accum_t = ::cuda::std::__accumulator_t<op_t, item_t, output_t>;

// Prepare verification data
output_t expected_result = static_cast<output_t>(compute_single_problem_reference(in_items, op_t{}, accum_t{}));
Expand Down
2 changes: 1 addition & 1 deletion cub/test/catch2_test_device_reduce_by_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ CUB_TEST("Device reduce-by-key works", "[by_key][reduce][device]", full_type_lis
auto reduction_op = unwrap_op(reference_extended_fp(d_values_it), op_t{});

// Prepare verification data
using accum_t = cub::detail::accumulator_t<op_t, output_t, value_t>;
using accum_t = ::cuda::std::__accumulator_t<op_t, value_t, output_t>;
c2h::host_vector<output_t> expected_result(num_segments);
compute_segmented_problem_reference(in_values, segment_offsets, reduction_op, accum_t{}, expected_result.begin());
c2h::host_vector<key_t> expected_keys = compute_unique_keys_reference(segment_keys);
Expand Down
2 changes: 1 addition & 1 deletion cub/test/catch2_test_device_reduce_by_key_iterators.cu
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ CUB_TEST("Device reduce-by-key works with iterators", "[by_key][reduce][device]"
using op_t = cub::Sum;

// Prepare verification data
using accum_t = cub::detail::accumulator_t<op_t, output_t, value_t>;
using accum_t = ::cuda::std::__accumulator_t<op_t, value_t, output_t>;
c2h::host_vector<output_t> expected_result(num_segments);
compute_segmented_problem_reference(value_it, segment_offsets, op_t{}, accum_t{}, expected_result.begin());
c2h::host_vector<key_t> expected_keys = compute_unique_keys_reference(segment_keys);
Expand Down
2 changes: 1 addition & 1 deletion cub/test/catch2_test_device_reduce_iterators.cu
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ CUB_TEST("Device reduce works with fancy input iterators", "[reduce][device]", i
auto reduction_op = op_t{};

// Prepare verification data
using accum_t = cub::detail::accumulator_t<op_t, init_t, item_t>;
using accum_t = ::cuda::std::__accumulator_t<op_t, item_t, init_t>;
output_t expected_result = compute_single_problem_reference(in_it, in_it + num_items, reduction_op, accum_t{});

// Run test
Expand Down
12 changes: 6 additions & 6 deletions cub/test/catch2_test_device_scan.cu
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_
SECTION("inclusive sum")
{
using op_t = cub::Sum;
using accum_t = cub::detail::accumulator_t<op_t, input_t, input_t>;
using accum_t = ::cuda::std::__accumulator_t<op_t, input_t, input_t>;

// Prepare verification data
c2h::host_vector<input_t> host_items(in_items);
Expand Down Expand Up @@ -155,7 +155,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_
SECTION("exclusive sum")
{
using op_t = cub::Sum;
using accum_t = cub::detail::accumulator_t<op_t, input_t, input_t>;
using accum_t = ::cuda::std::__accumulator_t<op_t, input_t, input_t>;

// Prepare verification data
c2h::host_vector<input_t> host_items(in_items);
Expand Down Expand Up @@ -184,7 +184,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_
SECTION("inclusive scan")
{
using op_t = cub::Min;
using accum_t = cub::detail::accumulator_t<op_t, input_t, input_t>;
using accum_t = ::cuda::std::__accumulator_t<op_t, input_t, input_t>;

// Prepare verification data
c2h::host_vector<input_t> host_items(in_items);
Expand Down Expand Up @@ -213,7 +213,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_
SECTION("inclusive scan with init value")
{
using op_t = cub::Sum;
using accum_t = cub::detail::accumulator_t<op_t, input_t, input_t>;
using accum_t = ::cuda::std::__accumulator_t<op_t, input_t, input_t>;

// Scan operator
auto scan_op = unwrap_op(reference_extended_fp(d_in_it), op_t{});
Expand Down Expand Up @@ -248,7 +248,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_
SECTION("exclusive scan")
{
using op_t = cub::Sum;
using accum_t = cub::detail::accumulator_t<op_t, input_t, input_t>;
using accum_t = ::cuda::std::__accumulator_t<op_t, input_t, input_t>;

// Scan operator
auto scan_op = unwrap_op(reference_extended_fp(d_in_it), op_t{});
Expand Down Expand Up @@ -281,7 +281,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_
SECTION("exclusive scan with future-init value")
{
using op_t = cub::Sum;
using accum_t = cub::detail::accumulator_t<op_t, input_t, input_t>;
using accum_t = ::cuda::std::__accumulator_t<op_t, input_t, input_t>;

// Scan operator
auto scan_op = unwrap_op(reference_extended_fp(d_in_it), op_t{});
Expand Down
8 changes: 4 additions & 4 deletions cub/test/catch2_test_device_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ template <typename InputIt, typename OutputIt, typename InitT, typename BinaryOp
void compute_exclusive_scan_reference(InputIt first, InputIt last, OutputIt result, InitT init, BinaryOp op)
{
using value_t = cub::detail::value_t<InputIt>;
using accum_t = cub::detail::accumulator_t<BinaryOp, InitT, value_t>;
using accum_t = ::cuda::std::__accumulator_t<BinaryOp, value_t, InitT>;
using output_t = cub::detail::value_t<OutputIt>;
accum_t acc = static_cast<accum_t>(init);
for (; first != last; ++first)
Expand All @@ -75,7 +75,7 @@ template <typename InputIt, typename OutputIt, typename BinaryOp, typename InitT
void compute_inclusive_scan_reference(InputIt first, InputIt last, OutputIt result, BinaryOp op, InitT init)
{
using value_t = cub::detail::value_t<InputIt>;
using accum_t = cub::detail::accumulator_t<BinaryOp, InitT, value_t>;
using accum_t = ::cuda::std::__accumulator_t<BinaryOp, value_t, InitT>;
using output_t = cub::detail::value_t<OutputIt>;
accum_t acc = static_cast<accum_t>(init);
for (; first != last; ++first)
Expand All @@ -101,7 +101,7 @@ void compute_exclusive_scan_by_key_reference(
std::size_t num_items)
{
using value_t = cub::detail::value_t<ValueInItT>;
using accum_t = cub::detail::accumulator_t<ScanOpT, InitT, value_t>;
using accum_t = ::cuda::std::__accumulator_t<ScanOpT, value_t, InitT>;
using output_t = cub::detail::value_t<ValuesOutItT>;

if (num_items > 0)
Expand Down Expand Up @@ -152,7 +152,7 @@ void compute_inclusive_scan_by_key_reference(
std::size_t num_items)
{
using value_t = cub::detail::value_t<ValueInItT>;
using accum_t = cub::detail::accumulator_t<ScanOpT, value_t, value_t>;
using accum_t = ::cuda::std::__accumulator_t<ScanOpT, value_t, value_t>;
using output_t = cub::detail::value_t<ValuesOutItT>;

for (std::size_t i = 0; i < num_items;)
Expand Down
Loading
Loading