From e311e891526e0148e6737a40c1ea54215eba6e49 Mon Sep 17 00:00:00 2001 From: Giannis Gonidelis Date: Wed, 28 Aug 2024 00:36:14 -0700 Subject: [PATCH] Add thrust::inclusive_scan with init_value support (#1940) * Add thrust::inclusive_scan with init value sequential * Add thrust::inclusive_scan cuda par with init value * Add thrust::async::incluisve_scan with init value * Add thrust::inclusive_scan tbb with init value * Handle reviews * Consolidate init overloads into a single overload that accepts both init and binary_op * Fix formatting issues * Add cuda::std::accumulator_t and use it for value_type in scan algorithms * Redo Bernhard's work and consolidate the two tbb::inclusive_scan bodies * Handle final reviews * Replace cub::accumulator_t with cuda::std::__accumulator_t --- cub/benchmarks/bench/scan/exclusive/base.cuh | 4 +- cub/benchmarks/bench/scan/exclusive/by_key.cu | 2 +- cub/cub/detail/type_traits.cuh | 4 - cub/cub/device/device_run_length_encode.cuh | 4 +- cub/cub/device/device_scan.cuh | 4 +- cub/cub/device/dispatch/dispatch_reduce.cuh | 25 ++--- .../dispatch/dispatch_reduce_by_key.cuh | 38 +++---- cub/cub/device/dispatch/dispatch_scan.cuh | 10 +- .../device/dispatch/dispatch_scan_by_key.cuh | 6 +- cub/cub/thread/thread_reduce.cuh | 6 +- cub/test/catch2_test_device_reduce.cu | 4 +- cub/test/catch2_test_device_reduce_by_key.cu | 2 +- ...ch2_test_device_reduce_by_key_iterators.cu | 2 +- .../catch2_test_device_reduce_iterators.cu | 2 +- cub/test/catch2_test_device_scan.cu | 12 +- cub/test/catch2_test_device_scan.cuh | 8 +- cub/test/catch2_test_device_scan_iterators.cu | 10 +- .../catch2_test_device_segmented_reduce.cu | 4 +- ..._test_device_segmented_reduce_iterators.cu | 2 +- .../include/cuda/std/__functional/invoke.h | 4 + thrust/testing/async/inclusive_scan/mixin.h | 18 +++ thrust/testing/async/inclusive_scan/simple.cu | 28 +++++ thrust/testing/scan.cu | 14 +++ thrust/thrust/async/scan.h | 38 +++++++ thrust/thrust/detail/scan.inl | 30 +++++ thrust/thrust/scan.h | 103 +++++++++++++++++ .../system/cuda/detail/async/inclusive_scan.h | 104 +++++++++++++++++ thrust/thrust/system/cuda/detail/scan.h | 105 +++++++++++++++++- thrust/thrust/system/detail/sequential/scan.h | 42 +++++++ thrust/thrust/system/tbb/detail/scan.h | 4 + thrust/thrust/system/tbb/detail/scan.inl | 44 +++++++- 31 files changed, 603 insertions(+), 80 deletions(-) diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh index 42897931679..e3cd7a7be8e 100644 --- a/cub/benchmarks/bench/scan/exclusive/base.cuh +++ b/cub/benchmarks/bench/scan/exclusive/base.cuh @@ -27,6 +27,8 @@ #include +#include + #include #if !TUNE_BASE @@ -85,7 +87,7 @@ template static void basic(nvbench::state& state, nvbench::type_list) { using init_t = cub::detail::InputValue; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; using input_it_t = const T*; using output_it_t = T*; using offset_t = OffsetT; diff --git a/cub/benchmarks/bench/scan/exclusive/by_key.cu b/cub/benchmarks/bench/scan/exclusive/by_key.cu index 3830ad7764a..26676d66c22 100644 --- a/cub/benchmarks/bench/scan/exclusive/by_key.cu +++ b/cub/benchmarks/bench/scan/exclusive/by_key.cu @@ -77,7 +77,7 @@ static void scan(nvbench::state& state, nvbench::type_list; + using accum_t = ::cuda::std::__accumulator_t; using key_input_it_t = const KeyT*; using val_input_it_t = const ValueT*; using val_output_it_t = ValueT*; diff --git a/cub/cub/detail/type_traits.cuh b/cub/cub/detail/type_traits.cuh index 10d40cacd16..ed505bb1fc0 100644 --- a/cub/cub/detail/type_traits.cuh +++ b/cub/cub/detail/type_traits.cuh @@ -62,9 +62,5 @@ using invoke_result_t = ::cuda::std::invoke_result_t; #endif -/// The type of intermediate accumulator (according to P2322R6) -template -using accumulator_t = typename ::cuda::std::decay>::type; - } // namespace detail CUB_NAMESPACE_END diff --git a/cub/cub/device/device_run_length_encode.cuh b/cub/cub/device/device_run_length_encode.cuh index 9020f4fe055..120562a4611 100644 --- a/cub/cub/device/device_run_length_encode.cuh +++ b/cub/cub/device/device_run_length_encode.cuh @@ -34,6 +34,8 @@ #include +#include + #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -200,7 +202,7 @@ struct DeviceRunLengthEncode // Generator type for providing 1s values for run-length reduction using lengths_input_iterator_t = ConstantInputIterator; - using accum_t = detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; using key_t = cub::detail::non_void_value_t>; diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index c9d93c935b5..27882e9ceed 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -49,6 +49,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN //! @rst @@ -1303,7 +1305,7 @@ struct DeviceScan // Unsigned integer type for global offsets using OffsetT = detail::choose_offset_t; - using AccumT = cub::detail::accumulator_t>; + using AccumT = ::cuda::std::__accumulator_t, InitValueT>; constexpr bool ForceInclusive = true; return DispatchScan< diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index e3e3844a3f4..ba5365c6181 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -316,7 +316,7 @@ template >, - typename AccumT = detail::accumulator_t>, + typename AccumT = ::cuda::std::__accumulator_t, InitT>, typename SelectedPolicy = DeviceReducePolicy, typename TransformOpT = ::cuda::std::__identity> struct DispatchReduce : SelectedPolicy @@ -797,17 +797,16 @@ struct DispatchReduce : SelectedPolicy * @tparam InitT * Initial value type */ -template >>, - typename SelectedPolicyT = DeviceReducePolicy> +template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetT, + typename ReductionOpT, + typename TransformOpT, + typename InitT, + typename AccumT = ::cuda::std:: + __accumulator_t>, InitT>, + typename SelectedPolicyT = DeviceReducePolicy> using DispatchTransformReduce = DispatchReduce; @@ -850,7 +849,7 @@ template >, - typename AccumT = detail::accumulator_t>, + typename AccumT = ::cuda::std::__accumulator_t, InitT>, typename SelectedPolicy = DeviceReducePolicy> struct DispatchSegmentedReduce : SelectedPolicy { diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index 07dd492a53b..8ae232e8d1e 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -213,25 +213,25 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_TH * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ -template < - typename KeysInputIteratorT, - typename UniqueOutputIteratorT, - typename ValuesInputIteratorT, - typename AggregatesOutputIteratorT, - typename NumRunsOutputIteratorT, - typename EqualityOpT, - typename ReductionOpT, - typename OffsetT, - typename AccumT = // - detail:: - accumulator_t, cub::detail::value_t>, - typename SelectedPolicy = // - detail::device_reduce_by_key_policy_hub< // - ReductionOpT, // - AccumT, // - cub::detail::non_void_value_t< // - UniqueOutputIteratorT, // - cub::detail::value_t>>> +template , + cub::detail::value_t>, + typename SelectedPolicy = // + detail::device_reduce_by_key_policy_hub< // + ReductionOpT, // + AccumT, // + cub::detail::non_void_value_t< // + UniqueOutputIteratorT, // + cub::detail::value_t>>> struct DispatchReduceByKey { //------------------------------------------------------------------------- diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 56c2be96112..7d2fc4ac17d 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -234,11 +234,11 @@ template ::value, - cub::detail::value_t, - typename InitValueT::value_type>, - cub::detail::value_t>, + typename AccumT = ::cuda::std::__accumulator_t, + ::cuda::std::_If::value, + cub::detail::value_t, + typename InitValueT::value_type>>, typename SelectedPolicy = DeviceScanPolicy, bool ForceInclusive = false> struct DispatchScan : SelectedPolicy diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh index 032554773a7..5dfffa5e77e 100644 --- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -228,10 +228,10 @@ template < typename ScanOpT, typename InitValueT, typename OffsetT, - typename AccumT = detail::accumulator_t< + typename AccumT = ::cuda::std::__accumulator_t< ScanOpT, - ::cuda::std::_If::value, cub::detail::value_t, InitValueT>, - cub::detail::value_t>, + cub::detail::value_t, + ::cuda::std::_If::value, cub::detail::value_t, InitValueT>>, typename SelectedPolicy = DeviceScanByKeyPolicy, ScanOpT>> struct DispatchScanByKey : SelectedPolicy diff --git a/cub/cub/thread/thread_reduce.cuh b/cub/cub/thread/thread_reduce.cuh index db7e2427791..7ac98369254 100644 --- a/cub/cub/thread/thread_reduce.cuh +++ b/cub/cub/thread/thread_reduce.cuh @@ -68,7 +68,7 @@ template > + typename AccumT = ::cuda::std::__accumulator_t> _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix, Int2Type /*length*/) { @@ -110,7 +110,7 @@ template > + typename AccumT = ::cuda::std::__accumulator_t> _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix) { return ThreadReduce(input, reduction_op, prefix, Int2Type()); @@ -170,7 +170,7 @@ template > + typename AccumT = ::cuda::std::__accumulator_t> _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T (&input)[LENGTH], ReductionOp reduction_op, PrefixT prefix) { return ThreadReduce(input, reduction_op, prefix, Int2Type()); diff --git a/cub/test/catch2_test_device_reduce.cu b/cub/test/catch2_test_device_reduce.cu index 1e9e08c9113..290e8d8f6a9 100644 --- a/cub/test/catch2_test_device_reduce.cu +++ b/cub/test/catch2_test_device_reduce.cu @@ -132,7 +132,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f auto reduction_op = unwrap_op(reference_extended_fp(d_in_it), op_t{}); // Prepare verification data - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; output_t expected_result = static_cast(compute_single_problem_reference(in_items, reduction_op, accum_t{})); @@ -152,7 +152,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f SECTION("sum") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data output_t expected_result = static_cast(compute_single_problem_reference(in_items, op_t{}, accum_t{})); diff --git a/cub/test/catch2_test_device_reduce_by_key.cu b/cub/test/catch2_test_device_reduce_by_key.cu index 39f31d5e781..88c305fd360 100644 --- a/cub/test/catch2_test_device_reduce_by_key.cu +++ b/cub/test/catch2_test_device_reduce_by_key.cu @@ -116,7 +116,7 @@ CUB_TEST("Device reduce-by-key works", "[by_key][reduce][device]", full_type_lis auto reduction_op = unwrap_op(reference_extended_fp(d_values_it), op_t{}); // Prepare verification data - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; c2h::host_vector expected_result(num_segments); compute_segmented_problem_reference(in_values, segment_offsets, reduction_op, accum_t{}, expected_result.begin()); c2h::host_vector expected_keys = compute_unique_keys_reference(segment_keys); diff --git a/cub/test/catch2_test_device_reduce_by_key_iterators.cu b/cub/test/catch2_test_device_reduce_by_key_iterators.cu index 3637813b5f6..14b7fcde9fd 100644 --- a/cub/test/catch2_test_device_reduce_by_key_iterators.cu +++ b/cub/test/catch2_test_device_reduce_by_key_iterators.cu @@ -90,7 +90,7 @@ CUB_TEST("Device reduce-by-key works with iterators", "[by_key][reduce][device]" using op_t = cub::Sum; // Prepare verification data - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; c2h::host_vector expected_result(num_segments); compute_segmented_problem_reference(value_it, segment_offsets, op_t{}, accum_t{}, expected_result.begin()); c2h::host_vector expected_keys = compute_unique_keys_reference(segment_keys); diff --git a/cub/test/catch2_test_device_reduce_iterators.cu b/cub/test/catch2_test_device_reduce_iterators.cu index 7c7f74ec63a..ab1dabbbb16 100644 --- a/cub/test/catch2_test_device_reduce_iterators.cu +++ b/cub/test/catch2_test_device_reduce_iterators.cu @@ -104,7 +104,7 @@ CUB_TEST("Device reduce works with fancy input iterators", "[reduce][device]", i auto reduction_op = op_t{}; // Prepare verification data - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; output_t expected_result = compute_single_problem_reference(in_it, in_it + num_items, reduction_op, accum_t{}); // Run test diff --git a/cub/test/catch2_test_device_scan.cu b/cub/test/catch2_test_device_scan.cu index 49c9aac39c7..736e217b0ea 100644 --- a/cub/test/catch2_test_device_scan.cu +++ b/cub/test/catch2_test_device_scan.cu @@ -127,7 +127,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_ SECTION("inclusive sum") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector host_items(in_items); @@ -155,7 +155,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_ SECTION("exclusive sum") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector host_items(in_items); @@ -184,7 +184,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_ SECTION("inclusive scan") { using op_t = cub::Min; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector host_items(in_items); @@ -213,7 +213,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_ SECTION("inclusive scan with init value") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Scan operator auto scan_op = unwrap_op(reference_extended_fp(d_in_it), op_t{}); @@ -248,7 +248,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_ SECTION("exclusive scan") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Scan operator auto scan_op = unwrap_op(reference_extended_fp(d_in_it), op_t{}); @@ -281,7 +281,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_ SECTION("exclusive scan with future-init value") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Scan operator auto scan_op = unwrap_op(reference_extended_fp(d_in_it), op_t{}); diff --git a/cub/test/catch2_test_device_scan.cuh b/cub/test/catch2_test_device_scan.cuh index d3644e3387a..dc5b7804e8d 100644 --- a/cub/test/catch2_test_device_scan.cuh +++ b/cub/test/catch2_test_device_scan.cuh @@ -61,7 +61,7 @@ template ; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; using output_t = cub::detail::value_t; accum_t acc = static_cast(init); for (; first != last; ++first) @@ -75,7 +75,7 @@ template ; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; using output_t = cub::detail::value_t; accum_t acc = static_cast(init); for (; first != last; ++first) @@ -101,7 +101,7 @@ void compute_exclusive_scan_by_key_reference( std::size_t num_items) { using value_t = cub::detail::value_t; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; using output_t = cub::detail::value_t; if (num_items > 0) @@ -152,7 +152,7 @@ void compute_inclusive_scan_by_key_reference( std::size_t num_items) { using value_t = cub::detail::value_t; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; using output_t = cub::detail::value_t; for (std::size_t i = 0; i < num_items;) diff --git a/cub/test/catch2_test_device_scan_iterators.cu b/cub/test/catch2_test_device_scan_iterators.cu index 576d0d3f747..a07397cc367 100644 --- a/cub/test/catch2_test_device_scan_iterators.cu +++ b/cub/test/catch2_test_device_scan_iterators.cu @@ -84,7 +84,7 @@ CUB_TEST("Device scan works with iterators", "[scan][device]", iterator_type_lis SECTION("inclusive sum") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector expected_result(num_items); @@ -102,7 +102,7 @@ CUB_TEST("Device scan works with iterators", "[scan][device]", iterator_type_lis SECTION("exclusive sum") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector expected_result(num_items); @@ -120,7 +120,7 @@ CUB_TEST("Device scan works with iterators", "[scan][device]", iterator_type_lis SECTION("inclusive scan") { using op_t = cub::Min; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector expected_result(num_items); @@ -139,7 +139,7 @@ CUB_TEST("Device scan works with iterators", "[scan][device]", iterator_type_lis SECTION("exclusive scan") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector expected_result(num_items); @@ -157,7 +157,7 @@ CUB_TEST("Device scan works with iterators", "[scan][device]", iterator_type_lis SECTION("exclusive scan with future-init value") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data accum_t init_value{}; diff --git a/cub/test/catch2_test_device_segmented_reduce.cu b/cub/test/catch2_test_device_segmented_reduce.cu index 770b85b0194..5559e7e2e81 100644 --- a/cub/test/catch2_test_device_segmented_reduce.cu +++ b/cub/test/catch2_test_device_segmented_reduce.cu @@ -121,7 +121,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[segmented][reduce][ auto reduction_op = unwrap_op(reference_extended_fp(d_in_it), op_t{}); // Prepare verification data - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; c2h::host_vector expected_result(num_segments); compute_segmented_problem_reference(in_items, segment_offsets, reduction_op, accum_t{}, expected_result.begin()); @@ -142,7 +142,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[segmented][reduce][ SECTION("sum") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector expected_result(num_segments); diff --git a/cub/test/catch2_test_device_segmented_reduce_iterators.cu b/cub/test/catch2_test_device_segmented_reduce_iterators.cu index 8ab495ddc59..a81559b91ed 100644 --- a/cub/test/catch2_test_device_segmented_reduce_iterators.cu +++ b/cub/test/catch2_test_device_segmented_reduce_iterators.cu @@ -93,7 +93,7 @@ CUB_TEST("Device segmented reduce works with fancy input iterators", "[reduce][d auto reduction_op = op_t{}; // Prepare verification data - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; c2h::host_vector expected_result(num_segments); compute_segmented_problem_reference(in_it, segment_offsets, reduction_op, accum_t{}, expected_result.begin()); diff --git a/libcudacxx/include/cuda/std/__functional/invoke.h b/libcudacxx/include/cuda/std/__functional/invoke.h index 0e132158f50..ee864f7a342 100644 --- a/libcudacxx/include/cuda/std/__functional/invoke.h +++ b/libcudacxx/include/cuda/std/__functional/invoke.h @@ -541,6 +541,10 @@ invoke(_Fn&& __f, _Args&&... __args) noexcept(is_nothrow_invocable_v<_Fn, _Args. #endif // _CCCL_STD_VER > 2011 +/// The type of intermediate accumulator (according to P2322R6) +template +using __accumulator_t = typename decay::type>::type; + _LIBCUDACXX_END_NAMESPACE_STD #endif // _LIBCUDACXX___FUNCTIONAL_INVOKE_H diff --git a/thrust/testing/async/inclusive_scan/mixin.h b/thrust/testing/async/inclusive_scan/mixin.h index b0490d573c7..f3538f5eb94 100644 --- a/thrust/testing/async/inclusive_scan/mixin.h +++ b/thrust/testing/async/inclusive_scan/mixin.h @@ -19,6 +19,24 @@ namespace inclusive_scan namespace mixin { +namespace postfix_args_init +{ + +template > +struct all_overloads +{ + using postfix_args_type = std::tuple< // List any extra arg overloads: + std::tuple // - initial_value with binary_op + >; + + static postfix_args_type generate_postfix_args() + { + return postfix_args_type{std::make_tuple(ValueType{42}, AltBinaryOp{})}; + } +}; + +} // namespace postfix_args_init + //------------------------------------------------------------------------------ namespace postfix_args { diff --git a/thrust/testing/async/inclusive_scan/simple.cu b/thrust/testing/async/inclusive_scan/simple.cu index fd93120d288..4f7ffcd90bc 100644 --- a/thrust/testing/async/inclusive_scan/simple.cu +++ b/thrust/testing/async/inclusive_scan/simple.cu @@ -5,6 +5,34 @@ # include # include +template > +struct simple_init_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , testing::async::inclusive_scan::mixin::postfix_args_init::all_overloads + , testing::async::inclusive_scan::mixin::invoke_reference::host_synchronous + , testing::async::inclusive_scan::mixin::invoke_async::simple + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() + { + return "simple invocation with device vectors and initial value"; + } +}; + +template +struct test_simple_init +{ + void operator()(std::size_t num_values) const + { + testing::async::test_policy_overloads>::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_simple_init, NumericTypes); + template > diff --git a/thrust/testing/scan.cu b/thrust/testing/scan.cu index 42e60b95471..e657d637a57 100644 --- a/thrust/testing/scan.cu +++ b/thrust/testing/scan.cu @@ -74,6 +74,13 @@ void TestScanSimple() ASSERT_EQUAL(input, input_copy); ASSERT_EQUAL(output, result); + // inclusive scan with init and op + iter = thrust::inclusive_scan(input.begin(), input.end(), output.begin(), T(-1), thrust::multiplies()); + result = {-1, -3, 6, 24, -120}; + ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size()); + ASSERT_EQUAL(input, input_copy); + ASSERT_EQUAL(output, result); + // exclusive scan with init and op iter = thrust::exclusive_scan(input.begin(), input.end(), output.begin(), T(3), thrust::plus()); result = {3, 4, 7, 5, 9}; @@ -88,6 +95,13 @@ void TestScanSimple() ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size()); ASSERT_EQUAL(input, result); + // inplace inclusive scan with init and op + input = input_copy; + iter = thrust::inclusive_scan(input.begin(), input.end(), input.begin(), T(3), thrust::plus()); + result = {4, 7, 5, 9, 4}; + ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size()); + ASSERT_EQUAL(input, result); + // inplace exclusive scan with init input = input_copy; iter = thrust::exclusive_scan(input.begin(), input.end(), input.begin(), T(3)); diff --git a/thrust/thrust/async/scan.h b/thrust/thrust/async/scan.h index ca66455b43b..fcbb41ad43c 100644 --- a/thrust/thrust/async/scan.h +++ b/thrust/thrust/async/scan.h @@ -112,6 +112,27 @@ struct inclusive_scan_fn final THRUST_FWD(out), thrust::plus<>{})) + template + auto operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + InitialValueType&& init, + BinaryOp&& op) const + // ADL dispatch. + THRUST_RETURNS(async_inclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(init), + THRUST_FWD(op))) + template {})) + + template >>> + auto operator()(ForwardIt&& first, Sentinel&& last, OutputIt&& out, InitialValueType&& init, BinaryOp&& op) const + // ADL dispatch. + THRUST_RETURNS(async_inclusive_scan( + thrust::detail::select_system(iterator_system_t>{}, + iterator_system_t>{}), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(init), + THRUST_FWD(op))) }; } // namespace inclusive_scan_detail diff --git a/thrust/thrust/detail/scan.inl b/thrust/thrust/detail/scan.inl index bec8924baec..fc046e8b77c 100644 --- a/thrust/thrust/detail/scan.inl +++ b/thrust/thrust/detail/scan.inl @@ -60,6 +60,21 @@ _CCCL_HOST_DEVICE OutputIterator inclusive_scan( return inclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, binary_op); } // end inclusive_scan() +_CCCL_EXEC_CHECK_DISABLE +template +_CCCL_HOST_DEVICE OutputIterator inclusive_scan( + const thrust::detail::execution_policy_base& exec, + InputIterator first, + InputIterator last, + OutputIterator result, + T init, + AssociativeOperator binary_op) +{ + using thrust::system::detail::generic::inclusive_scan; + return inclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, init, binary_op); +} // end inclusive_scan() + _CCCL_EXEC_CHECK_DISABLE template _CCCL_HOST_DEVICE OutputIterator exclusive_scan( @@ -268,6 +283,21 @@ OutputIterator inclusive_scan(InputIterator first, InputIterator last, OutputIte return thrust::inclusive_scan(select_system(system1, system2), first, last, result, binary_op); } // end inclusive_scan() +template +OutputIterator +inclusive_scan(InputIterator first, InputIterator last, OutputIterator result, T init, BinaryFunction binary_op) +{ + using thrust::system::detail::generic::select_system; + + using System1 = typename thrust::iterator_system::type; + using System2 = typename thrust::iterator_system::type; + + System1 system1; + System2 system2; + + return thrust::inclusive_scan(select_system(system1, system2), first, last, result, init, binary_op); +} // end inclusive_scan() + template OutputIterator exclusive_scan(InputIterator first, InputIterator last, OutputIterator result) { diff --git a/thrust/thrust/scan.h b/thrust/thrust/scan.h index 44a32ebaec8..e54265ecff8 100644 --- a/thrust/thrust/scan.h +++ b/thrust/thrust/scan.h @@ -263,6 +263,109 @@ template binary_op(init, \*first) is assigned to *result + * and so on. This version of \p inclusive_scan requires both an associative + * operator and an initial value \p init. When the input and + * output sequences are the same, the scan is performed in-place. + * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param result The beginning of the output sequence. + * \param init The initial value. + * \param binary_op The associatve operator used to 'sum' values. + * \return The end of the output sequence. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input + * Iterator and \c InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type. \tparam + * OutputIterator is a model of Output Iterator + * and \c OutputIterator's \c value_type is convertible to both \c AssociativeOperator's \c first_argument_type and + * \c second_argument_type. + * \tparam T is convertible to \c OutputIterator's \c value_type. + * \tparam AssociativeOperator is a model of Binary Function and \c + * AssociativeOperator's \c result_type is convertible to \c OutputIterator's \c value_type. + * + * \pre \p first may equal \p result but the range [first, last) and the range [result, result + (last - + * first)) shall not overlap otherwise. + * + * The following code snippet demonstrates how to use \p inclusive_scan with initial value to compute an in-place + * prefix sum using the \p thrust::host execution policy for parallelization: + * + * \code + * int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8}; + * thrust::inclusive_scan(thrust::host, data, data + 10, data, 1, thrust::maximum<>{}); // in-place scan + * // data is now {1, 1, 2, 2, 2, 4, 4, 4, 4, 8} + * \endcode + * + * \see https://en.cppreference.com/w/cpp/algorithm/partial_sum + */ +template +_CCCL_HOST_DEVICE OutputIterator inclusive_scan( + const thrust::detail::execution_policy_base& exec, + InputIterator first, + InputIterator last, + OutputIterator result, + T init, + AssociativeOperator binary_op); + +/*! \p inclusive_scan computes an inclusive prefix sum operation. The + * term 'inclusive' means that each result includes the corresponding + * input operand in the partial sum. More precisely, + * binary_op(init, \*first) is assigned to *result + * and so on. This version of \p inclusive_scan requires both an associative + * operator and an initial value \p init. When the input and + * output sequences are the same, the scan is performed in-place. + * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param result The beginning of the output sequence. + * \param init The initial value. + * \param binary_op The associatve operator used to 'sum' values. + * \return The end of the output sequence. + * + * \tparam InputIterator is a model of Input + * Iterator and \c InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type. \tparam + * OutputIterator is a model of Output Iterator + * and \c OutputIterator's \c value_type is convertible to both \c AssociativeOperator's \c first_argument_type and + * \c second_argument_type. + * \tparam T is convertible to \c OutputIterator's \c value_type. + * \tparam AssociativeOperator is a model of Binary Function and \c + * AssociativeOperator's \c result_type is convertible to \c OutputIterator's \c value_type. + * + * \pre \p first may equal \p result but the range [first, last) and the range [result, result + (last - + * first)) shall not overlap otherwise. + * + * The following code snippet demonstrates how to use \p inclusive_scan with initial value: + * + * \code + * int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8}; + * thrust::maximum binary_op; + * thrust::inclusive_scan(data, data + 10, data, 1, thrust::maximum<>{}); // in-place scan + * // data is now {1, 1, 2, 2, 2, 4, 4, 4, 4, 8} + * \endcode + * + * \see https://en.cppreference.com/w/cpp/algorithm/partial_sum + */ +template +OutputIterator +inclusive_scan(InputIterator first, InputIterator last, OutputIterator result, T init, AssociativeOperator binary_op); + /*! \p exclusive_scan computes an exclusive prefix sum operation. The * term 'exclusive' means that each result does not include the * corresponding input operand in the partial sum. More precisely, diff --git a/thrust/thrust/system/cuda/detail/async/inclusive_scan.h b/thrust/thrust/system/cuda/detail/async/inclusive_scan.h index dbcc358cba4..a91310dce0b 100644 --- a/thrust/thrust/system/cuda/detail/async/inclusive_scan.h +++ b/thrust/thrust/system/cuda/detail/async/inclusive_scan.h @@ -126,6 +126,93 @@ async_inclusive_scan_n(execution_policy& policy, ForwardIt first, return ev; } +template +unique_eager_event async_inclusive_scan_n( + execution_policy& policy, ForwardIt first, Size n, OutputIt out, InitialValueType init, BinaryOp op) +{ + using InputValueT = cub::detail::InputValue; + using AccumT = typename ::cuda::std:: + __accumulator_t::value_type, InitialValueType>; + constexpr bool ForceInclusive = true; + + using Dispatch32 = + cub::DispatchScan, + ForceInclusive>; + using Dispatch64 = + cub::DispatchScan, + ForceInclusive>; + + InputValueT init_value(init); + + auto const device_alloc = get_async_device_allocator(policy); + unique_eager_event ev; + + // Determine temporary device storage requirements. + cudaError_t status; + size_t tmp_size = 0; + { + THRUST_INDEX_TYPE_DISPATCH2( + status, + Dispatch32::Dispatch, + Dispatch64::Dispatch, + n, + (nullptr, tmp_size, first, out, op, init_value, n_fixed, nullptr)); + thrust::cuda_cub::throw_on_error( + status, + "after determining tmp storage " + "requirements for inclusive_scan"); + } + + // Allocate temporary storage. + auto content = uninitialized_allocate_unique_n(device_alloc, tmp_size); + void* const tmp_ptr = raw_pointer_cast(content.get()); + + // Set up stream with dependencies. + cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy); + + if (thrust::cuda_cub::default_stream() != user_raw_stream) + { + ev = make_dependent_event( + std::tuple_cat(std::make_tuple(std::move(content), unique_stream(nonowning, user_raw_stream)), + extract_dependencies(std::move(thrust::detail::derived_cast(policy))))); + } + else + { + ev = make_dependent_event(std::tuple_cat( + std::make_tuple(std::move(content)), extract_dependencies(std::move(thrust::detail::derived_cast(policy))))); + } + + // Run scan. + { + THRUST_INDEX_TYPE_DISPATCH2( + status, + Dispatch32::Dispatch, + Dispatch64::Dispatch, + n, + (tmp_ptr, tmp_size, first, out, op, init_value, n_fixed, user_raw_stream)); + thrust::cuda_cub::throw_on_error(status, "after dispatching inclusive_scan kernel"); + } + + return ev; +} + } // namespace detail } // namespace cuda } // namespace system @@ -140,6 +227,23 @@ auto async_inclusive_scan( THRUST_RETURNS(thrust::system::cuda::detail::async_inclusive_scan_n( policy, first, thrust::distance(first, THRUST_FWD(last)), THRUST_FWD(out), THRUST_FWD(op))) + // ADL entry point. + template + auto async_inclusive_scan( + execution_policy& policy, + ForwardIt first, + Sentinel&& last, + OutputIt&& out, + InitialValueType&& init, + BinaryOp&& op) + THRUST_RETURNS(thrust::system::cuda::detail::async_inclusive_scan_n( + policy, first, thrust::distance(first, THRUST_FWD(last)), THRUST_FWD(out), THRUST_FWD(init), THRUST_FWD(op))) + } // namespace cuda_cub THRUST_NAMESPACE_END diff --git a/thrust/thrust/system/cuda/detail/scan.h b/thrust/thrust/system/cuda/detail/scan.h index dab507f00f0..862eb7bf6cf 100644 --- a/thrust/thrust/system/cuda/detail/scan.h +++ b/thrust/thrust/system/cuda/detail/scan.h @@ -109,6 +109,81 @@ _CCCL_HOST_DEVICE OutputIt inclusive_scan_n_impl( return result + num_items; } +_CCCL_EXEC_CHECK_DISABLE +template +_CCCL_HOST_DEVICE OutputIt inclusive_scan_n_impl( + thrust::cuda_cub::execution_policy& policy, + InputIt first, + Size num_items, + OutputIt result, + InitValueT init, + ScanOp scan_op) +{ + using InputValueT = cub::detail::InputValue; + using AccumT = typename ::cuda::std::__accumulator_t, InitValueT>; + constexpr bool ForceInclusive = true; + + using Dispatch32 = + cub::DispatchScan, + ForceInclusive>; + using Dispatch64 = + cub::DispatchScan, + ForceInclusive>; + + cudaStream_t stream = thrust::cuda_cub::stream(policy); + cudaError_t status; + + // Negative number of items are normalized to `0` + if (thrust::detail::is_negative(num_items)) + { + num_items = 0; + } + + // Determine temporary storage requirements: + size_t tmp_size = 0; + { + THRUST_INDEX_TYPE_DISPATCH2( + status, + Dispatch32::Dispatch, + Dispatch64::Dispatch, + num_items, + (nullptr, tmp_size, first, result, scan_op, InputValueT(init), num_items_fixed, stream)); + thrust::cuda_cub::throw_on_error( + status, + "after determining tmp storage " + "requirements for inclusive_scan"); + } + + // Run scan: + { + // Allocate temporary storage: + thrust::detail::temporary_array tmp{policy, tmp_size}; + THRUST_INDEX_TYPE_DISPATCH2( + status, + Dispatch32::Dispatch, + Dispatch64::Dispatch, + num_items, + (tmp.data().get(), tmp_size, first, result, scan_op, InputValueT(init), num_items_fixed, stream)); + thrust::cuda_cub::throw_on_error(status, "after dispatching inclusive_scan kernel"); + thrust::cuda_cub::throw_on_error( + thrust::cuda_cub::synchronize_optional(policy), "inclusive_scan failed to synchronize"); + } + + return result + num_items; +} + _CCCL_EXEC_CHECK_DISABLE template _CCCL_HOST_DEVICE OutputIt exclusive_scan_n_impl( @@ -171,7 +246,21 @@ _CCCL_HOST_DEVICE OutputIt exclusive_scan_n_impl( // Thrust API entry points //------------------------- -_CCCL_EXEC_CHECK_DISABLE +template +_CCCL_HOST_DEVICE OutputIt inclusive_scan_n( + thrust::cuda_cub::execution_policy& policy, + InputIt first, + Size num_items, + OutputIt result, + T init, + ScanOp scan_op) +{ + THRUST_CDP_DISPATCH( + (result = thrust::cuda_cub::detail::inclusive_scan_n_impl(policy, first, num_items, result, init, scan_op);), + (result = thrust::inclusive_scan(cvt_to_seq(derived_cast(policy)), first, first + num_items, result, scan_op);)); + return result; +} + template _CCCL_HOST_DEVICE OutputIt inclusive_scan_n( thrust::cuda_cub::execution_policy& policy, InputIt first, Size num_items, OutputIt result, ScanOp scan_op) @@ -191,6 +280,20 @@ _CCCL_HOST_DEVICE OutputIt inclusive_scan( return thrust::cuda_cub::inclusive_scan_n(policy, first, num_items, result, scan_op); } +template +_CCCL_HOST_DEVICE OutputIt inclusive_scan( + thrust::cuda_cub::execution_policy& policy, + InputIt first, + InputIt last, + OutputIt result, + T init, + ScanOp scan_op) +{ + using diff_t = typename thrust::iterator_traits::difference_type; + diff_t const num_items = thrust::distance(first, last); + return thrust::cuda_cub::inclusive_scan_n(policy, first, num_items, result, init, scan_op); +} + template _CCCL_HOST_DEVICE OutputIt inclusive_scan(thrust::cuda_cub::execution_policy& policy, InputIt first, InputIt last, OutputIt result) diff --git a/thrust/thrust/system/detail/sequential/scan.h b/thrust/thrust/system/detail/sequential/scan.h index b76074c8295..c883dac6543 100644 --- a/thrust/thrust/system/detail/sequential/scan.h +++ b/thrust/thrust/system/detail/sequential/scan.h @@ -35,6 +35,8 @@ #include #include +#include + THRUST_NAMESPACE_BEGIN namespace system { @@ -75,6 +77,46 @@ _CCCL_HOST_DEVICE OutputIterator inclusive_scan( return result; } +_CCCL_EXEC_CHECK_DISABLE +template +_CCCL_HOST_DEVICE OutputIterator inclusive_scan( + sequential::execution_policy&, + InputIterator first, + InputIterator last, + OutputIterator result, + InitialValueType init, + BinaryFunction binary_op) +{ + using namespace thrust::detail; + + using ValueType = typename ::cuda::std:: + __accumulator_t::value_type, InitialValueType>; + + // wrap binary_op + thrust::detail::wrapped_function wrapped_binary_op{binary_op}; + + if (first != last) + { + ValueType sum = wrapped_binary_op(init, *first); + *result = sum; + ++first; + ++result; + + while (first != last) + { + *result = sum = wrapped_binary_op(sum, *first); + ++first; + ++result; + } + } + + return result; +} + _CCCL_EXEC_CHECK_DISABLE template +OutputIterator +inclusive_scan(tag, InputIterator first, InputIterator last, OutputIterator result, T init, BinaryFunction binary_op); + template OutputIterator exclusive_scan(tag, InputIterator first, InputIterator last, OutputIterator result, T init, BinaryFunction binary_op); diff --git a/thrust/thrust/system/tbb/detail/scan.inl b/thrust/thrust/system/tbb/detail/scan.inl index 5372eda9f6b..683ed226025 100644 --- a/thrust/thrust/system/tbb/detail/scan.inl +++ b/thrust/thrust/system/tbb/detail/scan.inl @@ -33,6 +33,8 @@ #include #include +#include + #include #include @@ -46,7 +48,7 @@ namespace detail namespace scan_detail { -template +template struct inclusive_body { InputIterator input; @@ -55,11 +57,11 @@ struct inclusive_body ValueType sum; bool first_call; - inclusive_body(InputIterator input, OutputIterator output, BinaryFunction binary_op, ValueType dummy) + inclusive_body(InputIterator input, OutputIterator output, BinaryFunction binary_op, ValueType init) : input(input) , output(output) , binary_op{binary_op} - , sum(dummy) + , sum(init) , first_call(true) {} @@ -105,7 +107,14 @@ struct inclusive_body if (first_call) { - *iter2 = sum = *iter1; + _CCCL_IF_CONSTEXPR (HasInit) + { + *iter2 = sum = binary_op(sum, *iter1); + } + else + { + *iter2 = sum = *iter1; + } ++iter1; ++iter2; for (Size i = r.begin() + 1; i != r.end(); ++i, ++iter1, ++iter2) @@ -239,7 +248,7 @@ inclusive_scan(tag, InputIterator first, InputIterator last, OutputIterator resu if (n != 0) { - using Body = typename scan_detail::inclusive_body; + using Body = typename scan_detail::inclusive_body; Body scan_body(first, result, binary_op, *first); ::tbb::parallel_scan(::tbb::blocked_range(0, n), scan_body); } @@ -249,6 +258,31 @@ inclusive_scan(tag, InputIterator first, InputIterator last, OutputIterator resu return result; } +template +OutputIterator inclusive_scan( + tag, InputIterator first, InputIterator last, OutputIterator result, InitialValueType init, BinaryFunction binary_op) +{ + using namespace thrust::detail; + + // Use the input iterator's value type and the initial value type per wg21.link/p2322 + using ValueType = typename ::cuda::std:: + __accumulator_t::value_type, InitialValueType>; + + using Size = typename thrust::iterator_difference::type; + Size n = thrust::distance(first, last); + + if (n != 0) + { + using Body = typename scan_detail::inclusive_body; + Body scan_body(first, result, binary_op, init); + ::tbb::parallel_scan(::tbb::blocked_range(0, n), scan_body); + } + + thrust::advance(result, n); + + return result; +} + template OutputIterator exclusive_scan( tag, InputIterator first, InputIterator last, OutputIterator result, InitialValueType init, BinaryFunction binary_op)