From 0112c166197dcad7560a93a4d1038e75ac9eecf4 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Fri, 28 Jun 2024 12:36:48 -0700 Subject: [PATCH 01/11] Add thrust::inclusive_scan with init value sequential --- thrust/testing/scan.cu | 33 ++++++++++ thrust/thrust/detail/scan.inl | 64 ++++++++++++++++++- thrust/thrust/scan.h | 47 ++++++++++---- thrust/thrust/system/detail/sequential/scan.h | 37 +++++++++++ 4 files changed, 169 insertions(+), 12 deletions(-) diff --git a/thrust/testing/scan.cu b/thrust/testing/scan.cu index 42e60b95471..c02c9001b0b 100644 --- a/thrust/testing/scan.cu +++ b/thrust/testing/scan.cu @@ -67,6 +67,17 @@ void TestScanSimple() ASSERT_EQUAL(input, input_copy); ASSERT_EQUAL(output, result); + // inclusive scan with init + iter = thrust::inclusive_scan(input.begin(), input.end(), output.begin(), T(3)); + result[0] = 4; + result[1] = 7; + result[2] = 5; + result[3] = 9; + result[4] = 4; + ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size()); + ASSERT_EQUAL(input, input_copy); + ASSERT_EQUAL(output, result); + // inclusive scan with op iter = thrust::inclusive_scan(input.begin(), input.end(), output.begin(), thrust::plus()); result = {1, 4, 2, 6, 1}; @@ -74,6 +85,17 @@ void TestScanSimple() ASSERT_EQUAL(input, input_copy); ASSERT_EQUAL(output, result); + // inclusive scan with init and op + iter = thrust::inclusive_scan(input.begin(), input.end(), output.begin(), T(3), thrust::plus()); + result[0] = 4; + result[1] = 7; + result[2] = 5; + result[3] = 9; + result[4] = 4; + ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size()); + ASSERT_EQUAL(input, input_copy); + ASSERT_EQUAL(output, result); + // exclusive scan with init and op iter = thrust::exclusive_scan(input.begin(), input.end(), output.begin(), T(3), thrust::plus()); result = {3, 4, 7, 5, 9}; @@ -88,6 +110,17 @@ void TestScanSimple() ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size()); ASSERT_EQUAL(input, result); + // inplace inclusive scan with init + input = input_copy; + iter = thrust::inclusive_scan(input.begin(), input.end(), input.begin(), T(3)); + result[0] = 4; + result[1] = 7; + result[2] = 5; + result[3] = 9; + result[4] = 4; + ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size()); + ASSERT_EQUAL(input, result); + // inplace exclusive scan with init input = input_copy; iter = thrust::exclusive_scan(input.begin(), input.end(), input.begin(), T(3)); diff --git a/thrust/thrust/detail/scan.inl b/thrust/thrust/detail/scan.inl index bec8924baec..93d0c0f5b93 100644 --- a/thrust/thrust/detail/scan.inl +++ b/thrust/thrust/detail/scan.inl @@ -33,6 +33,23 @@ #include #include +template +struct is_callable_with_input +{ +private: + using value_type = typename std::iterator_traits::value_type; + + template + static auto test(int) + -> decltype(std::declval()(std::declval(), std::declval()), std::true_type()); + + template + static auto test(...) -> std::false_type; + +public: + static constexpr bool value = decltype(test(0))::value; +}; + THRUST_NAMESPACE_BEGIN _CCCL_EXEC_CHECK_DISABLE @@ -49,15 +66,45 @@ _CCCL_HOST_DEVICE OutputIterator inclusive_scan( _CCCL_EXEC_CHECK_DISABLE template +_CCCL_HOST_DEVICE + typename std::enable_if::value, OutputIterator>::type + inclusive_scan(const thrust::detail::execution_policy_base& exec, + InputIterator first, + InputIterator last, + OutputIterator result, + AssociativeOperator binary_op) +{ + using thrust::system::detail::generic::inclusive_scan; + return inclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, binary_op); +} // end inclusive_scan() + +_CCCL_EXEC_CHECK_DISABLE +template +_CCCL_HOST_DEVICE typename std::enable_if::value, OutputIterator>::type +inclusive_scan(const thrust::detail::execution_policy_base& exec, + InputIterator first, + InputIterator last, + OutputIterator result, + T init) +{ + using thrust::system::detail::generic::inclusive_scan; + return inclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, init, thrust::plus<>()); +} // end inclusive_scan() + +_CCCL_EXEC_CHECK_DISABLE +template _CCCL_HOST_DEVICE OutputIterator inclusive_scan( const thrust::detail::execution_policy_base& exec, InputIterator first, InputIterator last, OutputIterator result, + T init, AssociativeOperator binary_op) { using thrust::system::detail::generic::inclusive_scan; - return inclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, binary_op); + return inclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, init, binary_op); } // end inclusive_scan() _CCCL_EXEC_CHECK_DISABLE @@ -268,6 +315,21 @@ OutputIterator inclusive_scan(InputIterator first, InputIterator last, OutputIte return thrust::inclusive_scan(select_system(system1, system2), first, last, result, binary_op); } // end inclusive_scan() +template +OutputIterator +inclusive_scan(InputIterator first, InputIterator last, OutputIterator result, T init, BinaryFunction binary_op) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::inclusive_scan(select_system(system1, system2), first, last, result, init, binary_op); +} // end inclusive_scan() + template OutputIterator exclusive_scan(InputIterator first, InputIterator last, OutputIterator result) { diff --git a/thrust/thrust/scan.h b/thrust/thrust/scan.h index 44a32ebaec8..0a50b6e996d 100644 --- a/thrust/thrust/scan.h +++ b/thrust/thrust/scan.h @@ -204,13 +204,40 @@ OutputIterator inclusive_scan(InputIterator first, InputIterator last, OutputIte * * \see https://en.cppreference.com/w/cpp/algorithm/partial_sum */ + +template +struct is_callable_with_input +{ +private: + using value_type = typename std::iterator_traits::value_type; + + template + static auto test(int) + -> decltype(std::declval()(std::declval(), std::declval()), std::true_type()); + + template + static auto test(...) -> std::false_type; + +public: + static constexpr bool value = decltype(test(0))::value; +}; + template -_CCCL_HOST_DEVICE OutputIterator inclusive_scan( - const thrust::detail::execution_policy_base& exec, - InputIterator first, - InputIterator last, - OutputIterator result, - AssociativeOperator binary_op); +_CCCL_HOST_DEVICE + typename std::enable_if::value, OutputIterator>::type + inclusive_scan(const thrust::detail::execution_policy_base& exec, + InputIterator first, + InputIterator last, + OutputIterator result, + AssociativeOperator binary_op); + +template +_CCCL_HOST_DEVICE typename std::enable_if::value, OutputIterator>::type +inclusive_scan(const thrust::detail::execution_policy_base& exec, + InputIterator first, + InputIterator last, + OutputIterator result, + T init); /*! \p inclusive_scan computes an inclusive prefix sum operation. The * term 'inclusive' means that each result includes the corresponding @@ -234,11 +261,9 @@ _CCCL_HOST_DEVICE OutputIterator inclusive_scan( * * \tparam InputIterator is a model of Input * Iterator and \c InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type. \tparam - * OutputIterator is a model of Output Iterator - * and \c OutputIterator's \c value_type is convertible to - * both \c AssociativeOperator's \c first_argument_type and - * \c second_argument_type. - * \tparam AssociativeOperator is a model of Output + * Iterator and \c OutputIterator's \c value_type is convertible to both \c AssociativeOperator's \c + * first_argument_type and \c second_argument_type. \tparam AssociativeOperator is a model of Binary Function and \c * AssociativeOperator's \c result_type is convertible to \c OutputIterator's \c value_type. * diff --git a/thrust/thrust/system/detail/sequential/scan.h b/thrust/thrust/system/detail/sequential/scan.h index b76074c8295..6376710392d 100644 --- a/thrust/thrust/system/detail/sequential/scan.h +++ b/thrust/thrust/system/detail/sequential/scan.h @@ -75,6 +75,43 @@ _CCCL_HOST_DEVICE OutputIterator inclusive_scan( return result; } +_CCCL_EXEC_CHECK_DISABLE +template +_CCCL_HOST_DEVICE OutputIterator inclusive_scan( + sequential::execution_policy&, + InputIterator first, + InputIterator last, + OutputIterator result, + InitialValueType init, + BinaryFunction binary_op) +{ + using namespace thrust::detail; + + // Use the input iterator's value type per https://wg21.link/P0571 + using ValueType = InitialValueType; + + // wrap binary_op + thrust::detail::wrapped_function wrapped_binary_op(binary_op); + + if (first != last) + { + ValueType sum = wrapped_binary_op(*first, init); + + *result = sum; + + for (++first, ++result; first != last; ++first, ++result) + { + *result = sum = wrapped_binary_op(sum, *first); + } + } + + return result; +} + _CCCL_EXEC_CHECK_DISABLE template Date: Fri, 28 Jun 2024 12:37:33 -0700 Subject: [PATCH 02/11] Add thrust::inclusive_scan cuda par with init value --- .../system/cuda/detail/async/inclusive_scan.h | 103 ++++++++++++++++++ thrust/thrust/system/cuda/detail/scan.h | 99 +++++++++++++++++ 2 files changed, 202 insertions(+) diff --git a/thrust/thrust/system/cuda/detail/async/inclusive_scan.h b/thrust/thrust/system/cuda/detail/async/inclusive_scan.h index dbcc358cba4..79bdc060e57 100644 --- a/thrust/thrust/system/cuda/detail/async/inclusive_scan.h +++ b/thrust/thrust/system/cuda/detail/async/inclusive_scan.h @@ -126,6 +126,92 @@ async_inclusive_scan_n(execution_policy& policy, ForwardIt first, return ev; } +template +unique_eager_event async_inclusive_scan_n( + execution_policy& policy, ForwardIt first, Size n, OutputIt out, InitialValueType init, BinaryOp op) +{ + using InputValueT = cub::detail::InputValue; + using AccumT = typename thrust::iterator_traits::value_type; + constexpr bool ForceInclusive = true; + + using Dispatch32 = + cub::DispatchScan, + ForceInclusive>; + using Dispatch64 = + cub::DispatchScan, + ForceInclusive>; + + InputValueT init_value(init); + + auto const device_alloc = get_async_device_allocator(policy); + unique_eager_event ev; + + // Determine temporary device storage requirements. + cudaError_t status; + size_t tmp_size = 0; + { + THRUST_INDEX_TYPE_DISPATCH2( + status, + Dispatch32::Dispatch, + Dispatch64::Dispatch, + n, + (nullptr, tmp_size, first, out, op, init_value, n_fixed, nullptr)); + thrust::cuda_cub::throw_on_error( + status, + "after determining tmp storage " + "requirements for inclusive_scan"); + } + + // Allocate temporary storage. + auto content = uninitialized_allocate_unique_n(device_alloc, tmp_size); + void* const tmp_ptr = raw_pointer_cast(content.get()); + + // Set up stream with dependencies. + cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy); + + if (thrust::cuda_cub::default_stream() != user_raw_stream) + { + ev = make_dependent_event( + std::tuple_cat(std::make_tuple(std::move(content), unique_stream(nonowning, user_raw_stream)), + extract_dependencies(std::move(thrust::detail::derived_cast(policy))))); + } + else + { + ev = make_dependent_event(std::tuple_cat( + std::make_tuple(std::move(content)), extract_dependencies(std::move(thrust::detail::derived_cast(policy))))); + } + + // Run scan. + { + THRUST_INDEX_TYPE_DISPATCH2( + status, + Dispatch32::Dispatch, + Dispatch64::Dispatch, + n, + (tmp_ptr, tmp_size, first, out, op, init_value, n_fixed, user_raw_stream)); + thrust::cuda_cub::throw_on_error(status, "after dispatching inclusive_scan kernel"); + } + + return ev; +} + } // namespace detail } // namespace cuda } // namespace system @@ -140,6 +226,23 @@ auto async_inclusive_scan( THRUST_RETURNS(thrust::system::cuda::detail::async_inclusive_scan_n( policy, first, thrust::distance(first, THRUST_FWD(last)), THRUST_FWD(out), THRUST_FWD(op))) + // ADL entry point. + template + auto async_inclusive_scan( + execution_policy& policy, + ForwardIt first, + Sentinel&& last, + OutputIt&& out, + InitialValueType&& init, + BinaryOp&& op) + THRUST_RETURNS(thrust::system::cuda::detail::async_inclusive_scan_n( + policy, first, distance(first, THRUST_FWD(last)), THRUST_FWD(out), THRUST_FWD(init), THRUST_FWD(op))) + } // namespace cuda_cub THRUST_NAMESPACE_END diff --git a/thrust/thrust/system/cuda/detail/scan.h b/thrust/thrust/system/cuda/detail/scan.h index dab507f00f0..51356d4a582 100644 --- a/thrust/thrust/system/cuda/detail/scan.h +++ b/thrust/thrust/system/cuda/detail/scan.h @@ -109,6 +109,76 @@ _CCCL_HOST_DEVICE OutputIt inclusive_scan_n_impl( return result + num_items; } +_CCCL_EXEC_CHECK_DISABLE +template +_CCCL_HOST_DEVICE OutputIt inclusive_scan_n_impl( + thrust::cuda_cub::execution_policy& policy, + InputIt first, + Size num_items, + OutputIt result, + InitValueT init, + ScanOp scan_op) +{ + using InputValueT = cub::detail::InputValue; + using OffsetT = int; + using AccumT = cub::detail::accumulator_t>; + constexpr bool ForceInclusive = true; + + using Dispatch32 = + cub::DispatchScan, + ForceInclusive>; + using Dispatch64 = + cub::DispatchScan, + ForceInclusive>; + + cudaStream_t stream = thrust::cuda_cub::stream(policy); + cudaError_t status; + + // Determine temporary storage requirements: + size_t tmp_size = 0; + { + THRUST_INDEX_TYPE_DISPATCH2( + status, + Dispatch32::Dispatch, + Dispatch64::Dispatch, + num_items, + (nullptr, tmp_size, first, result, scan_op, InputValueT(init), num_items_fixed, stream)); + thrust::cuda_cub::throw_on_error( + status, + "after determining tmp storage " + "requirements for inclusive_scan"); + } + + // Run scan: + { + // Allocate temporary storage: + thrust::detail::temporary_array tmp{policy, tmp_size}; + THRUST_INDEX_TYPE_DISPATCH2( + status, + Dispatch32::Dispatch, + Dispatch64::Dispatch, + num_items, + (tmp.data().get(), tmp_size, first, result, scan_op, InputValueT(init), num_items_fixed, stream)); + thrust::cuda_cub::throw_on_error(status, "after dispatching inclusive_scan kernel"); + thrust::cuda_cub::throw_on_error( + thrust::cuda_cub::synchronize_optional(policy), "inclusive_scan failed to synchronize"); + } + + return result + num_items; +} + _CCCL_EXEC_CHECK_DISABLE template _CCCL_HOST_DEVICE OutputIt exclusive_scan_n_impl( @@ -172,6 +242,21 @@ _CCCL_HOST_DEVICE OutputIt exclusive_scan_n_impl( //------------------------- _CCCL_EXEC_CHECK_DISABLE +template +_CCCL_HOST_DEVICE OutputIt inclusive_scan_n( + thrust::cuda_cub::execution_policy& policy, + InputIt first, + Size num_items, + OutputIt result, + T init, + ScanOp scan_op) +{ + THRUST_CDP_DISPATCH( + (result = thrust::cuda_cub::detail::inclusive_scan_n_impl(policy, first, num_items, result, init, scan_op);), + (result = thrust::inclusive_scan(cvt_to_seq(derived_cast(policy)), first, first + num_items, result, scan_op);)); + return result; +} + template _CCCL_HOST_DEVICE OutputIt inclusive_scan_n( thrust::cuda_cub::execution_policy& policy, InputIt first, Size num_items, OutputIt result, ScanOp scan_op) @@ -191,6 +276,20 @@ _CCCL_HOST_DEVICE OutputIt inclusive_scan( return thrust::cuda_cub::inclusive_scan_n(policy, first, num_items, result, scan_op); } +template +_CCCL_HOST_DEVICE OutputIt inclusive_scan( + thrust::cuda_cub::execution_policy& policy, + InputIt first, + InputIt last, + OutputIt result, + T init, + ScanOp scan_op) +{ + using diff_t = typename thrust::iterator_traits::difference_type; + diff_t const num_items = thrust::distance(first, last); + return thrust::cuda_cub::inclusive_scan_n(policy, first, num_items, result, init, scan_op); +} + template _CCCL_HOST_DEVICE OutputIt inclusive_scan(thrust::cuda_cub::execution_policy& policy, InputIt first, InputIt last, OutputIt result) From 5bdba44b72d20cc22af8ab72d4345554a9341d8f Mon Sep 17 00:00:00 2001 From: gonidelis Date: Fri, 28 Jun 2024 12:38:35 -0700 Subject: [PATCH 03/11] Add thrust::async::incluisve_scan with init value --- thrust/testing/async/inclusive_scan/mixin.h | 21 +++++++++- thrust/testing/async/inclusive_scan/simple.cu | 28 ++++++++++++++ thrust/thrust/async/scan.h | 38 +++++++++++++++++++ 3 files changed, 86 insertions(+), 1 deletion(-) diff --git a/thrust/testing/async/inclusive_scan/mixin.h b/thrust/testing/async/inclusive_scan/mixin.h index b0490d573c7..92ad4b1e60a 100644 --- a/thrust/testing/async/inclusive_scan/mixin.h +++ b/thrust/testing/async/inclusive_scan/mixin.h @@ -19,11 +19,30 @@ namespace inclusive_scan namespace mixin { +//------------------------------------------------------------------------------ +namespace postfix_args_init +{ + +template > +struct all_overloads +{ + using postfix_args_type = std::tuple< // List any extra arg overloads: + std::tuple // - initial_value with binary_op + >; + + static postfix_args_type generate_postfix_args() + { + return postfix_args_type{std::make_tuple(value_type{42}, alternate_binary_op{})}; + } +}; + +} // namespace postfix_args_init + //------------------------------------------------------------------------------ namespace postfix_args { -template > +template > struct all_overloads { using postfix_args_type = std::tuple< // List any extra arg overloads: diff --git a/thrust/testing/async/inclusive_scan/simple.cu b/thrust/testing/async/inclusive_scan/simple.cu index fd93120d288..4f7ffcd90bc 100644 --- a/thrust/testing/async/inclusive_scan/simple.cu +++ b/thrust/testing/async/inclusive_scan/simple.cu @@ -5,6 +5,34 @@ # include # include +template > +struct simple_init_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , testing::async::inclusive_scan::mixin::postfix_args_init::all_overloads + , testing::async::inclusive_scan::mixin::invoke_reference::host_synchronous + , testing::async::inclusive_scan::mixin::invoke_async::simple + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() + { + return "simple invocation with device vectors and initial value"; + } +}; + +template +struct test_simple_init +{ + void operator()(std::size_t num_values) const + { + testing::async::test_policy_overloads>::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_simple_init, NumericTypes); + template > diff --git a/thrust/thrust/async/scan.h b/thrust/thrust/async/scan.h index ca66455b43b..fcbb41ad43c 100644 --- a/thrust/thrust/async/scan.h +++ b/thrust/thrust/async/scan.h @@ -112,6 +112,27 @@ struct inclusive_scan_fn final THRUST_FWD(out), thrust::plus<>{})) + template + auto operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + InitialValueType&& init, + BinaryOp&& op) const + // ADL dispatch. + THRUST_RETURNS(async_inclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(init), + THRUST_FWD(op))) + template {})) + + template >>> + auto operator()(ForwardIt&& first, Sentinel&& last, OutputIt&& out, InitialValueType&& init, BinaryOp&& op) const + // ADL dispatch. + THRUST_RETURNS(async_inclusive_scan( + thrust::detail::select_system(iterator_system_t>{}, + iterator_system_t>{}), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(init), + THRUST_FWD(op))) }; } // namespace inclusive_scan_detail From 625fae6145a523bd47fc9e6e8830199a5aeddc77 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Wed, 3 Jul 2024 17:12:16 -0700 Subject: [PATCH 04/11] Add thrust::inclusive_scan tbb with init value --- thrust/thrust/detail/scan.inl | 43 +++--- thrust/thrust/scan.h | 43 +++--- .../system/cuda/detail/async/inclusive_scan.h | 6 +- thrust/thrust/system/cuda/detail/scan.h | 6 +- thrust/thrust/system/tbb/detail/scan.h | 4 + thrust/thrust/system/tbb/detail/scan.inl | 124 +++++++++++++++++- 6 files changed, 167 insertions(+), 59 deletions(-) diff --git a/thrust/thrust/detail/scan.inl b/thrust/thrust/detail/scan.inl index 93d0c0f5b93..976d5baa818 100644 --- a/thrust/thrust/detail/scan.inl +++ b/thrust/thrust/detail/scan.inl @@ -18,6 +18,9 @@ #include +#include "cuda/std/__functional/invoke.h" +#include "cuda/std/__iterator/iterator_traits.h" + #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -33,23 +36,6 @@ #include #include -template -struct is_callable_with_input -{ -private: - using value_type = typename std::iterator_traits::value_type; - - template - static auto test(int) - -> decltype(std::declval()(std::declval(), std::declval()), std::true_type()); - - template - static auto test(...) -> std::false_type; - -public: - static constexpr bool value = decltype(test(0))::value; -}; - THRUST_NAMESPACE_BEGIN _CCCL_EXEC_CHECK_DISABLE @@ -66,13 +52,16 @@ _CCCL_HOST_DEVICE OutputIterator inclusive_scan( _CCCL_EXEC_CHECK_DISABLE template -_CCCL_HOST_DEVICE - typename std::enable_if::value, OutputIterator>::type - inclusive_scan(const thrust::detail::execution_policy_base& exec, - InputIterator first, - InputIterator last, - OutputIterator result, - AssociativeOperator binary_op) +_CCCL_HOST_DEVICE typename std::enable_if< + ::cuda::std::__invokable::value_type, + typename ::cuda::std::iterator_traits::value_type>::value, + OutputIterator>::type +inclusive_scan(const thrust::detail::execution_policy_base& exec, + InputIterator first, + InputIterator last, + OutputIterator result, + AssociativeOperator binary_op) { using thrust::system::detail::generic::inclusive_scan; return inclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, binary_op); @@ -80,7 +69,11 @@ _CCCL_HOST_DEVICE _CCCL_EXEC_CHECK_DISABLE template -_CCCL_HOST_DEVICE typename std::enable_if::value, OutputIterator>::type +_CCCL_HOST_DEVICE typename std::enable_if< + !::cuda::std::__invokable::value_type, + typename ::cuda::std::iterator_traits::value_type>::value, + OutputIterator>::type inclusive_scan(const thrust::detail::execution_policy_base& exec, InputIterator first, InputIterator last, diff --git a/thrust/thrust/scan.h b/thrust/thrust/scan.h index 0a50b6e996d..f5d926e7177 100644 --- a/thrust/thrust/scan.h +++ b/thrust/thrust/scan.h @@ -31,6 +31,9 @@ #endif // no system header #include +#include "cuda/std/__functional/invoke.h" +#include "cuda/std/__iterator/iterator_traits.h" + THRUST_NAMESPACE_BEGIN /*! \addtogroup algorithms @@ -205,34 +208,24 @@ OutputIterator inclusive_scan(InputIterator first, InputIterator last, OutputIte * \see https://en.cppreference.com/w/cpp/algorithm/partial_sum */ -template -struct is_callable_with_input -{ -private: - using value_type = typename std::iterator_traits::value_type; - - template - static auto test(int) - -> decltype(std::declval()(std::declval(), std::declval()), std::true_type()); - - template - static auto test(...) -> std::false_type; - -public: - static constexpr bool value = decltype(test(0))::value; -}; - template -_CCCL_HOST_DEVICE - typename std::enable_if::value, OutputIterator>::type - inclusive_scan(const thrust::detail::execution_policy_base& exec, - InputIterator first, - InputIterator last, - OutputIterator result, - AssociativeOperator binary_op); +_CCCL_HOST_DEVICE typename std::enable_if< + ::cuda::std::__invokable::value_type, + typename ::cuda::std::iterator_traits::value_type>::value, + OutputIterator>::type +inclusive_scan(const thrust::detail::execution_policy_base& exec, + InputIterator first, + InputIterator last, + OutputIterator result, + AssociativeOperator binary_op); template -_CCCL_HOST_DEVICE typename std::enable_if::value, OutputIterator>::type +_CCCL_HOST_DEVICE typename std::enable_if< + !::cuda::std::__invokable::value_type, + typename ::cuda::std::iterator_traits::value_type>::value, + OutputIterator>::type inclusive_scan(const thrust::detail::execution_policy_base& exec, InputIterator first, InputIterator last, diff --git a/thrust/thrust/system/cuda/detail/async/inclusive_scan.h b/thrust/thrust/system/cuda/detail/async/inclusive_scan.h index 79bdc060e57..16cb3810873 100644 --- a/thrust/thrust/system/cuda/detail/async/inclusive_scan.h +++ b/thrust/thrust/system/cuda/detail/async/inclusive_scan.h @@ -144,7 +144,7 @@ unique_eager_event async_inclusive_scan_n( OutputIt, BinaryOp, InputValueT, - thrust::detail::int32_t, + std::int32_t, InitialValueType, cub::DeviceScanPolicy, ForceInclusive>; @@ -153,7 +153,7 @@ unique_eager_event async_inclusive_scan_n( OutputIt, BinaryOp, InputValueT, - thrust::detail::int64_t, + std::int64_t, InitialValueType, cub::DeviceScanPolicy, ForceInclusive>; @@ -180,7 +180,7 @@ unique_eager_event async_inclusive_scan_n( } // Allocate temporary storage. - auto content = uninitialized_allocate_unique_n(device_alloc, tmp_size); + auto content = uninitialized_allocate_unique_n(device_alloc, tmp_size); void* const tmp_ptr = raw_pointer_cast(content.get()); // Set up stream with dependencies. diff --git a/thrust/thrust/system/cuda/detail/scan.h b/thrust/thrust/system/cuda/detail/scan.h index 51356d4a582..df8c436c50c 100644 --- a/thrust/thrust/system/cuda/detail/scan.h +++ b/thrust/thrust/system/cuda/detail/scan.h @@ -129,7 +129,7 @@ _CCCL_HOST_DEVICE OutputIt inclusive_scan_n_impl( OutputIt, ScanOp, InputValueT, - thrust::detail::int32_t, + std::int32_t, InitValueT, cub::DeviceScanPolicy, ForceInclusive>; @@ -138,7 +138,7 @@ _CCCL_HOST_DEVICE OutputIt inclusive_scan_n_impl( OutputIt, ScanOp, InputValueT, - thrust::detail::int64_t, + std::int64_t, InitValueT, cub::DeviceScanPolicy, ForceInclusive>; @@ -164,7 +164,7 @@ _CCCL_HOST_DEVICE OutputIt inclusive_scan_n_impl( // Run scan: { // Allocate temporary storage: - thrust::detail::temporary_array tmp{policy, tmp_size}; + thrust::detail::temporary_array tmp{policy, tmp_size}; THRUST_INDEX_TYPE_DISPATCH2( status, Dispatch32::Dispatch, diff --git a/thrust/thrust/system/tbb/detail/scan.h b/thrust/thrust/system/tbb/detail/scan.h index b0f005cb382..18a3ec3ee2c 100644 --- a/thrust/thrust/system/tbb/detail/scan.h +++ b/thrust/thrust/system/tbb/detail/scan.h @@ -43,6 +43,10 @@ template +OutputIterator +inclusive_scan(tag, InputIterator first, InputIterator last, OutputIterator result, T init, BinaryFunction binary_op); + template OutputIterator exclusive_scan(tag, InputIterator first, InputIterator last, OutputIterator result, T init, BinaryFunction binary_op); diff --git a/thrust/thrust/system/tbb/detail/scan.inl b/thrust/thrust/system/tbb/detail/scan.inl index 5372eda9f6b..727ef57a558 100644 --- a/thrust/thrust/system/tbb/detail/scan.inl +++ b/thrust/thrust/system/tbb/detail/scan.inl @@ -46,6 +46,100 @@ namespace detail namespace scan_detail { +template +struct inclusive_body_init +{ + InputIterator input; + OutputIterator output; + thrust::detail::wrapped_function binary_op; + ValueType sum; + bool first_call; + + inclusive_body_init(InputIterator input, OutputIterator output, BinaryFunction binary_op, ValueType init) + : input(input) + , output(output) + , binary_op(binary_op) + , sum(init) + , first_call(true) + {} + + inclusive_body_init(inclusive_body_init& b, ::tbb::split) + : input(b.input) + , output(b.output) + , binary_op(b.binary_op) + , sum(b.sum) + , first_call(true) + {} + + template + void operator()(const ::tbb::blocked_range& r, ::tbb::pre_scan_tag) + { + InputIterator iter = input + r.begin(); + + ValueType temp = *iter; + + ++iter; + + for (Size i = r.begin() + 1; i != r.end(); ++i, ++iter) + { + temp = binary_op(temp, *iter); + } + + if (first_call) + { + sum = temp; + } + else + { + sum = binary_op(sum, temp); + } + + first_call = false; + } + + template + void operator()(const ::tbb::blocked_range& r, ::tbb::final_scan_tag) + { + InputIterator iter1 = input + r.begin(); + OutputIterator iter2 = output + r.begin(); + + if (first_call) + { + *iter2 = sum = binary_op(*iter1, sum); + ++iter1; + ++iter2; + for (Size i = r.begin() + 1; i != r.end(); ++i, ++iter1, ++iter2) + { + *iter2 = sum = binary_op(sum, *iter1); + } + } + else + { + for (Size i = r.begin(); i != r.end(); ++i, ++iter1, ++iter2) + { + *iter2 = sum = binary_op(sum, *iter1); + } + } + + first_call = false; + } + + void reverse_join(inclusive_body_init& b) + { + // Only accumulate this functor's partial sum if this functor has been + // called at least once -- otherwise we'll over-count the initial value. + if (!first_call) + { + sum = binary_op(b.sum, sum); + } + } + + void assign(inclusive_body_init& b) + { + sum = b.sum; + } +}; + template struct inclusive_body { @@ -55,11 +149,11 @@ struct inclusive_body ValueType sum; bool first_call; - inclusive_body(InputIterator input, OutputIterator output, BinaryFunction binary_op, ValueType dummy) + inclusive_body(InputIterator input, OutputIterator output, BinaryFunction binary_op, ValueType init) : input(input) , output(output) - , binary_op{binary_op} - , sum(dummy) + , binary_op(binary_op) + , sum(init) , first_call(true) {} @@ -249,6 +343,30 @@ inclusive_scan(tag, InputIterator first, InputIterator last, OutputIterator resu return result; } +template +OutputIterator inclusive_scan( + tag, InputIterator first, InputIterator last, OutputIterator result, InitialValueType init, BinaryFunction binary_op) +{ + using namespace thrust::detail; + + // Use the input iterator's value type per https://wg21.link/P0571 + using ValueType = InitialValueType; + + using Size = typename thrust::iterator_difference::type; + Size n = thrust::distance(first, last); + + if (n != 0) + { + typedef typename scan_detail::inclusive_body_init Body; + Body scan_body(first, result, binary_op, init); + ::tbb::parallel_scan(::tbb::blocked_range(0, n), scan_body); + } + + thrust::advance(result, n); + + return result; +} + template OutputIterator exclusive_scan( tag, InputIterator first, InputIterator last, OutputIterator result, InitialValueType init, BinaryFunction binary_op) From d8f23765359d9ce1604df08479daf6b794f5e282 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Wed, 24 Jul 2024 15:38:37 -0700 Subject: [PATCH 05/11] Handle reviews --- thrust/testing/scan.cu | 26 +++----- thrust/thrust/detail/scan.inl | 59 +++++++++++-------- thrust/thrust/scan.h | 53 +++++++++-------- .../system/cuda/detail/async/inclusive_scan.h | 4 +- thrust/thrust/system/cuda/detail/scan.h | 4 +- thrust/thrust/system/detail/sequential/scan.h | 11 ++-- thrust/thrust/system/tbb/detail/scan.inl | 2 +- 7 files changed, 83 insertions(+), 76 deletions(-) diff --git a/thrust/testing/scan.cu b/thrust/testing/scan.cu index c02c9001b0b..4939521d4b0 100644 --- a/thrust/testing/scan.cu +++ b/thrust/testing/scan.cu @@ -68,12 +68,8 @@ void TestScanSimple() ASSERT_EQUAL(output, result); // inclusive scan with init - iter = thrust::inclusive_scan(input.begin(), input.end(), output.begin(), T(3)); - result[0] = 4; - result[1] = 7; - result[2] = 5; - result[3] = 9; - result[4] = 4; + iter = thrust::inclusive_scan(input.begin(), input.end(), output.begin(), T(3)); + result = {4, 7, 5, 9, 4}; ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size()); ASSERT_EQUAL(input, input_copy); ASSERT_EQUAL(output, result); @@ -86,12 +82,8 @@ void TestScanSimple() ASSERT_EQUAL(output, result); // inclusive scan with init and op - iter = thrust::inclusive_scan(input.begin(), input.end(), output.begin(), T(3), thrust::plus()); - result[0] = 4; - result[1] = 7; - result[2] = 5; - result[3] = 9; - result[4] = 4; + iter = thrust::inclusive_scan(input.begin(), input.end(), output.begin(), T(3), thrust::plus()); + result = {4, 7, 5, 9, 4}; ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size()); ASSERT_EQUAL(input, input_copy); ASSERT_EQUAL(output, result); @@ -111,13 +103,9 @@ void TestScanSimple() ASSERT_EQUAL(input, result); // inplace inclusive scan with init - input = input_copy; - iter = thrust::inclusive_scan(input.begin(), input.end(), input.begin(), T(3)); - result[0] = 4; - result[1] = 7; - result[2] = 5; - result[3] = 9; - result[4] = 4; + input = input_copy; + iter = thrust::inclusive_scan(input.begin(), input.end(), input.begin(), T(3)); + result = {4, 7, 5, 9, 4}; ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size()); ASSERT_EQUAL(input, result); diff --git a/thrust/thrust/detail/scan.inl b/thrust/thrust/detail/scan.inl index 976d5baa818..329e54ef7ad 100644 --- a/thrust/thrust/detail/scan.inl +++ b/thrust/thrust/detail/scan.inl @@ -36,6 +36,8 @@ #include #include +#include + THRUST_NAMESPACE_BEGIN _CCCL_EXEC_CHECK_DISABLE @@ -50,35 +52,42 @@ _CCCL_HOST_DEVICE OutputIterator inclusive_scan( return inclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result); } // end inclusive_scan() -_CCCL_EXEC_CHECK_DISABLE -template -_CCCL_HOST_DEVICE typename std::enable_if< - ::cuda::std::__invokable::value_type, - typename ::cuda::std::iterator_traits::value_type>::value, - OutputIterator>::type -inclusive_scan(const thrust::detail::execution_policy_base& exec, - InputIterator first, - InputIterator last, - OutputIterator result, - AssociativeOperator binary_op) +template ::value_type, + typename std::iterator_traits::value_type>::value, + int>::type> +_CCCL_HOST_DEVICE OutputIterator inclusive_scan( + const thrust::detail::execution_policy_base& exec, + InputIterator first, + InputIterator last, + OutputIterator result, + AssociativeOperator binary_op) { using thrust::system::detail::generic::inclusive_scan; return inclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, binary_op); } // end inclusive_scan() _CCCL_EXEC_CHECK_DISABLE -template -_CCCL_HOST_DEVICE typename std::enable_if< - !::cuda::std::__invokable::value_type, - typename ::cuda::std::iterator_traits::value_type>::value, - OutputIterator>::type -inclusive_scan(const thrust::detail::execution_policy_base& exec, - InputIterator first, - InputIterator last, - OutputIterator result, - T init) +template ::value_type, + typename std::iterator_traits::value_type>::value, + int>::type> +_CCCL_HOST_DEVICE OutputIterator inclusive_scan( + const thrust::detail::execution_policy_base& exec, + InputIterator first, + InputIterator last, + OutputIterator result, + T init) { using thrust::system::detail::generic::inclusive_scan; return inclusive_scan( @@ -314,8 +323,8 @@ inclusive_scan(InputIterator first, InputIterator last, OutputIterator result, T { using thrust::system::detail::generic::select_system; - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; + using System1 = typename thrust::iterator_system::type; + using System2 = typename thrust::iterator_system::type; System1 system1; System2 system2; diff --git a/thrust/thrust/scan.h b/thrust/thrust/scan.h index f5d926e7177..4c90834bc82 100644 --- a/thrust/thrust/scan.h +++ b/thrust/thrust/scan.h @@ -207,30 +207,37 @@ OutputIterator inclusive_scan(InputIterator first, InputIterator last, OutputIte * * \see https://en.cppreference.com/w/cpp/algorithm/partial_sum */ +template ::value_type, + typename std::iterator_traits::value_type>::value, + int>::type = 0> +_CCCL_HOST_DEVICE OutputIterator inclusive_scan( + const thrust::detail::execution_policy_base& exec, + InputIterator first, + InputIterator last, + OutputIterator result, + AssociativeOperator binary_op); -template -_CCCL_HOST_DEVICE typename std::enable_if< - ::cuda::std::__invokable::value_type, - typename ::cuda::std::iterator_traits::value_type>::value, - OutputIterator>::type -inclusive_scan(const thrust::detail::execution_policy_base& exec, - InputIterator first, - InputIterator last, - OutputIterator result, - AssociativeOperator binary_op); - -template -_CCCL_HOST_DEVICE typename std::enable_if< - !::cuda::std::__invokable::value_type, - typename ::cuda::std::iterator_traits::value_type>::value, - OutputIterator>::type -inclusive_scan(const thrust::detail::execution_policy_base& exec, - InputIterator first, - InputIterator last, - OutputIterator result, - T init); +template ::value_type, + typename std::iterator_traits::value_type>::value, + int>::type = 0> +_CCCL_HOST_DEVICE OutputIterator inclusive_scan( + const thrust::detail::execution_policy_base& exec, + InputIterator first, + InputIterator last, + OutputIterator result, + T init); /*! \p inclusive_scan computes an inclusive prefix sum operation. The * term 'inclusive' means that each result includes the corresponding diff --git a/thrust/thrust/system/cuda/detail/async/inclusive_scan.h b/thrust/thrust/system/cuda/detail/async/inclusive_scan.h index 16cb3810873..4b39d7018d6 100644 --- a/thrust/thrust/system/cuda/detail/async/inclusive_scan.h +++ b/thrust/thrust/system/cuda/detail/async/inclusive_scan.h @@ -145,7 +145,7 @@ unique_eager_event async_inclusive_scan_n( BinaryOp, InputValueT, std::int32_t, - InitialValueType, + AccumT, cub::DeviceScanPolicy, ForceInclusive>; using Dispatch64 = @@ -154,7 +154,7 @@ unique_eager_event async_inclusive_scan_n( BinaryOp, InputValueT, std::int64_t, - InitialValueType, + AccumT, cub::DeviceScanPolicy, ForceInclusive>; diff --git a/thrust/thrust/system/cuda/detail/scan.h b/thrust/thrust/system/cuda/detail/scan.h index df8c436c50c..1cf8d1d6d14 100644 --- a/thrust/thrust/system/cuda/detail/scan.h +++ b/thrust/thrust/system/cuda/detail/scan.h @@ -130,7 +130,7 @@ _CCCL_HOST_DEVICE OutputIt inclusive_scan_n_impl( ScanOp, InputValueT, std::int32_t, - InitValueT, + AccumT, cub::DeviceScanPolicy, ForceInclusive>; using Dispatch64 = @@ -139,7 +139,7 @@ _CCCL_HOST_DEVICE OutputIt inclusive_scan_n_impl( ScanOp, InputValueT, std::int64_t, - InitValueT, + AccumT, cub::DeviceScanPolicy, ForceInclusive>; diff --git a/thrust/thrust/system/detail/sequential/scan.h b/thrust/thrust/system/detail/sequential/scan.h index 6376710392d..37afce18308 100644 --- a/thrust/thrust/system/detail/sequential/scan.h +++ b/thrust/thrust/system/detail/sequential/scan.h @@ -99,13 +99,16 @@ _CCCL_HOST_DEVICE OutputIterator inclusive_scan( if (first != last) { - ValueType sum = wrapped_binary_op(*first, init); + ValueType sum = wrapped_binary_op(init, *first); + *result = sum; + ++first; + ++result; - *result = sum; - - for (++first, ++result; first != last; ++first, ++result) + while (first != last) { *result = sum = wrapped_binary_op(sum, *first); + ++first; + ++result; } } diff --git a/thrust/thrust/system/tbb/detail/scan.inl b/thrust/thrust/system/tbb/detail/scan.inl index 727ef57a558..d239f8bab63 100644 --- a/thrust/thrust/system/tbb/detail/scan.inl +++ b/thrust/thrust/system/tbb/detail/scan.inl @@ -357,7 +357,7 @@ OutputIterator inclusive_scan( if (n != 0) { - typedef typename scan_detail::inclusive_body_init Body; + using Body = typename scan_detail::inclusive_body_init; Body scan_body(first, result, binary_op, init); ::tbb::parallel_scan(::tbb::blocked_range(0, n), scan_body); } From 2e0e56e24dfcae67fbc6b47625bf4936b1f2a1ae Mon Sep 17 00:00:00 2001 From: gonidelis Date: Thu, 25 Jul 2024 19:00:19 -0700 Subject: [PATCH 06/11] Consolidate init overloads into a single overload that accepts both init and binary_op --- thrust/testing/scan.cu | 11 +- thrust/thrust/detail/scan.inl | 38 +---- thrust/thrust/scan.h | 149 ++++++++++++++---- thrust/thrust/system/cuda/detail/scan.h | 1 - thrust/thrust/system/detail/sequential/scan.h | 4 +- thrust/thrust/system/tbb/detail/scan.inl | 4 +- 6 files changed, 126 insertions(+), 81 deletions(-) diff --git a/thrust/testing/scan.cu b/thrust/testing/scan.cu index 4939521d4b0..e9ea8878047 100644 --- a/thrust/testing/scan.cu +++ b/thrust/testing/scan.cu @@ -67,13 +67,6 @@ void TestScanSimple() ASSERT_EQUAL(input, input_copy); ASSERT_EQUAL(output, result); - // inclusive scan with init - iter = thrust::inclusive_scan(input.begin(), input.end(), output.begin(), T(3)); - result = {4, 7, 5, 9, 4}; - ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size()); - ASSERT_EQUAL(input, input_copy); - ASSERT_EQUAL(output, result); - // inclusive scan with op iter = thrust::inclusive_scan(input.begin(), input.end(), output.begin(), thrust::plus()); result = {1, 4, 2, 6, 1}; @@ -102,9 +95,9 @@ void TestScanSimple() ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size()); ASSERT_EQUAL(input, result); - // inplace inclusive scan with init + // inplace inclusive scan with init and op input = input_copy; - iter = thrust::inclusive_scan(input.begin(), input.end(), input.begin(), T(3)); + iter = thrust::inclusive_scan(input.begin(), input.end(), input.begin(), T(3), thrust::plus()); result = {4, 7, 5, 9, 4}; ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size()); ASSERT_EQUAL(input, result); diff --git a/thrust/thrust/detail/scan.inl b/thrust/thrust/detail/scan.inl index 329e54ef7ad..fc046e8b77c 100644 --- a/thrust/thrust/detail/scan.inl +++ b/thrust/thrust/detail/scan.inl @@ -18,9 +18,6 @@ #include -#include "cuda/std/__functional/invoke.h" -#include "cuda/std/__iterator/iterator_traits.h" - #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -36,8 +33,6 @@ #include #include -#include - THRUST_NAMESPACE_BEGIN _CCCL_EXEC_CHECK_DISABLE @@ -52,15 +47,8 @@ _CCCL_HOST_DEVICE OutputIterator inclusive_scan( return inclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result); } // end inclusive_scan() -template ::value_type, - typename std::iterator_traits::value_type>::value, - int>::type> +_CCCL_EXEC_CHECK_DISABLE +template _CCCL_HOST_DEVICE OutputIterator inclusive_scan( const thrust::detail::execution_policy_base& exec, InputIterator first, @@ -72,28 +60,6 @@ _CCCL_HOST_DEVICE OutputIterator inclusive_scan( return inclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, binary_op); } // end inclusive_scan() -_CCCL_EXEC_CHECK_DISABLE -template ::value_type, - typename std::iterator_traits::value_type>::value, - int>::type> -_CCCL_HOST_DEVICE OutputIterator inclusive_scan( - const thrust::detail::execution_policy_base& exec, - InputIterator first, - InputIterator last, - OutputIterator result, - T init) -{ - using thrust::system::detail::generic::inclusive_scan; - return inclusive_scan( - thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, init, thrust::plus<>()); -} // end inclusive_scan() - _CCCL_EXEC_CHECK_DISABLE template _CCCL_HOST_DEVICE OutputIterator inclusive_scan( diff --git a/thrust/thrust/scan.h b/thrust/thrust/scan.h index 4c90834bc82..b15f49b591b 100644 --- a/thrust/thrust/scan.h +++ b/thrust/thrust/scan.h @@ -31,9 +31,6 @@ #endif // no system header #include -#include "cuda/std/__functional/invoke.h" -#include "cuda/std/__iterator/iterator_traits.h" - THRUST_NAMESPACE_BEGIN /*! \addtogroup algorithms @@ -207,15 +204,7 @@ OutputIterator inclusive_scan(InputIterator first, InputIterator last, OutputIte * * \see https://en.cppreference.com/w/cpp/algorithm/partial_sum */ -template ::value_type, - typename std::iterator_traits::value_type>::value, - int>::type = 0> +template _CCCL_HOST_DEVICE OutputIterator inclusive_scan( const thrust::detail::execution_policy_base& exec, InputIterator first, @@ -223,22 +212,6 @@ _CCCL_HOST_DEVICE OutputIterator inclusive_scan( OutputIterator result, AssociativeOperator binary_op); -template ::value_type, - typename std::iterator_traits::value_type>::value, - int>::type = 0> -_CCCL_HOST_DEVICE OutputIterator inclusive_scan( - const thrust::detail::execution_policy_base& exec, - InputIterator first, - InputIterator last, - OutputIterator result, - T init); - /*! \p inclusive_scan computes an inclusive prefix sum operation. The * term 'inclusive' means that each result includes the corresponding * input operand in the partial sum. When the input and output sequences @@ -261,9 +234,11 @@ _CCCL_HOST_DEVICE OutputIterator inclusive_scan( * * \tparam InputIterator is a model of Input * Iterator and \c InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type. \tparam - * OutputIterator is a model of Output - * Iterator and \c OutputIterator's \c value_type is convertible to both \c AssociativeOperator's \c - * first_argument_type and \c second_argument_type. \tparam AssociativeOperator is a model of Output Iterator + * and \c OutputIterator's \c value_type is convertible to + * both \c AssociativeOperator's \c first_argument_type and + * \c second_argument_type. + * \tparam AssociativeOperator is a model of Binary Function and \c * AssociativeOperator's \c result_type is convertible to \c OutputIterator's \c value_type. * @@ -288,6 +263,118 @@ template binary_op(init, \*first) is assigned to *result + * and so on. This version of \p inclusive_scan equires both an associative + * operator and an initial value \p init. When the input and + * output sequences are the same, the scan is performed in-place. + * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param result The beginning of the output sequence. + * \param init The initial value. + * \param binary_op The associatve operator used to 'sum' values. + * \return The end of the output sequence. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input + * Iterator and \c InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type. \tparam + * OutputIterator is a model of Output Iterator + * and \c OutputIterator's \c value_type is convertible to + * both \c AssociativeOperator's \c first_argument_type and + * \c second_argument_type. + * \tparam T is convertible to \c OutputIterator's \c value_type. + * \tparam AssociativeOperator is a model of Binary Function and \c + * AssociativeOperator's \c result_type is convertible to \c OutputIterator's \c value_type. + * + * \pre \p first may equal \p result but the range [first, last) and the range [result, result + (last - + * first)) shall not overlap otherwise. + * + * The following code snippet demonstrates how to use \p inclusive_scan with initial value to compute an in-place + * prefix sum using the \p thrust::host execution policy for parallelization: + * + * \code + * int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8}; + * + * thrust::maximum binary_op; + * + * thrust::inclusive_scan(thrust::host, data, data + 10, data, 1, binary_op); // in-place scan + * + * // data is now {1, 1, 2, 2, 2, 4, 4, 4, 4, 8} + * \endcode + * + * \see https://en.cppreference.com/w/cpp/algorithm/partial_sum + */ +template +_CCCL_HOST_DEVICE OutputIterator inclusive_scan( + const thrust::detail::execution_policy_base& exec, + InputIterator first, + InputIterator last, + OutputIterator result, + T init, + AssociativeOperator binary_op); + +/*! \p inclusive_scan computes an inclusive prefix sum operation. The + * term 'inclusive' means that each result includes the corresponding + * input operand in the partial sum. More precisely, + * binary_op(init, \*first) is assigned to *result + * and so on. This version of \p inclusive_scan equires both an associative + * operator and an initial value \p init. When the input and + * output sequences are the same, the scan is performed in-place. + * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param result The beginning of the output sequence. + * \param init The initial value. + * \param binary_op The associatve operator used to 'sum' values. + * \return The end of the output sequence. + * + * \tparam InputIterator is a model of Input + * Iterator and \c InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type. \tparam + * OutputIterator is a model of Output Iterator + * and \c OutputIterator's \c value_type is convertible to + * both \c AssociativeOperator's \c first_argument_type and + * \c second_argument_type. + * \tparam T is convertible to \c OutputIterator's \c value_type. + * \tparam AssociativeOperator is a model of Binary Function and \c + * AssociativeOperator's \c result_type is convertible to \c OutputIterator's \c value_type. + * + * \pre \p first may equal \p result but the range [first, last) and the range [result, result + (last - + * first)) shall not overlap otherwise. + * + * The following code snippet demonstrates how to use \p inclusive_scan with initial value: + * + * \code + * int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8}; + * + * thrust::maximum binary_op; + * + * thrust::inclusive_scan(data, data + 10, data, 1, binary_op); // in-place scan + * + * // data is now {1, 1, 2, 2, 2, 4, 4, 4, 4, 8} + * \endcode + * + * \see https://en.cppreference.com/w/cpp/algorithm/partial_sum + */ +template +OutputIterator +inclusive_scan(InputIterator first, InputIterator last, OutputIterator result, T init, AssociativeOperator binary_op); + /*! \p exclusive_scan computes an exclusive prefix sum operation. The * term 'exclusive' means that each result does not include the * corresponding input operand in the partial sum. More precisely, diff --git a/thrust/thrust/system/cuda/detail/scan.h b/thrust/thrust/system/cuda/detail/scan.h index 1cf8d1d6d14..fa58de662bb 100644 --- a/thrust/thrust/system/cuda/detail/scan.h +++ b/thrust/thrust/system/cuda/detail/scan.h @@ -120,7 +120,6 @@ _CCCL_HOST_DEVICE OutputIt inclusive_scan_n_impl( ScanOp scan_op) { using InputValueT = cub::detail::InputValue; - using OffsetT = int; using AccumT = cub::detail::accumulator_t>; constexpr bool ForceInclusive = true; diff --git a/thrust/thrust/system/detail/sequential/scan.h b/thrust/thrust/system/detail/sequential/scan.h index 37afce18308..11677c7aad1 100644 --- a/thrust/thrust/system/detail/sequential/scan.h +++ b/thrust/thrust/system/detail/sequential/scan.h @@ -92,10 +92,10 @@ _CCCL_HOST_DEVICE OutputIterator inclusive_scan( using namespace thrust::detail; // Use the input iterator's value type per https://wg21.link/P0571 - using ValueType = InitialValueType; + using ValueType = typename thrust::iterator_value::type; // wrap binary_op - thrust::detail::wrapped_function wrapped_binary_op(binary_op); + thrust::detail::wrapped_function wrapped_binary_op{binary_op}; if (first != last) { diff --git a/thrust/thrust/system/tbb/detail/scan.inl b/thrust/thrust/system/tbb/detail/scan.inl index d239f8bab63..99a19a42b0f 100644 --- a/thrust/thrust/system/tbb/detail/scan.inl +++ b/thrust/thrust/system/tbb/detail/scan.inl @@ -58,7 +58,7 @@ struct inclusive_body_init inclusive_body_init(InputIterator input, OutputIterator output, BinaryFunction binary_op, ValueType init) : input(input) , output(output) - , binary_op(binary_op) + , binary_op{binary_op} , sum(init) , first_call(true) {} @@ -152,7 +152,7 @@ struct inclusive_body inclusive_body(InputIterator input, OutputIterator output, BinaryFunction binary_op, ValueType init) : input(input) , output(output) - , binary_op(binary_op) + , binary_op{binary_op} , sum(init) , first_call(true) {} From df43e5b91fdbaf43525c7e40c06f7469d33a4a6d Mon Sep 17 00:00:00 2001 From: gonidelis Date: Thu, 1 Aug 2024 14:16:32 -0700 Subject: [PATCH 07/11] Fix formatting issues --- thrust/testing/async/inclusive_scan/mixin.h | 7 +++--- thrust/thrust/scan.h | 25 +++++++-------------- 2 files changed, 11 insertions(+), 21 deletions(-) diff --git a/thrust/testing/async/inclusive_scan/mixin.h b/thrust/testing/async/inclusive_scan/mixin.h index 92ad4b1e60a..c11766207dd 100644 --- a/thrust/testing/async/inclusive_scan/mixin.h +++ b/thrust/testing/async/inclusive_scan/mixin.h @@ -19,20 +19,19 @@ namespace inclusive_scan namespace mixin { -//------------------------------------------------------------------------------ namespace postfix_args_init { -template > +template > struct all_overloads { using postfix_args_type = std::tuple< // List any extra arg overloads: - std::tuple // - initial_value with binary_op + std::tuple // - initial_value with binary_op >; static postfix_args_type generate_postfix_args() { - return postfix_args_type{std::make_tuple(value_type{42}, alternate_binary_op{})}; + return postfix_args_type{std::make_tuple(ValueType{42}, AltBinaryOp{})}; } }; diff --git a/thrust/thrust/scan.h b/thrust/thrust/scan.h index b15f49b591b..e54265ecff8 100644 --- a/thrust/thrust/scan.h +++ b/thrust/thrust/scan.h @@ -267,7 +267,7 @@ inclusive_scan(InputIterator first, InputIterator last, OutputIterator result, A * term 'inclusive' means that each result includes the corresponding * input operand in the partial sum. More precisely, * binary_op(init, \*first) is assigned to *result - * and so on. This version of \p inclusive_scan equires both an associative + * and so on. This version of \p inclusive_scan requires both an associative * operator and an initial value \p init. When the input and * output sequences are the same, the scan is performed in-place. * @@ -289,9 +289,8 @@ inclusive_scan(InputIterator first, InputIterator last, OutputIterator result, A * \tparam InputIterator is a model of Input * Iterator and \c InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type. \tparam * OutputIterator is a model of Output Iterator - * and \c OutputIterator's \c value_type is convertible to - * both \c AssociativeOperator's \c first_argument_type and - * \c second_argument_type. + * and \c OutputIterator's \c value_type is convertible to both \c AssociativeOperator's \c first_argument_type and + * \c second_argument_type. * \tparam T is convertible to \c OutputIterator's \c value_type. * \tparam AssociativeOperator is a model of Binary Function and \c @@ -305,11 +304,7 @@ inclusive_scan(InputIterator first, InputIterator last, OutputIterator result, A * * \code * int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8}; - * - * thrust::maximum binary_op; - * - * thrust::inclusive_scan(thrust::host, data, data + 10, data, 1, binary_op); // in-place scan - * + * thrust::inclusive_scan(thrust::host, data, data + 10, data, 1, thrust::maximum<>{}); // in-place scan * // data is now {1, 1, 2, 2, 2, 4, 4, 4, 4, 8} * \endcode * @@ -328,7 +323,7 @@ _CCCL_HOST_DEVICE OutputIterator inclusive_scan( * term 'inclusive' means that each result includes the corresponding * input operand in the partial sum. More precisely, * binary_op(init, \*first) is assigned to *result - * and so on. This version of \p inclusive_scan equires both an associative + * and so on. This version of \p inclusive_scan requires both an associative * operator and an initial value \p init. When the input and * output sequences are the same, the scan is performed in-place. * @@ -346,9 +341,8 @@ _CCCL_HOST_DEVICE OutputIterator inclusive_scan( * \tparam InputIterator is a model of Input * Iterator and \c InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type. \tparam * OutputIterator is a model of Output Iterator - * and \c OutputIterator's \c value_type is convertible to - * both \c AssociativeOperator's \c first_argument_type and - * \c second_argument_type. + * and \c OutputIterator's \c value_type is convertible to both \c AssociativeOperator's \c first_argument_type and + * \c second_argument_type. * \tparam T is convertible to \c OutputIterator's \c value_type. * \tparam AssociativeOperator is a model of Binary Function and \c @@ -361,11 +355,8 @@ _CCCL_HOST_DEVICE OutputIterator inclusive_scan( * * \code * int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8}; - * * thrust::maximum binary_op; - * - * thrust::inclusive_scan(data, data + 10, data, 1, binary_op); // in-place scan - * + * thrust::inclusive_scan(data, data + 10, data, 1, thrust::maximum<>{}); // in-place scan * // data is now {1, 1, 2, 2, 2, 4, 4, 4, 4, 8} * \endcode * From 3df74cb0309db2391332c8abbc42ca22ae5cc3ad Mon Sep 17 00:00:00 2001 From: gonidelis Date: Tue, 13 Aug 2024 15:56:32 -0700 Subject: [PATCH 08/11] Add cuda::std::accumulator_t and use it for value_type in scan algorithms --- libcudacxx/include/cuda/std/__functional/invoke.h | 4 ++++ thrust/testing/async/inclusive_scan/mixin.h | 2 +- thrust/thrust/system/cuda/detail/async/inclusive_scan.h | 5 +++-- thrust/thrust/system/cuda/detail/scan.h | 5 ++--- thrust/thrust/system/detail/sequential/scan.h | 5 ++++- thrust/thrust/system/tbb/detail/scan.inl | 5 ++++- 6 files changed, 18 insertions(+), 8 deletions(-) diff --git a/libcudacxx/include/cuda/std/__functional/invoke.h b/libcudacxx/include/cuda/std/__functional/invoke.h index 0e132158f50..ee864f7a342 100644 --- a/libcudacxx/include/cuda/std/__functional/invoke.h +++ b/libcudacxx/include/cuda/std/__functional/invoke.h @@ -541,6 +541,10 @@ invoke(_Fn&& __f, _Args&&... __args) noexcept(is_nothrow_invocable_v<_Fn, _Args. #endif // _CCCL_STD_VER > 2011 +/// The type of intermediate accumulator (according to P2322R6) +template +using __accumulator_t = typename decay::type>::type; + _LIBCUDACXX_END_NAMESPACE_STD #endif // _LIBCUDACXX___FUNCTIONAL_INVOKE_H diff --git a/thrust/testing/async/inclusive_scan/mixin.h b/thrust/testing/async/inclusive_scan/mixin.h index c11766207dd..f3538f5eb94 100644 --- a/thrust/testing/async/inclusive_scan/mixin.h +++ b/thrust/testing/async/inclusive_scan/mixin.h @@ -41,7 +41,7 @@ struct all_overloads namespace postfix_args { -template > +template > struct all_overloads { using postfix_args_type = std::tuple< // List any extra arg overloads: diff --git a/thrust/thrust/system/cuda/detail/async/inclusive_scan.h b/thrust/thrust/system/cuda/detail/async/inclusive_scan.h index 4b39d7018d6..0100fc165b5 100644 --- a/thrust/thrust/system/cuda/detail/async/inclusive_scan.h +++ b/thrust/thrust/system/cuda/detail/async/inclusive_scan.h @@ -135,8 +135,9 @@ template & policy, ForwardIt first, Size n, OutputIt out, InitialValueType init, BinaryOp op) { - using InputValueT = cub::detail::InputValue; - using AccumT = typename thrust::iterator_traits::value_type; + using InputValueT = cub::detail::InputValue; + using AccumT = typename ::cuda::std:: + __accumulator_t::value_type, InitialValueType>; constexpr bool ForceInclusive = true; using Dispatch32 = diff --git a/thrust/thrust/system/cuda/detail/scan.h b/thrust/thrust/system/cuda/detail/scan.h index fa58de662bb..9fd6e8dee41 100644 --- a/thrust/thrust/system/cuda/detail/scan.h +++ b/thrust/thrust/system/cuda/detail/scan.h @@ -119,8 +119,8 @@ _CCCL_HOST_DEVICE OutputIt inclusive_scan_n_impl( InitValueT init, ScanOp scan_op) { - using InputValueT = cub::detail::InputValue; - using AccumT = cub::detail::accumulator_t>; + using InputValueT = cub::detail::InputValue; + using AccumT = typename ::cuda::std::__accumulator_t, InitValueT>; constexpr bool ForceInclusive = true; using Dispatch32 = @@ -240,7 +240,6 @@ _CCCL_HOST_DEVICE OutputIt exclusive_scan_n_impl( // Thrust API entry points //------------------------- -_CCCL_EXEC_CHECK_DISABLE template _CCCL_HOST_DEVICE OutputIt inclusive_scan_n( thrust::cuda_cub::execution_policy& policy, diff --git a/thrust/thrust/system/detail/sequential/scan.h b/thrust/thrust/system/detail/sequential/scan.h index 11677c7aad1..b4e4e5353ec 100644 --- a/thrust/thrust/system/detail/sequential/scan.h +++ b/thrust/thrust/system/detail/sequential/scan.h @@ -35,6 +35,8 @@ #include #include +#include + THRUST_NAMESPACE_BEGIN namespace system { @@ -92,7 +94,8 @@ _CCCL_HOST_DEVICE OutputIterator inclusive_scan( using namespace thrust::detail; // Use the input iterator's value type per https://wg21.link/P0571 - using ValueType = typename thrust::iterator_value::type; + using ValueType = typename ::cuda::std:: + __accumulator_t::value_type, InitialValueType>; // wrap binary_op thrust::detail::wrapped_function wrapped_binary_op{binary_op}; diff --git a/thrust/thrust/system/tbb/detail/scan.inl b/thrust/thrust/system/tbb/detail/scan.inl index 99a19a42b0f..82c2e0a78e1 100644 --- a/thrust/thrust/system/tbb/detail/scan.inl +++ b/thrust/thrust/system/tbb/detail/scan.inl @@ -33,6 +33,8 @@ #include #include +#include + #include #include @@ -350,7 +352,8 @@ OutputIterator inclusive_scan( using namespace thrust::detail; // Use the input iterator's value type per https://wg21.link/P0571 - using ValueType = InitialValueType; + using ValueType = typename ::cuda::std:: + __accumulator_t::value_type, InitialValueType>; using Size = typename thrust::iterator_difference::type; Size n = thrust::distance(first, last); From 5b8a9b0011ef481f2faeee66bfab51e3e75e556e Mon Sep 17 00:00:00 2001 From: gonidelis Date: Mon, 19 Aug 2024 15:34:04 -0700 Subject: [PATCH 09/11] Redo Bernhard's work and consolidate the two tbb::inclusive_scan bodies --- thrust/thrust/system/tbb/detail/scan.inl | 109 +++-------------------- 1 file changed, 11 insertions(+), 98 deletions(-) diff --git a/thrust/thrust/system/tbb/detail/scan.inl b/thrust/thrust/system/tbb/detail/scan.inl index 82c2e0a78e1..c481c2181ef 100644 --- a/thrust/thrust/system/tbb/detail/scan.inl +++ b/thrust/thrust/system/tbb/detail/scan.inl @@ -48,101 +48,7 @@ namespace detail namespace scan_detail { -template -struct inclusive_body_init -{ - InputIterator input; - OutputIterator output; - thrust::detail::wrapped_function binary_op; - ValueType sum; - bool first_call; - - inclusive_body_init(InputIterator input, OutputIterator output, BinaryFunction binary_op, ValueType init) - : input(input) - , output(output) - , binary_op{binary_op} - , sum(init) - , first_call(true) - {} - - inclusive_body_init(inclusive_body_init& b, ::tbb::split) - : input(b.input) - , output(b.output) - , binary_op(b.binary_op) - , sum(b.sum) - , first_call(true) - {} - - template - void operator()(const ::tbb::blocked_range& r, ::tbb::pre_scan_tag) - { - InputIterator iter = input + r.begin(); - - ValueType temp = *iter; - - ++iter; - - for (Size i = r.begin() + 1; i != r.end(); ++i, ++iter) - { - temp = binary_op(temp, *iter); - } - - if (first_call) - { - sum = temp; - } - else - { - sum = binary_op(sum, temp); - } - - first_call = false; - } - - template - void operator()(const ::tbb::blocked_range& r, ::tbb::final_scan_tag) - { - InputIterator iter1 = input + r.begin(); - OutputIterator iter2 = output + r.begin(); - - if (first_call) - { - *iter2 = sum = binary_op(*iter1, sum); - ++iter1; - ++iter2; - for (Size i = r.begin() + 1; i != r.end(); ++i, ++iter1, ++iter2) - { - *iter2 = sum = binary_op(sum, *iter1); - } - } - else - { - for (Size i = r.begin(); i != r.end(); ++i, ++iter1, ++iter2) - { - *iter2 = sum = binary_op(sum, *iter1); - } - } - - first_call = false; - } - - void reverse_join(inclusive_body_init& b) - { - // Only accumulate this functor's partial sum if this functor has been - // called at least once -- otherwise we'll over-count the initial value. - if (!first_call) - { - sum = binary_op(b.sum, sum); - } - } - - void assign(inclusive_body_init& b) - { - sum = b.sum; - } -}; - -template +template struct inclusive_body { InputIterator input; @@ -201,7 +107,14 @@ struct inclusive_body if (first_call) { - *iter2 = sum = *iter1; + _CCCL_IF_CONSTEXPR (HasInit) + { + *iter2 = sum = binary_op(*iter1, sum); + } + else + { + *iter2 = sum = *iter1; + } ++iter1; ++iter2; for (Size i = r.begin() + 1; i != r.end(); ++i, ++iter1, ++iter2) @@ -335,7 +248,7 @@ inclusive_scan(tag, InputIterator first, InputIterator last, OutputIterator resu if (n != 0) { - using Body = typename scan_detail::inclusive_body; + using Body = typename scan_detail::inclusive_body; Body scan_body(first, result, binary_op, *first); ::tbb::parallel_scan(::tbb::blocked_range(0, n), scan_body); } @@ -360,7 +273,7 @@ OutputIterator inclusive_scan( if (n != 0) { - using Body = typename scan_detail::inclusive_body_init; + using Body = typename scan_detail::inclusive_body; Body scan_body(first, result, binary_op, init); ::tbb::parallel_scan(::tbb::blocked_range(0, n), scan_body); } From 2713e819f07586c16b5cfaee7a14752c9c434c12 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Thu, 22 Aug 2024 15:54:34 -0700 Subject: [PATCH 10/11] Handle final reviews --- thrust/testing/scan.cu | 4 ++-- thrust/thrust/system/cuda/detail/async/inclusive_scan.h | 2 +- thrust/thrust/system/cuda/detail/scan.h | 6 ++++++ thrust/thrust/system/detail/sequential/scan.h | 1 - thrust/thrust/system/tbb/detail/scan.inl | 2 +- 5 files changed, 10 insertions(+), 5 deletions(-) diff --git a/thrust/testing/scan.cu b/thrust/testing/scan.cu index e9ea8878047..e657d637a57 100644 --- a/thrust/testing/scan.cu +++ b/thrust/testing/scan.cu @@ -75,8 +75,8 @@ void TestScanSimple() ASSERT_EQUAL(output, result); // inclusive scan with init and op - iter = thrust::inclusive_scan(input.begin(), input.end(), output.begin(), T(3), thrust::plus()); - result = {4, 7, 5, 9, 4}; + iter = thrust::inclusive_scan(input.begin(), input.end(), output.begin(), T(-1), thrust::multiplies()); + result = {-1, -3, 6, 24, -120}; ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size()); ASSERT_EQUAL(input, input_copy); ASSERT_EQUAL(output, result); diff --git a/thrust/thrust/system/cuda/detail/async/inclusive_scan.h b/thrust/thrust/system/cuda/detail/async/inclusive_scan.h index 0100fc165b5..a91310dce0b 100644 --- a/thrust/thrust/system/cuda/detail/async/inclusive_scan.h +++ b/thrust/thrust/system/cuda/detail/async/inclusive_scan.h @@ -242,7 +242,7 @@ auto async_inclusive_scan( InitialValueType&& init, BinaryOp&& op) THRUST_RETURNS(thrust::system::cuda::detail::async_inclusive_scan_n( - policy, first, distance(first, THRUST_FWD(last)), THRUST_FWD(out), THRUST_FWD(init), THRUST_FWD(op))) + policy, first, thrust::distance(first, THRUST_FWD(last)), THRUST_FWD(out), THRUST_FWD(init), THRUST_FWD(op))) } // namespace cuda_cub diff --git a/thrust/thrust/system/cuda/detail/scan.h b/thrust/thrust/system/cuda/detail/scan.h index 9fd6e8dee41..862eb7bf6cf 100644 --- a/thrust/thrust/system/cuda/detail/scan.h +++ b/thrust/thrust/system/cuda/detail/scan.h @@ -145,6 +145,12 @@ _CCCL_HOST_DEVICE OutputIt inclusive_scan_n_impl( cudaStream_t stream = thrust::cuda_cub::stream(policy); cudaError_t status; + // Negative number of items are normalized to `0` + if (thrust::detail::is_negative(num_items)) + { + num_items = 0; + } + // Determine temporary storage requirements: size_t tmp_size = 0; { diff --git a/thrust/thrust/system/detail/sequential/scan.h b/thrust/thrust/system/detail/sequential/scan.h index b4e4e5353ec..c883dac6543 100644 --- a/thrust/thrust/system/detail/sequential/scan.h +++ b/thrust/thrust/system/detail/sequential/scan.h @@ -93,7 +93,6 @@ _CCCL_HOST_DEVICE OutputIterator inclusive_scan( { using namespace thrust::detail; - // Use the input iterator's value type per https://wg21.link/P0571 using ValueType = typename ::cuda::std:: __accumulator_t::value_type, InitialValueType>; diff --git a/thrust/thrust/system/tbb/detail/scan.inl b/thrust/thrust/system/tbb/detail/scan.inl index c481c2181ef..d002c907d27 100644 --- a/thrust/thrust/system/tbb/detail/scan.inl +++ b/thrust/thrust/system/tbb/detail/scan.inl @@ -109,7 +109,7 @@ struct inclusive_body { _CCCL_IF_CONSTEXPR (HasInit) { - *iter2 = sum = binary_op(*iter1, sum); + *iter2 = sum = binary_op(sum, *iter1); } else { From 611e6617a41a766babe785a7fbfac6e698405ac4 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Mon, 26 Aug 2024 12:16:36 -0700 Subject: [PATCH 11/11] Replace cub::accumulator_t with cuda::std::__accumulator_t --- cub/benchmarks/bench/scan/exclusive/base.cuh | 4 +- cub/benchmarks/bench/scan/exclusive/by_key.cu | 2 +- cub/cub/detail/type_traits.cuh | 4 -- cub/cub/device/device_run_length_encode.cuh | 4 +- cub/cub/device/device_scan.cuh | 4 +- cub/cub/device/dispatch/dispatch_reduce.cuh | 25 ++++++------ .../dispatch/dispatch_reduce_by_key.cuh | 38 +++++++++---------- cub/cub/device/dispatch/dispatch_scan.cuh | 10 ++--- .../device/dispatch/dispatch_scan_by_key.cuh | 6 +-- cub/cub/thread/thread_reduce.cuh | 6 +-- cub/test/catch2_test_device_reduce.cu | 4 +- cub/test/catch2_test_device_reduce_by_key.cu | 2 +- ...ch2_test_device_reduce_by_key_iterators.cu | 2 +- .../catch2_test_device_reduce_iterators.cu | 2 +- cub/test/catch2_test_device_scan.cu | 12 +++--- cub/test/catch2_test_device_scan.cuh | 8 ++-- cub/test/catch2_test_device_scan_iterators.cu | 10 ++--- .../catch2_test_device_segmented_reduce.cu | 4 +- ..._test_device_segmented_reduce_iterators.cu | 2 +- thrust/thrust/system/tbb/detail/scan.inl | 2 +- 20 files changed, 76 insertions(+), 75 deletions(-) diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh index 42897931679..e3cd7a7be8e 100644 --- a/cub/benchmarks/bench/scan/exclusive/base.cuh +++ b/cub/benchmarks/bench/scan/exclusive/base.cuh @@ -27,6 +27,8 @@ #include +#include + #include #if !TUNE_BASE @@ -85,7 +87,7 @@ template static void basic(nvbench::state& state, nvbench::type_list) { using init_t = cub::detail::InputValue; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; using input_it_t = const T*; using output_it_t = T*; using offset_t = OffsetT; diff --git a/cub/benchmarks/bench/scan/exclusive/by_key.cu b/cub/benchmarks/bench/scan/exclusive/by_key.cu index 3830ad7764a..26676d66c22 100644 --- a/cub/benchmarks/bench/scan/exclusive/by_key.cu +++ b/cub/benchmarks/bench/scan/exclusive/by_key.cu @@ -77,7 +77,7 @@ static void scan(nvbench::state& state, nvbench::type_list; + using accum_t = ::cuda::std::__accumulator_t; using key_input_it_t = const KeyT*; using val_input_it_t = const ValueT*; using val_output_it_t = ValueT*; diff --git a/cub/cub/detail/type_traits.cuh b/cub/cub/detail/type_traits.cuh index 10d40cacd16..ed505bb1fc0 100644 --- a/cub/cub/detail/type_traits.cuh +++ b/cub/cub/detail/type_traits.cuh @@ -62,9 +62,5 @@ using invoke_result_t = ::cuda::std::invoke_result_t; #endif -/// The type of intermediate accumulator (according to P2322R6) -template -using accumulator_t = typename ::cuda::std::decay>::type; - } // namespace detail CUB_NAMESPACE_END diff --git a/cub/cub/device/device_run_length_encode.cuh b/cub/cub/device/device_run_length_encode.cuh index 9020f4fe055..120562a4611 100644 --- a/cub/cub/device/device_run_length_encode.cuh +++ b/cub/cub/device/device_run_length_encode.cuh @@ -34,6 +34,8 @@ #include +#include + #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -200,7 +202,7 @@ struct DeviceRunLengthEncode // Generator type for providing 1s values for run-length reduction using lengths_input_iterator_t = ConstantInputIterator; - using accum_t = detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; using key_t = cub::detail::non_void_value_t>; diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index c9d93c935b5..27882e9ceed 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -49,6 +49,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN //! @rst @@ -1303,7 +1305,7 @@ struct DeviceScan // Unsigned integer type for global offsets using OffsetT = detail::choose_offset_t; - using AccumT = cub::detail::accumulator_t>; + using AccumT = ::cuda::std::__accumulator_t, InitValueT>; constexpr bool ForceInclusive = true; return DispatchScan< diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index e3e3844a3f4..ba5365c6181 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -316,7 +316,7 @@ template >, - typename AccumT = detail::accumulator_t>, + typename AccumT = ::cuda::std::__accumulator_t, InitT>, typename SelectedPolicy = DeviceReducePolicy, typename TransformOpT = ::cuda::std::__identity> struct DispatchReduce : SelectedPolicy @@ -797,17 +797,16 @@ struct DispatchReduce : SelectedPolicy * @tparam InitT * Initial value type */ -template >>, - typename SelectedPolicyT = DeviceReducePolicy> +template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetT, + typename ReductionOpT, + typename TransformOpT, + typename InitT, + typename AccumT = ::cuda::std:: + __accumulator_t>, InitT>, + typename SelectedPolicyT = DeviceReducePolicy> using DispatchTransformReduce = DispatchReduce; @@ -850,7 +849,7 @@ template >, - typename AccumT = detail::accumulator_t>, + typename AccumT = ::cuda::std::__accumulator_t, InitT>, typename SelectedPolicy = DeviceReducePolicy> struct DispatchSegmentedReduce : SelectedPolicy { diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index 07dd492a53b..8ae232e8d1e 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -213,25 +213,25 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_TH * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ -template < - typename KeysInputIteratorT, - typename UniqueOutputIteratorT, - typename ValuesInputIteratorT, - typename AggregatesOutputIteratorT, - typename NumRunsOutputIteratorT, - typename EqualityOpT, - typename ReductionOpT, - typename OffsetT, - typename AccumT = // - detail:: - accumulator_t, cub::detail::value_t>, - typename SelectedPolicy = // - detail::device_reduce_by_key_policy_hub< // - ReductionOpT, // - AccumT, // - cub::detail::non_void_value_t< // - UniqueOutputIteratorT, // - cub::detail::value_t>>> +template , + cub::detail::value_t>, + typename SelectedPolicy = // + detail::device_reduce_by_key_policy_hub< // + ReductionOpT, // + AccumT, // + cub::detail::non_void_value_t< // + UniqueOutputIteratorT, // + cub::detail::value_t>>> struct DispatchReduceByKey { //------------------------------------------------------------------------- diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 56c2be96112..7d2fc4ac17d 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -234,11 +234,11 @@ template ::value, - cub::detail::value_t, - typename InitValueT::value_type>, - cub::detail::value_t>, + typename AccumT = ::cuda::std::__accumulator_t, + ::cuda::std::_If::value, + cub::detail::value_t, + typename InitValueT::value_type>>, typename SelectedPolicy = DeviceScanPolicy, bool ForceInclusive = false> struct DispatchScan : SelectedPolicy diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh index 032554773a7..5dfffa5e77e 100644 --- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -228,10 +228,10 @@ template < typename ScanOpT, typename InitValueT, typename OffsetT, - typename AccumT = detail::accumulator_t< + typename AccumT = ::cuda::std::__accumulator_t< ScanOpT, - ::cuda::std::_If::value, cub::detail::value_t, InitValueT>, - cub::detail::value_t>, + cub::detail::value_t, + ::cuda::std::_If::value, cub::detail::value_t, InitValueT>>, typename SelectedPolicy = DeviceScanByKeyPolicy, ScanOpT>> struct DispatchScanByKey : SelectedPolicy diff --git a/cub/cub/thread/thread_reduce.cuh b/cub/cub/thread/thread_reduce.cuh index db7e2427791..7ac98369254 100644 --- a/cub/cub/thread/thread_reduce.cuh +++ b/cub/cub/thread/thread_reduce.cuh @@ -68,7 +68,7 @@ template > + typename AccumT = ::cuda::std::__accumulator_t> _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix, Int2Type /*length*/) { @@ -110,7 +110,7 @@ template > + typename AccumT = ::cuda::std::__accumulator_t> _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix) { return ThreadReduce(input, reduction_op, prefix, Int2Type()); @@ -170,7 +170,7 @@ template > + typename AccumT = ::cuda::std::__accumulator_t> _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T (&input)[LENGTH], ReductionOp reduction_op, PrefixT prefix) { return ThreadReduce(input, reduction_op, prefix, Int2Type()); diff --git a/cub/test/catch2_test_device_reduce.cu b/cub/test/catch2_test_device_reduce.cu index 1e9e08c9113..290e8d8f6a9 100644 --- a/cub/test/catch2_test_device_reduce.cu +++ b/cub/test/catch2_test_device_reduce.cu @@ -132,7 +132,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f auto reduction_op = unwrap_op(reference_extended_fp(d_in_it), op_t{}); // Prepare verification data - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; output_t expected_result = static_cast(compute_single_problem_reference(in_items, reduction_op, accum_t{})); @@ -152,7 +152,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f SECTION("sum") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data output_t expected_result = static_cast(compute_single_problem_reference(in_items, op_t{}, accum_t{})); diff --git a/cub/test/catch2_test_device_reduce_by_key.cu b/cub/test/catch2_test_device_reduce_by_key.cu index 39f31d5e781..88c305fd360 100644 --- a/cub/test/catch2_test_device_reduce_by_key.cu +++ b/cub/test/catch2_test_device_reduce_by_key.cu @@ -116,7 +116,7 @@ CUB_TEST("Device reduce-by-key works", "[by_key][reduce][device]", full_type_lis auto reduction_op = unwrap_op(reference_extended_fp(d_values_it), op_t{}); // Prepare verification data - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; c2h::host_vector expected_result(num_segments); compute_segmented_problem_reference(in_values, segment_offsets, reduction_op, accum_t{}, expected_result.begin()); c2h::host_vector expected_keys = compute_unique_keys_reference(segment_keys); diff --git a/cub/test/catch2_test_device_reduce_by_key_iterators.cu b/cub/test/catch2_test_device_reduce_by_key_iterators.cu index 3637813b5f6..14b7fcde9fd 100644 --- a/cub/test/catch2_test_device_reduce_by_key_iterators.cu +++ b/cub/test/catch2_test_device_reduce_by_key_iterators.cu @@ -90,7 +90,7 @@ CUB_TEST("Device reduce-by-key works with iterators", "[by_key][reduce][device]" using op_t = cub::Sum; // Prepare verification data - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; c2h::host_vector expected_result(num_segments); compute_segmented_problem_reference(value_it, segment_offsets, op_t{}, accum_t{}, expected_result.begin()); c2h::host_vector expected_keys = compute_unique_keys_reference(segment_keys); diff --git a/cub/test/catch2_test_device_reduce_iterators.cu b/cub/test/catch2_test_device_reduce_iterators.cu index 7c7f74ec63a..ab1dabbbb16 100644 --- a/cub/test/catch2_test_device_reduce_iterators.cu +++ b/cub/test/catch2_test_device_reduce_iterators.cu @@ -104,7 +104,7 @@ CUB_TEST("Device reduce works with fancy input iterators", "[reduce][device]", i auto reduction_op = op_t{}; // Prepare verification data - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; output_t expected_result = compute_single_problem_reference(in_it, in_it + num_items, reduction_op, accum_t{}); // Run test diff --git a/cub/test/catch2_test_device_scan.cu b/cub/test/catch2_test_device_scan.cu index 49c9aac39c7..736e217b0ea 100644 --- a/cub/test/catch2_test_device_scan.cu +++ b/cub/test/catch2_test_device_scan.cu @@ -127,7 +127,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_ SECTION("inclusive sum") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector host_items(in_items); @@ -155,7 +155,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_ SECTION("exclusive sum") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector host_items(in_items); @@ -184,7 +184,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_ SECTION("inclusive scan") { using op_t = cub::Min; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector host_items(in_items); @@ -213,7 +213,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_ SECTION("inclusive scan with init value") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Scan operator auto scan_op = unwrap_op(reference_extended_fp(d_in_it), op_t{}); @@ -248,7 +248,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_ SECTION("exclusive scan") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Scan operator auto scan_op = unwrap_op(reference_extended_fp(d_in_it), op_t{}); @@ -281,7 +281,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_ SECTION("exclusive scan with future-init value") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Scan operator auto scan_op = unwrap_op(reference_extended_fp(d_in_it), op_t{}); diff --git a/cub/test/catch2_test_device_scan.cuh b/cub/test/catch2_test_device_scan.cuh index d3644e3387a..dc5b7804e8d 100644 --- a/cub/test/catch2_test_device_scan.cuh +++ b/cub/test/catch2_test_device_scan.cuh @@ -61,7 +61,7 @@ template ; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; using output_t = cub::detail::value_t; accum_t acc = static_cast(init); for (; first != last; ++first) @@ -75,7 +75,7 @@ template ; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; using output_t = cub::detail::value_t; accum_t acc = static_cast(init); for (; first != last; ++first) @@ -101,7 +101,7 @@ void compute_exclusive_scan_by_key_reference( std::size_t num_items) { using value_t = cub::detail::value_t; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; using output_t = cub::detail::value_t; if (num_items > 0) @@ -152,7 +152,7 @@ void compute_inclusive_scan_by_key_reference( std::size_t num_items) { using value_t = cub::detail::value_t; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; using output_t = cub::detail::value_t; for (std::size_t i = 0; i < num_items;) diff --git a/cub/test/catch2_test_device_scan_iterators.cu b/cub/test/catch2_test_device_scan_iterators.cu index 576d0d3f747..a07397cc367 100644 --- a/cub/test/catch2_test_device_scan_iterators.cu +++ b/cub/test/catch2_test_device_scan_iterators.cu @@ -84,7 +84,7 @@ CUB_TEST("Device scan works with iterators", "[scan][device]", iterator_type_lis SECTION("inclusive sum") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector expected_result(num_items); @@ -102,7 +102,7 @@ CUB_TEST("Device scan works with iterators", "[scan][device]", iterator_type_lis SECTION("exclusive sum") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector expected_result(num_items); @@ -120,7 +120,7 @@ CUB_TEST("Device scan works with iterators", "[scan][device]", iterator_type_lis SECTION("inclusive scan") { using op_t = cub::Min; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector expected_result(num_items); @@ -139,7 +139,7 @@ CUB_TEST("Device scan works with iterators", "[scan][device]", iterator_type_lis SECTION("exclusive scan") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector expected_result(num_items); @@ -157,7 +157,7 @@ CUB_TEST("Device scan works with iterators", "[scan][device]", iterator_type_lis SECTION("exclusive scan with future-init value") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data accum_t init_value{}; diff --git a/cub/test/catch2_test_device_segmented_reduce.cu b/cub/test/catch2_test_device_segmented_reduce.cu index 770b85b0194..5559e7e2e81 100644 --- a/cub/test/catch2_test_device_segmented_reduce.cu +++ b/cub/test/catch2_test_device_segmented_reduce.cu @@ -121,7 +121,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[segmented][reduce][ auto reduction_op = unwrap_op(reference_extended_fp(d_in_it), op_t{}); // Prepare verification data - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; c2h::host_vector expected_result(num_segments); compute_segmented_problem_reference(in_items, segment_offsets, reduction_op, accum_t{}, expected_result.begin()); @@ -142,7 +142,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[segmented][reduce][ SECTION("sum") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector expected_result(num_segments); diff --git a/cub/test/catch2_test_device_segmented_reduce_iterators.cu b/cub/test/catch2_test_device_segmented_reduce_iterators.cu index 8ab495ddc59..a81559b91ed 100644 --- a/cub/test/catch2_test_device_segmented_reduce_iterators.cu +++ b/cub/test/catch2_test_device_segmented_reduce_iterators.cu @@ -93,7 +93,7 @@ CUB_TEST("Device segmented reduce works with fancy input iterators", "[reduce][d auto reduction_op = op_t{}; // Prepare verification data - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; c2h::host_vector expected_result(num_segments); compute_segmented_problem_reference(in_it, segment_offsets, reduction_op, accum_t{}, expected_result.begin()); diff --git a/thrust/thrust/system/tbb/detail/scan.inl b/thrust/thrust/system/tbb/detail/scan.inl index d002c907d27..683ed226025 100644 --- a/thrust/thrust/system/tbb/detail/scan.inl +++ b/thrust/thrust/system/tbb/detail/scan.inl @@ -264,7 +264,7 @@ OutputIterator inclusive_scan( { using namespace thrust::detail; - // Use the input iterator's value type per https://wg21.link/P0571 + // Use the input iterator's value type and the initial value type per wg21.link/p2322 using ValueType = typename ::cuda::std:: __accumulator_t::value_type, InitialValueType>;