From ce4b01b4f9275ab8c66beb7a3bd7862fa3d4eafe Mon Sep 17 00:00:00 2001 From: gonidelis Date: Mon, 20 May 2024 14:44:53 -0700 Subject: [PATCH] Add docs for block scan inclusive scan --- cub/cub/block/block_scan.cuh | 322 +++++++++++++++++++++++++---- cub/test/catch2_test_block_scan.cu | 2 - 2 files changed, 281 insertions(+), 43 deletions(-) diff --git a/cub/cub/block/block_scan.cuh b/cub/cub/block/block_scan.cuh index f3fdc7d8de..71d035e8d7 100644 --- a/cub/cub/block/block_scan.cuh +++ b/cub/cub/block/block_scan.cuh @@ -819,12 +819,6 @@ public: InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op); } - template - _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, T initial_value, ScanOp scan_op) - { - InternalBlockScan(temp_storage).InclusiveScan(input, output, initial_value, scan_op); - } - //! @rst //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes one input element. @@ -892,13 +886,6 @@ public: InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); } - template - _CCCL_DEVICE _CCCL_FORCEINLINE void - InclusiveScan(T input, T& output, T initial_value, ScanOp scan_op, T& block_aggregate) - { - InternalBlockScan(temp_storage).InclusiveScan(input, output, initial_value, scan_op, block_aggregate); - } - //! @rst //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes one input element. The call-back functor ``block_prefix_callback_op`` is invoked by @@ -1085,20 +1072,6 @@ public: internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); } - template - _CCCL_DEVICE _CCCL_FORCEINLINE void - InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op) - { - // Reduce consecutive thread items in registers - T thread_prefix = internal::ThreadReduce(input, scan_op); - - // Exclusive thread block-scan - ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op); - - // Exclusive scan in registers with prefix as seed - internal::ThreadScanInclusive(input, output, scan_op, thread_prefix); - } - //! @rst //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes an array of consecutive input elements. @@ -1180,20 +1153,6 @@ public: internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); } - template - _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan( - T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op, T& block_aggregate) - { - // Reduce consecutive thread items in registers - T thread_prefix = internal::ThreadReduce(input, scan_op); - - // Exclusive thread block-scan - ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate); - - // Exclusive scan in registers with prefix as seed - internal::ThreadScanInclusive(input, output, scan_op, thread_prefix); - } - //! @rst //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes an array of consecutive input elements. @@ -2031,6 +1990,65 @@ public: InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op); } + //! @rst + //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor. + //! Each thread contributes one input element. + //! + //! - Supports non-commutative scan operators. + //! - @rowmajor + //! - @smemreuse + //! + //! Snippet + //! +++++++ + //! + //! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that + //! are partitioned across 128 threads. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! __global__ void ExampleKernel(...) + //! { + //! // Specialize BlockScan for a 1D block of 128 threads of type int + //! typedef cub::BlockScan BlockScan; + //! + //! // Allocate shared memory for BlockScan + //! __shared__ typename BlockScan::TempStorage temp_storage; + //! + //! // Obtain input item for each thread + //! int thread_data; + //! ... + //! + //! // Collectively compute the block-wide inclusive prefix max scan + //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); + //! + //! Suppose the set of input ``thread_data`` across the block of threads is + //! ``0, -1, 2, -3, ..., 126, -127``. The corresponding output ``thread_data`` + //! in those threads will be ``0, 0, 2, 2, ..., 126, 126``. + //! + //! @endrst + //! + //! @tparam ScanOp + //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` + //! + //! @param[in] input + //! Calling thread's input item + //! + //! @param[out] output + //! Calling thread's output item (may be aliased to `input`) + //! + //! @param[in] initial_value + //! Initial value to seed the invlusive scan + //! + //! @param[in] scan_op + //! Binary scan functor + template + _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, T initial_value, ScanOp scan_op) + { + InternalBlockScan(temp_storage).InclusiveScan(input, output, initial_value, scan_op); + } + //! @rst //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes one input element. Also provides every thread with the block-wide @@ -2093,6 +2111,72 @@ public: InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate); } + //! @rst + //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor. + //! Each thread contributes one input element. Also provides every thread with the block-wide + //! ``block_aggregate`` of all inputs. + //! + //! - Supports non-commutative scan operators. + //! - @rowmajor + //! - @smemreuse + //! + //! Snippet + //! +++++++ + //! + //! The code snippet below illustrates an inclusive prefix max scan of 128 + //! integer items that are partitioned across 128 threads. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! __global__ void ExampleKernel(...) + //! { + //! // Specialize BlockScan for a 1D block of 128 threads of type int + //! typedef cub::BlockScan BlockScan; + //! + //! // Allocate shared memory for BlockScan + //! __shared__ typename BlockScan::TempStorage temp_storage; + //! + //! // Obtain input item for each thread + //! int thread_data; + //! ... + //! + //! // Collectively compute the block-wide inclusive prefix max scan + //! int block_aggregate; + //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); + //! + //! Suppose the set of input ``thread_data`` across the block of threads is + //! ``0, -1, 2, -3, ..., 126, -127``. The corresponding output ``thread_data`` + //! in those threads will be ``0, 0, 2, 2, ..., 126, 126``. Furthermore the value + //! ``126`` will be stored in ``block_aggregate`` for all threads. + //! + //! @endrst + //! + //! @tparam ScanOp + //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` + //! + //! @param[in] input + //! Calling thread's input item + //! + //! @param[out] output + //! Calling thread's output item (may be aliased to `input`) + //! + //! @param[in] initial_value + //! Initial value to seed the invlusive scan + //! + //! @param[in] scan_op + //! Binary scan functor + //! + //! @param[out] block_aggregate + //! Block-wide aggregate reduction of input items + template + _CCCL_DEVICE _CCCL_FORCEINLINE void + InclusiveScan(T input, T& output, T initial_value, ScanOp scan_op, T& block_aggregate) + { + InternalBlockScan(temp_storage).InclusiveScan(input, output, initial_value, scan_op, block_aggregate); + } + //! @rst //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes one input element. The call-back functor ``block_prefix_callback_op`` @@ -2283,6 +2367,83 @@ public: } } + //! @} end member group + //! @name Inclusive prefix scan operations (multiple data per thread) + //! @{ + + //! @rst + //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor. + //! Each thread contributes an array of consecutive input elements. + //! + //! - Supports non-commutative scan operators. + //! - @blocked + //! - @granularity + //! - @smemreuse + //! + //! Snippet + //! +++++++ + //! + //! The code snippet below illustrates an inclusive prefix max scan of 512 integer items that + //! are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + //! where each thread owns 4 consecutive items. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! __global__ void ExampleKernel(...) + //! { + //! // Specialize BlockScan for a 1D block of 128 threads of type int + //! typedef cub::BlockScan BlockScan; + //! + //! // Allocate shared memory for BlockScan + //! __shared__ typename BlockScan::TempStorage temp_storage; + //! + //! // Obtain a segment of consecutive items that are blocked across threads + //! int thread_data[4]; + //! ... + //! + //! // Collectively compute the block-wide inclusive prefix max scan + //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); + //! + //! Suppose the set of input ``thread_data`` across the block of threads is + //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``. + //! The corresponding output ``thread_data`` in those threads will be + //! ``{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }``. + //! + //! @endrst + //! + //! @tparam ITEMS_PER_THREAD + //! **[inferred]** The number of consecutive items partitioned onto each thread. + //! + //! @tparam ScanOp + //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` + //! + //! @param[in] input + //! Calling thread's input items + //! + //! @param[out] output + //! Calling thread's output items (may be aliased to `input`) + //! + //! @param[in] initial_value + //! Initial value to seed the invlusive scan + //! + //! @param[in] scan_op + //! Binary scan functor + template + _CCCL_DEVICE _CCCL_FORCEINLINE void + InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op) + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op); + + // Exclusive scan in registers with prefix as seed + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix); + } + //! @rst //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes an array of consecutive input elements. Also provides every thread @@ -2366,6 +2527,85 @@ public: } } + //! @rst + //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor. + //! Each thread contributes an array of consecutive input elements. Also provides every thread + //! with the block-wide ``block_aggregate`` of all inputs. + //! + //! - Supports non-commutative scan operators. + //! - @blocked + //! - @granularity + //! - @smemreuse + //! + //! Snippet + //! +++++++ + //! + //! The code snippet below illustrates an inclusive prefix max scan of 512 integer items that + //! are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + //! where each thread owns 4 consecutive items. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! __global__ void ExampleKernel(...) + //! { + //! // Specialize BlockScan for a 1D block of 128 threads of type int + //! typedef cub::BlockScan BlockScan; + //! + //! // Allocate shared memory for BlockScan + //! __shared__ typename BlockScan::TempStorage temp_storage; + //! + //! // Obtain a segment of consecutive items that are blocked across threads + //! int thread_data[4]; + //! ... + //! + //! // Collectively compute the block-wide inclusive prefix max scan + //! int block_aggregate; + //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); + //! + //! Suppose the set of input ``thread_data`` across the block of threads is + //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``. + //! The corresponding output ``thread_data`` in those threads will be + //! ``{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }``. + //! Furthermore the value ``510`` will be stored in ``block_aggregate`` for all threads. + //! + //! @endrst + //! + //! @tparam ITEMS_PER_THREAD + //! **[inferred]** The number of consecutive items partitioned onto each thread. + //! + //! @tparam ScanOp + //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` + //! + //! @param[in] input + //! Calling thread's input items + //! + //! @param[out] output + //! Calling thread's output items (may be aliased to `input`) + //! + //! @param[in] initial_value + //! Initial value to seed the invlusive scan + //! + //! @param[in] scan_op + //! Binary scan functor + //! + //! @param[out] block_aggregate + //! Block-wide aggregate reduction of input items + template + _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan( + T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op, T& block_aggregate) + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate); + + // Exclusive scan in registers with prefix as seed + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix); + } + //! @rst //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes an array of consecutive input elements. diff --git a/cub/test/catch2_test_block_scan.cu b/cub/test/catch2_test_block_scan.cu index 0bd2e33fc6..d01d1b2d7c 100644 --- a/cub/test/catch2_test_block_scan.cu +++ b/cub/test/catch2_test_block_scan.cu @@ -533,8 +533,6 @@ CUB_TEST("Block custom op scan works with initial value", "[scan][block]", algor const type initial_value = static_cast(GENERATE_COPY(take(2, random(0, tile_size)))); - const int target_thread_id = GENERATE_COPY(take(2, random(0, threads_in_block - 1))); - block_scan( d_in, d_out, min_init_value_op_t{initial_value});