Skip to content

Commit

Permalink
Add docs for block scan inclusive scan
Browse files Browse the repository at this point in the history
  • Loading branch information
gonidelis committed May 20, 2024
1 parent f3142fa commit ce4b01b
Show file tree
Hide file tree
Showing 2 changed files with 281 additions and 43 deletions.
322 changes: 281 additions & 41 deletions cub/cub/block/block_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -819,12 +819,6 @@ public:
InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op);
}

template <typename ScanOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, T initial_value, ScanOp scan_op)
{
InternalBlockScan(temp_storage).InclusiveScan(input, output, initial_value, scan_op);
}

//! @rst
//! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
//! Each thread contributes one input element.
Expand Down Expand Up @@ -892,13 +886,6 @@ public:
InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate);
}

template <typename ScanOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void
InclusiveScan(T input, T& output, T initial_value, ScanOp scan_op, T& block_aggregate)
{
InternalBlockScan(temp_storage).InclusiveScan(input, output, initial_value, scan_op, block_aggregate);
}

//! @rst
//! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
//! Each thread contributes one input element. The call-back functor ``block_prefix_callback_op`` is invoked by
Expand Down Expand Up @@ -1085,20 +1072,6 @@ public:
internal::ThreadScanExclusive(input, output, scan_op, thread_prefix);
}

template <int ITEMS_PER_THREAD, typename ScanOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void
InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op)
{
// Reduce consecutive thread items in registers
T thread_prefix = internal::ThreadReduce(input, scan_op);

// Exclusive thread block-scan
ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op);

// Exclusive scan in registers with prefix as seed
internal::ThreadScanInclusive(input, output, scan_op, thread_prefix);
}

//! @rst
//! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
//! Each thread contributes an array of consecutive input elements.
Expand Down Expand Up @@ -1180,20 +1153,6 @@ public:
internal::ThreadScanExclusive(input, output, scan_op, thread_prefix);
}

template <int ITEMS_PER_THREAD, typename ScanOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(
T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op, T& block_aggregate)
{
// Reduce consecutive thread items in registers
T thread_prefix = internal::ThreadReduce(input, scan_op);

// Exclusive thread block-scan
ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate);

// Exclusive scan in registers with prefix as seed
internal::ThreadScanInclusive(input, output, scan_op, thread_prefix);
}

//! @rst
//! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
//! Each thread contributes an array of consecutive input elements.
Expand Down Expand Up @@ -2031,6 +1990,65 @@ public:
InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op);
}

//! @rst
//! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
//! Each thread contributes one input element.
//!
//! - Supports non-commutative scan operators.
//! - @rowmajor
//! - @smemreuse
//!
//! Snippet
//! +++++++
//!
//! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
//! are partitioned across 128 threads.
//!
//! .. code-block:: c++
//!
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
//!
//! __global__ void ExampleKernel(...)
//! {
//! // Specialize BlockScan for a 1D block of 128 threads of type int
//! typedef cub::BlockScan<int, 128> BlockScan;
//!
//! // Allocate shared memory for BlockScan
//! __shared__ typename BlockScan::TempStorage temp_storage;
//!
//! // Obtain input item for each thread
//! int thread_data;
//! ...
//!
//! // Collectively compute the block-wide inclusive prefix max scan
//! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
//!
//! Suppose the set of input ``thread_data`` across the block of threads is
//! ``0, -1, 2, -3, ..., 126, -127``. The corresponding output ``thread_data``
//! in those threads will be ``0, 0, 2, 2, ..., 126, 126``.
//!
//! @endrst
//!
//! @tparam ScanOp
//! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
//!
//! @param[in] input
//! Calling thread's input item
//!
//! @param[out] output
//! Calling thread's output item (may be aliased to `input`)
//!
//! @param[in] initial_value
//! Initial value to seed the invlusive scan
//!
//! @param[in] scan_op
//! Binary scan functor
template <typename ScanOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, T initial_value, ScanOp scan_op)
{
InternalBlockScan(temp_storage).InclusiveScan(input, output, initial_value, scan_op);
}

//! @rst
//! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
//! Each thread contributes one input element. Also provides every thread with the block-wide
Expand Down Expand Up @@ -2093,6 +2111,72 @@ public:
InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate);
}

//! @rst
//! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
//! Each thread contributes one input element. Also provides every thread with the block-wide
//! ``block_aggregate`` of all inputs.
//!
//! - Supports non-commutative scan operators.
//! - @rowmajor
//! - @smemreuse
//!
//! Snippet
//! +++++++
//!
//! The code snippet below illustrates an inclusive prefix max scan of 128
//! integer items that are partitioned across 128 threads.
//!
//! .. code-block:: c++
//!
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
//!
//! __global__ void ExampleKernel(...)
//! {
//! // Specialize BlockScan for a 1D block of 128 threads of type int
//! typedef cub::BlockScan<int, 128> BlockScan;
//!
//! // Allocate shared memory for BlockScan
//! __shared__ typename BlockScan::TempStorage temp_storage;
//!
//! // Obtain input item for each thread
//! int thread_data;
//! ...
//!
//! // Collectively compute the block-wide inclusive prefix max scan
//! int block_aggregate;
//! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate);
//!
//! Suppose the set of input ``thread_data`` across the block of threads is
//! ``0, -1, 2, -3, ..., 126, -127``. The corresponding output ``thread_data``
//! in those threads will be ``0, 0, 2, 2, ..., 126, 126``. Furthermore the value
//! ``126`` will be stored in ``block_aggregate`` for all threads.
//!
//! @endrst
//!
//! @tparam ScanOp
//! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
//!
//! @param[in] input
//! Calling thread's input item
//!
//! @param[out] output
//! Calling thread's output item (may be aliased to `input`)
//!
//! @param[in] initial_value
//! Initial value to seed the invlusive scan
//!
//! @param[in] scan_op
//! Binary scan functor
//!
//! @param[out] block_aggregate
//! Block-wide aggregate reduction of input items
template <typename ScanOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void
InclusiveScan(T input, T& output, T initial_value, ScanOp scan_op, T& block_aggregate)
{
InternalBlockScan(temp_storage).InclusiveScan(input, output, initial_value, scan_op, block_aggregate);
}

//! @rst
//! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
//! Each thread contributes one input element. The call-back functor ``block_prefix_callback_op``
Expand Down Expand Up @@ -2283,6 +2367,83 @@ public:
}
}

//! @} end member group
//! @name Inclusive prefix scan operations (multiple data per thread)
//! @{

//! @rst
//! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
//! Each thread contributes an array of consecutive input elements.
//!
//! - Supports non-commutative scan operators.
//! - @blocked
//! - @granularity
//! - @smemreuse
//!
//! Snippet
//! +++++++
//!
//! The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
//! are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
//! where each thread owns 4 consecutive items.
//!
//! .. code-block:: c++
//!
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
//!
//! __global__ void ExampleKernel(...)
//! {
//! // Specialize BlockScan for a 1D block of 128 threads of type int
//! typedef cub::BlockScan<int, 128> BlockScan;
//!
//! // Allocate shared memory for BlockScan
//! __shared__ typename BlockScan::TempStorage temp_storage;
//!
//! // Obtain a segment of consecutive items that are blocked across threads
//! int thread_data[4];
//! ...
//!
//! // Collectively compute the block-wide inclusive prefix max scan
//! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
//!
//! Suppose the set of input ``thread_data`` across the block of threads is
//! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
//! The corresponding output ``thread_data`` in those threads will be
//! ``{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }``.
//!
//! @endrst
//!
//! @tparam ITEMS_PER_THREAD
//! **[inferred]** The number of consecutive items partitioned onto each thread.
//!
//! @tparam ScanOp
//! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
//!
//! @param[in] input
//! Calling thread's input items
//!
//! @param[out] output
//! Calling thread's output items (may be aliased to `input`)
//!
//! @param[in] initial_value
//! Initial value to seed the invlusive scan
//!
//! @param[in] scan_op
//! Binary scan functor
template <int ITEMS_PER_THREAD, typename ScanOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void
InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op)
{
// Reduce consecutive thread items in registers
T thread_prefix = internal::ThreadReduce(input, scan_op);

// Exclusive thread block-scan
ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op);

// Exclusive scan in registers with prefix as seed
internal::ThreadScanInclusive(input, output, scan_op, thread_prefix);
}

//! @rst
//! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
//! Each thread contributes an array of consecutive input elements. Also provides every thread
Expand Down Expand Up @@ -2366,6 +2527,85 @@ public:
}
}

//! @rst
//! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
//! Each thread contributes an array of consecutive input elements. Also provides every thread
//! with the block-wide ``block_aggregate`` of all inputs.
//!
//! - Supports non-commutative scan operators.
//! - @blocked
//! - @granularity
//! - @smemreuse
//!
//! Snippet
//! +++++++
//!
//! The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
//! are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
//! where each thread owns 4 consecutive items.
//!
//! .. code-block:: c++
//!
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
//!
//! __global__ void ExampleKernel(...)
//! {
//! // Specialize BlockScan for a 1D block of 128 threads of type int
//! typedef cub::BlockScan<int, 128> BlockScan;
//!
//! // Allocate shared memory for BlockScan
//! __shared__ typename BlockScan::TempStorage temp_storage;
//!
//! // Obtain a segment of consecutive items that are blocked across threads
//! int thread_data[4];
//! ...
//!
//! // Collectively compute the block-wide inclusive prefix max scan
//! int block_aggregate;
//! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate);
//!
//! Suppose the set of input ``thread_data`` across the block of threads is
//! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
//! The corresponding output ``thread_data`` in those threads will be
//! ``{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }``.
//! Furthermore the value ``510`` will be stored in ``block_aggregate`` for all threads.
//!
//! @endrst
//!
//! @tparam ITEMS_PER_THREAD
//! **[inferred]** The number of consecutive items partitioned onto each thread.
//!
//! @tparam ScanOp
//! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
//!
//! @param[in] input
//! Calling thread's input items
//!
//! @param[out] output
//! Calling thread's output items (may be aliased to `input`)
//!
//! @param[in] initial_value
//! Initial value to seed the invlusive scan
//!
//! @param[in] scan_op
//! Binary scan functor
//!
//! @param[out] block_aggregate
//! Block-wide aggregate reduction of input items
template <int ITEMS_PER_THREAD, typename ScanOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(
T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op, T& block_aggregate)
{
// Reduce consecutive thread items in registers
T thread_prefix = internal::ThreadReduce(input, scan_op);

// Exclusive thread block-scan
ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate);

// Exclusive scan in registers with prefix as seed
internal::ThreadScanInclusive(input, output, scan_op, thread_prefix);
}

//! @rst
//! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
//! Each thread contributes an array of consecutive input elements.
Expand Down
2 changes: 0 additions & 2 deletions cub/test/catch2_test_block_scan.cu
Original file line number Diff line number Diff line change
Expand Up @@ -533,8 +533,6 @@ CUB_TEST("Block custom op scan works with initial value", "[scan][block]", algor

const type initial_value = static_cast<type>(GENERATE_COPY(take(2, random(0, tile_size))));

const int target_thread_id = GENERATE_COPY(take(2, random(0, threads_in_block - 1)));

block_scan<algorithm, items_per_thread, block_dim_x, block_dim_y, block_dim_z>(
d_in, d_out, min_init_value_op_t<type, mode>{initial_value});

Expand Down

0 comments on commit ce4b01b

Please sign in to comment.