Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
226 changes: 45 additions & 181 deletions cub/cub/block/block_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -362,56 +362,17 @@ public:
//! prefix functor to maintain a running total between block-wide scans. Each tile consists
//! of 128 integer items that are partitioned across 128 threads.
//!
//! .. code-block:: c++
//!
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
//!
//! // A stateful callback functor that maintains a running prefix to be applied
//! // during consecutive scan operations.
//! struct BlockPrefixCallbackOp
//! {
//! // Running prefix
//! int running_total;
//!
//! // Constructor
//! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
//!
//! // Callback operator to be entered by the first warp of threads in the block.
//! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
//! __device__ int operator()(int block_aggregate)
//! {
//! int old_prefix = running_total;
//! running_total += block_aggregate;
//! return old_prefix;
//! }
//! };
//!
//! __global__ void ExampleKernel(int *d_data, int num_items, ...)
//! {
//! // Specialize BlockScan for a 1D block of 128 threads
//! using BlockScan = cub::BlockScan<int, 128>;
//!
//! // Allocate shared memory for BlockScan
//! __shared__ typename BlockScan::TempStorage temp_storage;
//!
//! // Initialize running total
//! BlockPrefixCallbackOp prefix_op(0);
//!
//! // Have the block iterate over segments of items
//! for (int block_offset = 0; block_offset < num_items; block_offset += 128)
//! {
//! // Load a segment of consecutive items that are blocked across threads
//! int thread_data = d_data[block_offset + threadIdx.x];
//!
//! // Collectively compute the block-wide exclusive prefix sum
//! BlockScan(temp_storage).ExclusiveSum(
//! thread_data, thread_data, prefix_op);
//! __syncthreads();
//! .. literalinclude:: ../../examples/block/example_block_scan.cu
//! :language: c++
//! :dedent:
//! :start-after: example-begin block-prefix-callback-op
//! :end-before: example-end block-prefix-callback-op
//!
//! // Store scanned items to output segment
//! d_data[block_offset + threadIdx.x] = thread_data;
//! }
//! }
//! .. literalinclude:: ../../examples/block/example_block_scan.cu
//! :language: c++
//! :dedent:
//! :start-after: example-begin exclusive-sum-single-prefix-callback
//! :end-before: example-end exclusive-sum-single-prefix-callback
//!
//! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
//! The corresponding output for the first segment will be ``0, 1, ..., 127``.
Expand Down Expand Up @@ -460,25 +421,11 @@ public:
//! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
//! where each thread owns 4 consecutive items.
//!
//! .. code-block:: c++
//!
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
//!
//! __global__ void ExampleKernel(...)
//! {
//! // Specialize BlockScan for a 1D block of 128 threads of type int
//! using BlockScan = cub::BlockScan<int, 128>;
//!
//! // Allocate shared memory for BlockScan
//! __shared__ typename BlockScan::TempStorage temp_storage;
//!
//! // Obtain a segment of consecutive items that are blocked across threads
//! int thread_data[4];
//! ...
//!
//! // Collectively compute the block-wide exclusive prefix sum
//! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
//! }
//! .. literalinclude:: ../../examples/block/example_block_scan.cu
//! :language: c++
//! :dedent:
//! :start-after: example-begin exclusive-sum-array
//! :end-before: example-end exclusive-sum-array
//!
//! Suppose the set of input ``thread_data`` across the block of threads is
//! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
Expand Down Expand Up @@ -521,26 +468,11 @@ public:
//! a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads where each thread owns
//! 4 consecutive items.
//!
//! .. code-block:: c++
//!
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
//!
//! __global__ void ExampleKernel(...)
//! {
//! // Specialize BlockScan for a 1D block of 128 threads of type int
//! using BlockScan = cub::BlockScan<int, 128>;
//!
//! // Allocate shared memory for BlockScan
//! __shared__ typename BlockScan::TempStorage temp_storage;
//!
//! // Obtain a segment of consecutive items that are blocked across threads
//! int thread_data[4];
//! ...
//!
//! // Collectively compute the block-wide exclusive prefix sum
//! int block_aggregate;
//! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
//! }
//! .. literalinclude:: ../../examples/block/example_block_scan.cu
//! :language: c++
//! :dedent:
//! :start-after: example-begin exclusive-sum-array-aggregate
//! :end-before: example-end exclusive-sum-array-aggregate
//!
//! Suppose the set of input ``thread_data`` across the block of threads is
//! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
Expand Down Expand Up @@ -657,25 +589,11 @@ public:
//! The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
//! are partitioned across 128 threads.
//!
//! .. code-block:: c++
//!
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
//!
//! __global__ void ExampleKernel(...)
//! {
//! // Specialize BlockScan for a 1D block of 128 threads of type int
//! using BlockScan = cub::BlockScan<int, 128>;
//!
//! // Allocate shared memory for BlockScan
//! __shared__ typename BlockScan::TempStorage temp_storage;
//!
//! // Obtain input item for each thread
//! int thread_data;
//! ...
//!
//! // Collectively compute the block-wide exclusive prefix max scan
//! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{});
//! }
//! .. literalinclude:: ../../examples/block/example_block_scan.cu
//! :language: c++
//! :dedent:
//! :start-after: example-begin exclusive-scan-single
//! :end-before: example-end exclusive-scan-single
//!
//! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
//! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
Expand Down Expand Up @@ -1314,25 +1232,11 @@ public:
//! The code snippet below illustrates an inclusive prefix sum of 128 integer items that
//! are partitioned across 128 threads.
//!
//! .. code-block:: c++
//!
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
//!
//! __global__ void ExampleKernel(...)
//! {
//! // Specialize BlockScan for a 1D block of 128 threads of type int
//! using BlockScan = cub::BlockScan<int, 128>;
//!
//! // Allocate shared memory for BlockScan
//! __shared__ typename BlockScan::TempStorage temp_storage;
//!
//! // Obtain input item for each thread
//! int thread_data;
//! ...
//!
//! // Collectively compute the block-wide inclusive prefix sum
//! int block_aggregate;
//! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
//! .. literalinclude:: ../../examples/block/example_block_scan.cu
//! :language: c++
//! :dedent:
//! :start-after: example-begin inclusive-sum-single-aggregate
//! :end-before: example-end inclusive-sum-single-aggregate
//!
//! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
//! The corresponding output ``thread_data`` in those threads will be ``1, 2, ..., 128``.
Expand Down Expand Up @@ -1526,25 +1430,11 @@ public:
//! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
//! where each thread owns 4 consecutive items.
//!
//! .. code-block:: c++
//!
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
//!
//! __global__ void ExampleKernel(...)
//! {
//! // Specialize BlockScan for a 1D block of 128 threads of type int
//! using BlockScan = cub::BlockScan<int, 128>;
//!
//! // Allocate shared memory for BlockScan
//! __shared__ typename BlockScan::TempStorage temp_storage;
//!
//! // Obtain a segment of consecutive items that are blocked across threads
//! int thread_data[4];
//! ...
//!
//! // Collectively compute the block-wide inclusive prefix sum
//! int block_aggregate;
//! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
//! .. literalinclude:: ../../examples/block/example_block_scan.cu
//! :language: c++
//! :dedent:
//! :start-after: example-begin inclusive-sum-array-aggregate
//! :end-before: example-end inclusive-sum-array-aggregate
//!
//! Suppose the set of input ``thread_data`` across the block of threads is
//! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. The
Expand Down Expand Up @@ -1686,24 +1576,11 @@ public:
//! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
//! are partitioned across 128 threads.
//!
//! .. code-block:: c++
//!
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
//!
//! __global__ void ExampleKernel(...)
//! {
//! // Specialize BlockScan for a 1D block of 128 threads of type int
//! using BlockScan = cub::BlockScan<int, 128>;
//!
//! // Allocate shared memory for BlockScan
//! __shared__ typename BlockScan::TempStorage temp_storage;
//!
//! // Obtain input item for each thread
//! int thread_data;
//! ...
//!
//! // Collectively compute the block-wide inclusive prefix max scan
//! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{});
//! .. literalinclude:: ../../examples/block/example_block_scan.cu
//! :language: c++
//! :dedent:
//! :start-after: example-begin inclusive-scan-single
//! :end-before: example-end inclusive-scan-single
//!
//! Suppose the set of input ``thread_data`` across the block of threads is
//! ``0, -1, 2, -3, ..., 126, -127``. The corresponding output ``thread_data``
Expand Down Expand Up @@ -1879,24 +1756,11 @@ public:
//! are partitioned in a [<em>blocked arrangement</em>](../index.html#sec5sec3) across 128 threads
//! where each thread owns 4 consecutive items.
//!
//! .. code-block:: c++
//!
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
//!
//! __global__ void ExampleKernel(...)
//! {
//! // Specialize BlockScan for a 1D block of 128 threads of type int
//! using BlockScan = cub::BlockScan<int, 128>;
//!
//! // Allocate shared memory for BlockScan
//! __shared__ typename BlockScan::TempStorage temp_storage;
//!
//! // Obtain a segment of consecutive items that are blocked across threads
//! int thread_data[4];
//! ...
//!
//! // Collectively compute the block-wide inclusive prefix max scan
//! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{});
//! .. literalinclude:: ../../examples/block/example_block_scan.cu
//! :language: c++
//! :dedent:
//! :start-after: example-begin inclusive-scan-array
//! :end-before: example-end inclusive-scan-array
//!
//! Suppose the set of input ``thread_data`` across the block of threads is
//! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
Expand Down
Loading