Skip to content

Commit 1c797e9

Browse files
committed
Move more BlockScan examples to seperate executable and use literalinclude
1 parent 4d6db76 commit 1c797e9

File tree

2 files changed

+175
-186
lines changed

2 files changed

+175
-186
lines changed

cub/cub/block/block_scan.cuh

Lines changed: 45 additions & 181 deletions
Original file line numberDiff line numberDiff line change
@@ -386,56 +386,17 @@ public:
386386
//! prefix functor to maintain a running total between block-wide scans. Each tile consists
387387
//! of 128 integer items that are partitioned across 128 threads.
388388
//!
389-
//! .. code-block:: c++
390-
//!
391-
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
392-
//!
393-
//! // A stateful callback functor that maintains a running prefix to be applied
394-
//! // during consecutive scan operations.
395-
//! struct BlockPrefixCallbackOp
396-
//! {
397-
//! // Running prefix
398-
//! int running_total;
399-
//!
400-
//! // Constructor
401-
//! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
402-
//!
403-
//! // Callback operator to be entered by the first warp of threads in the block.
404-
//! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
405-
//! __device__ int operator()(int block_aggregate)
406-
//! {
407-
//! int old_prefix = running_total;
408-
//! running_total += block_aggregate;
409-
//! return old_prefix;
410-
//! }
411-
//! };
412-
//!
413-
//! __global__ void ExampleKernel(int *d_data, int num_items, ...)
414-
//! {
415-
//! // Specialize BlockScan for a 1D block of 128 threads
416-
//! using BlockScan = cub::BlockScan<int, 128>;
417-
//!
418-
//! // Allocate shared memory for BlockScan
419-
//! __shared__ typename BlockScan::TempStorage temp_storage;
420-
//!
421-
//! // Initialize running total
422-
//! BlockPrefixCallbackOp prefix_op(0);
423-
//!
424-
//! // Have the block iterate over segments of items
425-
//! for (int block_offset = 0; block_offset < num_items; block_offset += 128)
426-
//! {
427-
//! // Load a segment of consecutive items that are blocked across threads
428-
//! int thread_data = d_data[block_offset + threadIdx.x];
429-
//!
430-
//! // Collectively compute the block-wide exclusive prefix sum
431-
//! BlockScan(temp_storage).ExclusiveSum(
432-
//! thread_data, thread_data, prefix_op);
433-
//! __syncthreads();
389+
//! .. literalinclude:: ../../examples/block/example_block_scan.cu
390+
//! :language: c++
391+
//! :dedent:
392+
//! :start-after: example-begin block-prefix-callback-op
393+
//! :end-before: example-end block-prefix-callback-op
434394
//!
435-
//! // Store scanned items to output segment
436-
//! d_data[block_offset + threadIdx.x] = thread_data;
437-
//! }
438-
//! }
395+
//! .. literalinclude:: ../../examples/block/example_block_scan.cu
396+
//! :language: c++
397+
//! :dedent:
398+
//! :start-after: example-begin exclusive-sum-single-prefix-callback
399+
//! :end-before: example-end exclusive-sum-single-prefix-callback
439400
//!
440401
//! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
441402
//! The corresponding output for the first segment will be ``0, 1, ..., 127``.
@@ -484,25 +445,11 @@ public:
484445
//! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
485446
//! where each thread owns 4 consecutive items.
486447
//!
487-
//! .. code-block:: c++
488-
//!
489-
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
490-
//!
491-
//! __global__ void ExampleKernel(...)
492-
//! {
493-
//! // Specialize BlockScan for a 1D block of 128 threads of type int
494-
//! using BlockScan = cub::BlockScan<int, 128>;
495-
//!
496-
//! // Allocate shared memory for BlockScan
497-
//! __shared__ typename BlockScan::TempStorage temp_storage;
498-
//!
499-
//! // Obtain a segment of consecutive items that are blocked across threads
500-
//! int thread_data[4];
501-
//! ...
502-
//!
503-
//! // Collectively compute the block-wide exclusive prefix sum
504-
//! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
505-
//! }
448+
//! .. literalinclude:: ../../examples/block/example_block_scan.cu
449+
//! :language: c++
450+
//! :dedent:
451+
//! :start-after: example-begin exclusive-sum-array
452+
//! :end-before: example-end exclusive-sum-array
506453
//!
507454
//! Suppose the set of input ``thread_data`` across the block of threads is
508455
//! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
@@ -545,26 +492,11 @@ public:
545492
//! a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads where each thread owns
546493
//! 4 consecutive items.
547494
//!
548-
//! .. code-block:: c++
549-
//!
550-
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
551-
//!
552-
//! __global__ void ExampleKernel(...)
553-
//! {
554-
//! // Specialize BlockScan for a 1D block of 128 threads of type int
555-
//! using BlockScan = cub::BlockScan<int, 128>;
556-
//!
557-
//! // Allocate shared memory for BlockScan
558-
//! __shared__ typename BlockScan::TempStorage temp_storage;
559-
//!
560-
//! // Obtain a segment of consecutive items that are blocked across threads
561-
//! int thread_data[4];
562-
//! ...
563-
//!
564-
//! // Collectively compute the block-wide exclusive prefix sum
565-
//! int block_aggregate;
566-
//! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
567-
//! }
495+
//! .. literalinclude:: ../../examples/block/example_block_scan.cu
496+
//! :language: c++
497+
//! :dedent:
498+
//! :start-after: example-begin exclusive-sum-array-aggregate
499+
//! :end-before: example-end exclusive-sum-array-aggregate
568500
//!
569501
//! Suppose the set of input ``thread_data`` across the block of threads is
570502
//! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
@@ -681,25 +613,11 @@ public:
681613
//! The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
682614
//! are partitioned across 128 threads.
683615
//!
684-
//! .. code-block:: c++
685-
//!
686-
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
687-
//!
688-
//! __global__ void ExampleKernel(...)
689-
//! {
690-
//! // Specialize BlockScan for a 1D block of 128 threads of type int
691-
//! using BlockScan = cub::BlockScan<int, 128>;
692-
//!
693-
//! // Allocate shared memory for BlockScan
694-
//! __shared__ typename BlockScan::TempStorage temp_storage;
695-
//!
696-
//! // Obtain input item for each thread
697-
//! int thread_data;
698-
//! ...
699-
//!
700-
//! // Collectively compute the block-wide exclusive prefix max scan
701-
//! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{});
702-
//! }
616+
//! .. literalinclude:: ../../examples/block/example_block_scan.cu
617+
//! :language: c++
618+
//! :dedent:
619+
//! :start-after: example-begin exclusive-scan-single
620+
//! :end-before: example-end exclusive-scan-single
703621
//!
704622
//! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
705623
//! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
@@ -1338,25 +1256,11 @@ public:
13381256
//! The code snippet below illustrates an inclusive prefix sum of 128 integer items that
13391257
//! are partitioned across 128 threads.
13401258
//!
1341-
//! .. code-block:: c++
1342-
//!
1343-
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1344-
//!
1345-
//! __global__ void ExampleKernel(...)
1346-
//! {
1347-
//! // Specialize BlockScan for a 1D block of 128 threads of type int
1348-
//! using BlockScan = cub::BlockScan<int, 128>;
1349-
//!
1350-
//! // Allocate shared memory for BlockScan
1351-
//! __shared__ typename BlockScan::TempStorage temp_storage;
1352-
//!
1353-
//! // Obtain input item for each thread
1354-
//! int thread_data;
1355-
//! ...
1356-
//!
1357-
//! // Collectively compute the block-wide inclusive prefix sum
1358-
//! int block_aggregate;
1359-
//! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
1259+
//! .. literalinclude:: ../../examples/block/example_block_scan.cu
1260+
//! :language: c++
1261+
//! :dedent:
1262+
//! :start-after: example-begin inclusive-sum-single-aggregate
1263+
//! :end-before: example-end inclusive-sum-single-aggregate
13601264
//!
13611265
//! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
13621266
//! The corresponding output ``thread_data`` in those threads will be ``1, 2, ..., 128``.
@@ -1550,25 +1454,11 @@ public:
15501454
//! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
15511455
//! where each thread owns 4 consecutive items.
15521456
//!
1553-
//! .. code-block:: c++
1554-
//!
1555-
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1556-
//!
1557-
//! __global__ void ExampleKernel(...)
1558-
//! {
1559-
//! // Specialize BlockScan for a 1D block of 128 threads of type int
1560-
//! using BlockScan = cub::BlockScan<int, 128>;
1561-
//!
1562-
//! // Allocate shared memory for BlockScan
1563-
//! __shared__ typename BlockScan::TempStorage temp_storage;
1564-
//!
1565-
//! // Obtain a segment of consecutive items that are blocked across threads
1566-
//! int thread_data[4];
1567-
//! ...
1568-
//!
1569-
//! // Collectively compute the block-wide inclusive prefix sum
1570-
//! int block_aggregate;
1571-
//! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
1457+
//! .. literalinclude:: ../../examples/block/example_block_scan.cu
1458+
//! :language: c++
1459+
//! :dedent:
1460+
//! :start-after: example-begin inclusive-sum-array-aggregate
1461+
//! :end-before: example-end inclusive-sum-array-aggregate
15721462
//!
15731463
//! Suppose the set of input ``thread_data`` across the block of threads is
15741464
//! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. The
@@ -1710,24 +1600,11 @@ public:
17101600
//! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
17111601
//! are partitioned across 128 threads.
17121602
//!
1713-
//! .. code-block:: c++
1714-
//!
1715-
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1716-
//!
1717-
//! __global__ void ExampleKernel(...)
1718-
//! {
1719-
//! // Specialize BlockScan for a 1D block of 128 threads of type int
1720-
//! using BlockScan = cub::BlockScan<int, 128>;
1721-
//!
1722-
//! // Allocate shared memory for BlockScan
1723-
//! __shared__ typename BlockScan::TempStorage temp_storage;
1724-
//!
1725-
//! // Obtain input item for each thread
1726-
//! int thread_data;
1727-
//! ...
1728-
//!
1729-
//! // Collectively compute the block-wide inclusive prefix max scan
1730-
//! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{});
1603+
//! .. literalinclude:: ../../examples/block/example_block_scan.cu
1604+
//! :language: c++
1605+
//! :dedent:
1606+
//! :start-after: example-begin inclusive-scan-single
1607+
//! :end-before: example-end inclusive-scan-single
17311608
//!
17321609
//! Suppose the set of input ``thread_data`` across the block of threads is
17331610
//! ``0, -1, 2, -3, ..., 126, -127``. The corresponding output ``thread_data``
@@ -1903,24 +1780,11 @@ public:
19031780
//! are partitioned in a [<em>blocked arrangement</em>](../index.html#sec5sec3) across 128 threads
19041781
//! where each thread owns 4 consecutive items.
19051782
//!
1906-
//! .. code-block:: c++
1907-
//!
1908-
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1909-
//!
1910-
//! __global__ void ExampleKernel(...)
1911-
//! {
1912-
//! // Specialize BlockScan for a 1D block of 128 threads of type int
1913-
//! using BlockScan = cub::BlockScan<int, 128>;
1914-
//!
1915-
//! // Allocate shared memory for BlockScan
1916-
//! __shared__ typename BlockScan::TempStorage temp_storage;
1917-
//!
1918-
//! // Obtain a segment of consecutive items that are blocked across threads
1919-
//! int thread_data[4];
1920-
//! ...
1921-
//!
1922-
//! // Collectively compute the block-wide inclusive prefix max scan
1923-
//! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{});
1783+
//! .. literalinclude:: ../../examples/block/example_block_scan.cu
1784+
//! :language: c++
1785+
//! :dedent:
1786+
//! :start-after: example-begin inclusive-scan-array
1787+
//! :end-before: example-end inclusive-scan-array
19241788
//!
19251789
//! Suppose the set of input ``thread_data`` across the block of threads is
19261790
//! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.

0 commit comments

Comments
 (0)