@@ -386,56 +386,17 @@ public:
386386 // ! prefix functor to maintain a running total between block-wide scans. Each tile consists
387387 // ! of 128 integer items that are partitioned across 128 threads.
388388 // !
389- // ! .. code-block:: c++
390- // !
391- // ! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
392- // !
393- // ! // A stateful callback functor that maintains a running prefix to be applied
394- // ! // during consecutive scan operations.
395- // ! struct BlockPrefixCallbackOp
396- // ! {
397- // ! // Running prefix
398- // ! int running_total;
399- // !
400- // ! // Constructor
401- // ! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
402- // !
403- // ! // Callback operator to be entered by the first warp of threads in the block.
404- // ! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
405- // ! __device__ int operator()(int block_aggregate)
406- // ! {
407- // ! int old_prefix = running_total;
408- // ! running_total += block_aggregate;
409- // ! return old_prefix;
410- // ! }
411- // ! };
412- // !
413- // ! __global__ void ExampleKernel(int *d_data, int num_items, ...)
414- // ! {
415- // ! // Specialize BlockScan for a 1D block of 128 threads
416- // ! using BlockScan = cub::BlockScan<int, 128>;
417- // !
418- // ! // Allocate shared memory for BlockScan
419- // ! __shared__ typename BlockScan::TempStorage temp_storage;
420- // !
421- // ! // Initialize running total
422- // ! BlockPrefixCallbackOp prefix_op(0);
423- // !
424- // ! // Have the block iterate over segments of items
425- // ! for (int block_offset = 0; block_offset < num_items; block_offset += 128)
426- // ! {
427- // ! // Load a segment of consecutive items that are blocked across threads
428- // ! int thread_data = d_data[block_offset + threadIdx.x];
429- // !
430- // ! // Collectively compute the block-wide exclusive prefix sum
431- // ! BlockScan(temp_storage).ExclusiveSum(
432- // ! thread_data, thread_data, prefix_op);
433- // ! __syncthreads();
389+ // ! .. literalinclude:: ../../examples/block/example_block_scan.cu
390+ // ! :language: c++
391+ // ! :dedent:
392+ // ! :start-after: example-begin block-prefix-callback-op
393+ // ! :end-before: example-end block-prefix-callback-op
434394 // !
435- // ! // Store scanned items to output segment
436- // ! d_data[block_offset + threadIdx.x] = thread_data;
437- // ! }
438- // ! }
395+ // ! .. literalinclude:: ../../examples/block/example_block_scan.cu
396+ // ! :language: c++
397+ // ! :dedent:
398+ // ! :start-after: example-begin exclusive-sum-single-prefix-callback
399+ // ! :end-before: example-end exclusive-sum-single-prefix-callback
439400 // !
440401 // ! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
441402 // ! The corresponding output for the first segment will be ``0, 1, ..., 127``.
@@ -484,25 +445,11 @@ public:
484445 // ! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
485446 // ! where each thread owns 4 consecutive items.
486447 // !
487- // ! .. code-block:: c++
488- // !
489- // ! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
490- // !
491- // ! __global__ void ExampleKernel(...)
492- // ! {
493- // ! // Specialize BlockScan for a 1D block of 128 threads of type int
494- // ! using BlockScan = cub::BlockScan<int, 128>;
495- // !
496- // ! // Allocate shared memory for BlockScan
497- // ! __shared__ typename BlockScan::TempStorage temp_storage;
498- // !
499- // ! // Obtain a segment of consecutive items that are blocked across threads
500- // ! int thread_data[4];
501- // ! ...
502- // !
503- // ! // Collectively compute the block-wide exclusive prefix sum
504- // ! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
505- // ! }
448+ // ! .. literalinclude:: ../../examples/block/example_block_scan.cu
449+ // ! :language: c++
450+ // ! :dedent:
451+ // ! :start-after: example-begin exclusive-sum-array
452+ // ! :end-before: example-end exclusive-sum-array
506453 // !
507454 // ! Suppose the set of input ``thread_data`` across the block of threads is
508455 // ! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
@@ -545,26 +492,11 @@ public:
545492 // ! a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads where each thread owns
546493 // ! 4 consecutive items.
547494 // !
548- // ! .. code-block:: c++
549- // !
550- // ! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
551- // !
552- // ! __global__ void ExampleKernel(...)
553- // ! {
554- // ! // Specialize BlockScan for a 1D block of 128 threads of type int
555- // ! using BlockScan = cub::BlockScan<int, 128>;
556- // !
557- // ! // Allocate shared memory for BlockScan
558- // ! __shared__ typename BlockScan::TempStorage temp_storage;
559- // !
560- // ! // Obtain a segment of consecutive items that are blocked across threads
561- // ! int thread_data[4];
562- // ! ...
563- // !
564- // ! // Collectively compute the block-wide exclusive prefix sum
565- // ! int block_aggregate;
566- // ! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
567- // ! }
495+ // ! .. literalinclude:: ../../examples/block/example_block_scan.cu
496+ // ! :language: c++
497+ // ! :dedent:
498+ // ! :start-after: example-begin exclusive-sum-array-aggregate
499+ // ! :end-before: example-end exclusive-sum-array-aggregate
568500 // !
569501 // ! Suppose the set of input ``thread_data`` across the block of threads is
570502 // ! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
@@ -681,25 +613,11 @@ public:
681613 // ! The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
682614 // ! are partitioned across 128 threads.
683615 // !
684- // ! .. code-block:: c++
685- // !
686- // ! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
687- // !
688- // ! __global__ void ExampleKernel(...)
689- // ! {
690- // ! // Specialize BlockScan for a 1D block of 128 threads of type int
691- // ! using BlockScan = cub::BlockScan<int, 128>;
692- // !
693- // ! // Allocate shared memory for BlockScan
694- // ! __shared__ typename BlockScan::TempStorage temp_storage;
695- // !
696- // ! // Obtain input item for each thread
697- // ! int thread_data;
698- // ! ...
699- // !
700- // ! // Collectively compute the block-wide exclusive prefix max scan
701- // ! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{});
702- // ! }
616+ // ! .. literalinclude:: ../../examples/block/example_block_scan.cu
617+ // ! :language: c++
618+ // ! :dedent:
619+ // ! :start-after: example-begin exclusive-scan-single
620+ // ! :end-before: example-end exclusive-scan-single
703621 // !
704622 // ! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
705623 // ! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
@@ -1338,25 +1256,11 @@ public:
13381256 // ! The code snippet below illustrates an inclusive prefix sum of 128 integer items that
13391257 // ! are partitioned across 128 threads.
13401258 // !
1341- // ! .. code-block:: c++
1342- // !
1343- // ! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1344- // !
1345- // ! __global__ void ExampleKernel(...)
1346- // ! {
1347- // ! // Specialize BlockScan for a 1D block of 128 threads of type int
1348- // ! using BlockScan = cub::BlockScan<int, 128>;
1349- // !
1350- // ! // Allocate shared memory for BlockScan
1351- // ! __shared__ typename BlockScan::TempStorage temp_storage;
1352- // !
1353- // ! // Obtain input item for each thread
1354- // ! int thread_data;
1355- // ! ...
1356- // !
1357- // ! // Collectively compute the block-wide inclusive prefix sum
1358- // ! int block_aggregate;
1359- // ! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
1259+ // ! .. literalinclude:: ../../examples/block/example_block_scan.cu
1260+ // ! :language: c++
1261+ // ! :dedent:
1262+ // ! :start-after: example-begin inclusive-sum-single-aggregate
1263+ // ! :end-before: example-end inclusive-sum-single-aggregate
13601264 // !
13611265 // ! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
13621266 // ! The corresponding output ``thread_data`` in those threads will be ``1, 2, ..., 128``.
@@ -1550,25 +1454,11 @@ public:
15501454 // ! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
15511455 // ! where each thread owns 4 consecutive items.
15521456 // !
1553- // ! .. code-block:: c++
1554- // !
1555- // ! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1556- // !
1557- // ! __global__ void ExampleKernel(...)
1558- // ! {
1559- // ! // Specialize BlockScan for a 1D block of 128 threads of type int
1560- // ! using BlockScan = cub::BlockScan<int, 128>;
1561- // !
1562- // ! // Allocate shared memory for BlockScan
1563- // ! __shared__ typename BlockScan::TempStorage temp_storage;
1564- // !
1565- // ! // Obtain a segment of consecutive items that are blocked across threads
1566- // ! int thread_data[4];
1567- // ! ...
1568- // !
1569- // ! // Collectively compute the block-wide inclusive prefix sum
1570- // ! int block_aggregate;
1571- // ! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
1457+ // ! .. literalinclude:: ../../examples/block/example_block_scan.cu
1458+ // ! :language: c++
1459+ // ! :dedent:
1460+ // ! :start-after: example-begin inclusive-sum-array-aggregate
1461+ // ! :end-before: example-end inclusive-sum-array-aggregate
15721462 // !
15731463 // ! Suppose the set of input ``thread_data`` across the block of threads is
15741464 // ! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. The
@@ -1710,24 +1600,11 @@ public:
17101600 // ! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
17111601 // ! are partitioned across 128 threads.
17121602 // !
1713- // ! .. code-block:: c++
1714- // !
1715- // ! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1716- // !
1717- // ! __global__ void ExampleKernel(...)
1718- // ! {
1719- // ! // Specialize BlockScan for a 1D block of 128 threads of type int
1720- // ! using BlockScan = cub::BlockScan<int, 128>;
1721- // !
1722- // ! // Allocate shared memory for BlockScan
1723- // ! __shared__ typename BlockScan::TempStorage temp_storage;
1724- // !
1725- // ! // Obtain input item for each thread
1726- // ! int thread_data;
1727- // ! ...
1728- // !
1729- // ! // Collectively compute the block-wide inclusive prefix max scan
1730- // ! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{});
1603+ // ! .. literalinclude:: ../../examples/block/example_block_scan.cu
1604+ // ! :language: c++
1605+ // ! :dedent:
1606+ // ! :start-after: example-begin inclusive-scan-single
1607+ // ! :end-before: example-end inclusive-scan-single
17311608 // !
17321609 // ! Suppose the set of input ``thread_data`` across the block of threads is
17331610 // ! ``0, -1, 2, -3, ..., 126, -127``. The corresponding output ``thread_data``
@@ -1903,24 +1780,11 @@ public:
19031780 // ! are partitioned in a [<em>blocked arrangement</em>](../index.html#sec5sec3) across 128 threads
19041781 // ! where each thread owns 4 consecutive items.
19051782 // !
1906- // ! .. code-block:: c++
1907- // !
1908- // ! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1909- // !
1910- // ! __global__ void ExampleKernel(...)
1911- // ! {
1912- // ! // Specialize BlockScan for a 1D block of 128 threads of type int
1913- // ! using BlockScan = cub::BlockScan<int, 128>;
1914- // !
1915- // ! // Allocate shared memory for BlockScan
1916- // ! __shared__ typename BlockScan::TempStorage temp_storage;
1917- // !
1918- // ! // Obtain a segment of consecutive items that are blocked across threads
1919- // ! int thread_data[4];
1920- // ! ...
1921- // !
1922- // ! // Collectively compute the block-wide inclusive prefix max scan
1923- // ! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{});
1783+ // ! .. literalinclude:: ../../examples/block/example_block_scan.cu
1784+ // ! :language: c++
1785+ // ! :dedent:
1786+ // ! :start-after: example-begin inclusive-scan-array
1787+ // ! :end-before: example-end inclusive-scan-array
19241788 // !
19251789 // ! Suppose the set of input ``thread_data`` across the block of threads is
19261790 // ! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
0 commit comments