From b7bfd7f98c85a2439b9711413a2dae0ff6c2c271 Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko
Date: Wed, 21 Jun 2023 19:45:56 +0400
Subject: [PATCH] Modernize warp store docs

---
 cub/warp/warp_store.cuh | 564 ++++++++++++++++++++--------------------
 1 file changed, 289 insertions(+), 275 deletions(-)

diff --git a/cub/warp/warp_store.cuh b/cub/warp/warp_store.cuh
index acee3d01c..c125e6d12 100644
--- a/cub/warp/warp_store.cuh
+++ b/cub/warp/warp_store.cuh
@@ -25,10 +25,7 @@
  *
  ******************************************************************************/
 
-/**
- * @file
- * Operations for writing linear segments of data from the CUDA warp
- */
+//! @file Operations for writing linear segments of data from the CUDA warp
 
 #pragma once
 
@@ -45,161 +42,183 @@ CUB_NAMESPACE_BEGIN
 
-/**
- * @brief cub::WarpStoreAlgorithm enumerates alternative algorithms for
- * cub::WarpStore to write a blocked arrangement of items across a CUDA
- * warp to a linear segment of memory.
- */
+//! @rst
+//! ``cub::WarpStoreAlgorithm`` enumerates alternative algorithms for :cpp:struct:`cub::WarpStore`
+//! to write a blocked arrangement of items across a CUDA warp to a linear segment of memory.
+//! @endrst
 enum WarpStoreAlgorithm
 {
- /**
- * @par Overview
- * A [blocked arrangement](index.html#sec5sec3) of data is written
- * directly to memory.
- *
- * @par Performance Considerations
- * The utilization of memory transactions (coalescing) decreases as the
- * access stride between threads increases (i.e., the number items per thread).
- */
+ //! @rst
+ //! Overview
+ //! ++++++++++++++++++++++++++
+ //!
+ //! A :ref:`blocked arrangement ` of data is written directly
+ //! to memory.
+ //!
+ //! Performance Considerations
+ //! ++++++++++++++++++++++++++
+ //!
+ //! The utilization of memory transactions (coalescing) decreases as the
+ //! access stride between threads increases (i.e., the number of items per thread).
+ //! @endrst
 WARP_STORE_DIRECT,
 
- /**
- * @par Overview
- * A [striped arrangement](index.html#sec5sec3) of data is written
- * directly to memory.
- *
- * @par Performance Considerations
- * The utilization of memory transactions (coalescing) remains high regardless
- * of items written per thread.
- */
+ //! @rst
+ //! Overview
+ //! ++++++++++++++++++++++++++
+ //!
+ //! A :ref:`striped arrangement ` of data is written
+ //! directly to memory.
+ //!
+ //! Performance Considerations
+ //! ++++++++++++++++++++++++++
+ //!
+ //! The utilization of memory transactions (coalescing) remains high regardless
+ //! of items written per thread.
+ //! @endrst
 WARP_STORE_STRIPED,
 
- /**
- * @par Overview
- *
- * A [blocked arrangement](index.html#sec5sec3) of data is written
- * directly to memory using CUDA's built-in vectorized stores as a coalescing
- * optimization. For example, st.global.v4.s32 instructions will be
- * generated when @p T = @p int and @p ITEMS_PER_THREAD % 4 == 0.
- *
- * @par Performance Considerations
- * - The utilization of memory transactions (coalescing) remains high until
- *   the the access stride between threads (i.e., the number items per thread)
- *   exceeds the maximum vector store width (typically 4 items or 64B,
- *   whichever is lower).
- * - The following conditions will prevent vectorization and writing will fall
- *   back to cub::WARP_STORE_DIRECT:
- *   - @p ITEMS_PER_THREAD is odd
- *   - The @p OutputIteratorT is not a simple pointer type
- *   - The block output offset is not quadword-aligned
- *   - The data type @p T is not a built-in primitive or CUDA vector type
- *     (e.g., @p short, @p int2, @p double, @p float2, etc.)
- */
+ //! @rst
+ //! Overview
+ //! ++++++++++++++++++++++++++
+ //!
+ //! A :ref:`blocked arrangement ` of data is written
+ //! directly to memory using CUDA's built-in vectorized stores as a coalescing
+ //! optimization. For example, ``st.global.v4.s32`` instructions will be
+ //! generated when ``T = int`` and ``ITEMS_PER_THREAD % 4 == 0``.
+ //!
+ //! Performance Considerations
+ //! ++++++++++++++++++++++++++
+ //!
+ //! * The utilization of memory transactions (coalescing) remains high until
+ //!   the access stride between threads (i.e., the number of items per thread)
+ //!   exceeds the maximum vector store width (typically 4 items or 64B,
+ //!   whichever is lower).
+ //! * The following conditions will prevent vectorization and writing will fall
+ //!   back to ``cub::WARP_STORE_DIRECT``:
+ //!
+ //!   * ``ITEMS_PER_THREAD`` is odd
+ //!   * The ``OutputIteratorT`` is not a simple pointer type
+ //!   * The block output offset is not quadword-aligned
+ //!   * The data type ``T`` is not a built-in primitive or CUDA vector type
+ //!     (e.g., ``short``, ``int2``, ``double``, ``float2``, etc.)
+ //!
+ //! @endrst
 WARP_STORE_VECTORIZE,
 
- /**
- * @par Overview
- * A [blocked arrangement](index.html#sec5sec3) is locally
- * transposed and then efficiently written to memory as a
- * [striped arrangement](index.html#sec5sec3).
- *
- * @par Performance Considerations
- * - The utilization of memory transactions (coalescing) remains high
- *   regardless of items written per thread.
- * - The local reordering incurs slightly longer latencies and throughput than the
- *   direct cub::WARP_STORE_DIRECT and cub::WARP_STORE_VECTORIZE alternatives.
- */
+ //! @rst
+ //! Overview
+ //! ++++++++++++++++++++++++++
+ //!
+ //! A :ref:`blocked arrangement ` is locally
+ //! transposed and then efficiently written to memory as a
+ //! :ref:`striped arrangement `.
+ //!
+ //! Performance Considerations
+ //! ++++++++++++++++++++++++++
+ //!
+ //! * The utilization of memory transactions (coalescing) remains high
+ //!   regardless of items written per thread.
+ //! * The local reordering incurs slightly longer latencies and lower throughput than the
+ //!   direct ``cub::WARP_STORE_DIRECT`` and ``cub::WARP_STORE_VECTORIZE`` alternatives.
+ //!
+ //! @endrst
 WARP_STORE_TRANSPOSE
 };
 
-/**
- * @brief The WarpStore class provides [collective](index.html#sec0)
- * data movement methods for writing a [blocked arrangement](index.html#sec5sec3)
- * of items partitioned across a CUDA warp to a linear segment of memory.
- * @ingroup WarpModule
- * @ingroup UtilIo
- *
- * @tparam T
- * The type of data to be written.
- *
- * @tparam ITEMS_PER_THREAD
- * The number of consecutive items partitioned onto each thread.
- *
- * @tparam ALGORITHM
- * [optional] cub::WarpStoreAlgorithm tuning policy enumeration.
- * default: cub::WARP_STORE_DIRECT.
- *
- * @tparam LOGICAL_WARP_THREADS
- * [optional] The number of threads per "logical" warp (may be less
- * than the number of hardware warp threads). Default is the warp size of the
- * targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a
- * power of two.
- *
- * @tparam LEGACY_PTX_ARCH
- * Unused.
- *
- * @par Overview
- * - The WarpStore class provides a single data movement abstraction that can be
- *   specialized to implement different cub::WarpStoreAlgorithm strategies. This
- *   facilitates different performance policies for different architectures,
- *   data types, granularity sizes, etc.
- * - WarpStore can be optionally specialized by different data movement strategies:
- *   -# cub::WARP_STORE_DIRECT. A [blocked arrangement](index.html#sec5sec3)
- *      of data is written directly to memory. [More...](@ref cub::WarpStoreAlgorithm)
- *   -# cub::WARP_STORE_STRIPED. A [striped arrangement](index.html#sec5sec3)
- *      of data is written directly to memory. [More...](@ref cub::WarpStoreAlgorithm)
- *   -# cub::WARP_STORE_VECTORIZE. A [blocked arrangement](index.html#sec5sec3)
- *      of data is written directly to memory using CUDA's built-in vectorized
- *      stores as a coalescing optimization. [More...](@ref cub::WarpStoreAlgorithm)
- *   -# cub::WARP_STORE_TRANSPOSE. A [blocked arrangement](index.html#sec5sec3)
- *      is locally transposed into a [striped arrangement](index.html#sec5sec3)
- *      which is then written to memory. [More...](@ref cub::WarpStoreAlgorithm)
- * - \rowmajor
- *
- * @par A Simple Example
- * @par
- * The code snippet below illustrates the storing of a "blocked" arrangement
- * of 64 integers across 16 threads (where each thread owns 4 consecutive items)
- * into a linear segment of memory. The store is specialized for
- * @p WARP_STORE_TRANSPOSE, meaning items are locally reordered among threads so
- * that memory references will be efficiently coalesced using a warp-striped
- * access pattern.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(int *d_data, ...)
- * {
- *     constexpr int warp_threads = 16;
- *     constexpr int block_threads = 256;
- *     constexpr int items_per_thread = 4;
- *
- *     // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
- *     using WarpStoreT = WarpStore;
- *
- *     constexpr int warps_in_block = block_threads / warp_threads;
- *     constexpr int tile_size = items_per_thread * warp_threads;
- *     const int warp_id = static_cast(threadIdx.x) / warp_threads;
- *
- *     // Allocate shared memory for WarpStore
- *     __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
- *
- *     // Obtain a segment of consecutive items that are blocked across threads
- *     int thread_data[4];
- *     ...
- *
- *     // Store items to linear memory
- *     WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data);
- * @endcode
- * @par
- * Suppose the set of @p thread_data across the warp threads is
- * { [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }.
- * The output @p d_data will be 0, 1, 2, 3, 4, 5, ....
- */
+//! @rst
+//! The WarpStore class provides :ref:`collective `
+//! data movement methods for writing a :ref:`blocked arrangement `
+//! of items partitioned across a CUDA warp to a linear segment of memory.
+//!
+//! Overview
+//! ++++++++++++++++
+//!
+//! * The WarpStore class provides a single data movement abstraction that can be
+//!   specialized to implement different cub::WarpStoreAlgorithm strategies. This
+//!   facilitates different performance policies for different architectures,
+//!   data types, granularity sizes, etc.
+//! * WarpStore can be optionally specialized by different data movement strategies:
+//!
+//!   #. :cpp:enumerator:`cub::WARP_STORE_DIRECT`:
+//!      a :ref:`blocked arrangement ` of data is written directly to
+//!      memory.
+//!   #. :cpp:enumerator:`cub::WARP_STORE_STRIPED`:
+//!      a :ref:`striped arrangement ` of data is written directly to
+//!      memory.
+//!   #. :cpp:enumerator:`cub::WARP_STORE_VECTORIZE`:
+//!      a :ref:`blocked arrangement ` of data is written directly to
+//!      memory using CUDA's built-in vectorized stores as a coalescing optimization.
+//!   #. :cpp:enumerator:`cub::WARP_STORE_TRANSPOSE`:
+//!      a :ref:`blocked arrangement ` is locally transposed into a
+//!      :ref:`striped arrangement ` which is then written to memory.
+//!
+//! * @rowmajor
+//!
+//! A Simple Example
+//! ++++++++++++++++
+//!
+//! The code snippet below illustrates the storing of a "blocked" arrangement
+//! of 64 integers across 16 threads (where each thread owns 4 consecutive items)
+//! into a linear segment of memory. The store is specialized for
+//! ``WARP_STORE_TRANSPOSE``, meaning items are locally reordered among threads so
+//! that memory references will be efficiently coalesced using a warp-striped
+//! access pattern.
+//!
+//! .. code-block:: c++
+//!
+//!    #include <cub/warp/warp_store.cuh> // or equivalently <cub/cub.cuh>
+//!
+//!    __global__ void ExampleKernel(int *d_data, ...)
+//!    {
+//!        constexpr int warp_threads = 16;
+//!        constexpr int block_threads = 256;
+//!        constexpr int items_per_thread = 4;
+//!
+//!        // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
+//!        using WarpStoreT = WarpStore<int, items_per_thread, WARP_STORE_TRANSPOSE, warp_threads>;
+//!
+//!        constexpr int warps_in_block = block_threads / warp_threads;
+//!        constexpr int tile_size = items_per_thread * warp_threads;
+//!        const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
+//!
+//!        // Allocate shared memory for WarpStore
+//!        __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
+//!
+//!        // Obtain a segment of consecutive items that are blocked across threads
+//!        int thread_data[4];
+//!        ...
+//!
+//!        // Store items to linear memory
+//!        WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data);
+//!
+//! Suppose the set of ``thread_data`` across the warp threads is
+//! ``{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }``.
+//! The output ``d_data`` will be ``0, 1, 2, 3, 4, 5, ...``.
+//! @endrst
+//!
+//! @tparam T
+//! The type of data to be written.
+//!
+//! @tparam ITEMS_PER_THREAD
+//! The number of consecutive items partitioned onto each thread.
+//!
+//! @tparam ALGORITHM
+//! [optional] cub::WarpStoreAlgorithm tuning policy enumeration.
+//! Default: cub::WARP_STORE_DIRECT.
+//!
+//! @tparam LOGICAL_WARP_THREADS
+//! [optional] The number of threads per "logical" warp (may be less
+//! than the number of hardware warp threads). Default is the warp size of the
+//! targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a
+//! power of two.
+//!
+//! @tparam LEGACY_PTX_ARCH
+//! Unused.
 template struct StoreInternal
 {
-  typedef NullType TempStorage;
+  using TempStorage = NullType;
 
   int linear_tid;
 
@@ -250,7 +269,7 @@ private:
 template struct StoreInternal
 {
-  typedef NullType TempStorage;
+  using TempStorage = NullType;
 
   int linear_tid;
 
@@ -282,7 +301,7 @@ private:
 template struct StoreInternal
 {
-  typedef NullType TempStorage;
+  using TempStorage = NullType;
 
   int linear_tid;
 
@@ -379,86 +398,79 @@ public:
   struct TempStorage : Uninitialized<_TempStorage> {};
 
-  /*************************************************************************//**
-   * @name Collective constructors
-   ****************************************************************************/
-  //@{
+  //! @name Collective constructors
+  //! @{
 
-  /**
-   * @brief Collective constructor using a private static allocation of shared
-   * memory as temporary storage.
-   */
+  //! @brief Collective constructor using a private static allocation of shared
+  //! memory as temporary storage.
   __device__ __forceinline__ WarpStore()
       : temp_storage(PrivateStorage())
       , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS))
   {}
 
-  /**
-   * @brief Collective constructor using the specified memory allocation as
-   * temporary storage.
-   */
+  //! @brief Collective constructor using the specified memory allocation as
+  //! temporary storage.
   __device__ __forceinline__ WarpStore(TempStorage &temp_storage)
       : temp_storage(temp_storage.Alias())
       , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS))
   {}
 
-  //@} end member group
-  /*************************************************************************//**
-   * @name Data movement
-   ****************************************************************************/
-  //@{
-
-  /**
-   * @brief Store items into a linear segment of memory.
-   *
-   * @par
-   * @smemwarpreuse
-   *
-   * @par Snippet
-   * @par
-   * The code snippet below illustrates the storing of a "blocked" arrangement
-   * of 64 integers across 16 threads (where each thread owns 4 consecutive items)
-   * into a linear segment of memory. The store is specialized for
-   * @p WARP_STORE_TRANSPOSE, meaning items are locally reordered among threads so
-   * that memory references will be efficiently coalesced using a warp-striped
-   * access pattern.
-   * @code
-   * #include // or equivalently
-   *
-   * __global__ void ExampleKernel(int *d_data, ...)
-   * {
-   *     constexpr int warp_threads = 16;
-   *     constexpr int block_threads = 256;
-   *     constexpr int items_per_thread = 4;
-   *
-   *     // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
-   *     using WarpStoreT = WarpStore;
-   *
-   *     constexpr int warps_in_block = block_threads / warp_threads;
-   *     constexpr int tile_size = items_per_thread * warp_threads;
-   *     const int warp_id = static_cast(threadIdx.x) / warp_threads;
-   *
-   *     // Allocate shared memory for WarpStore
-   *     __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
-   *
-   *     // Obtain a segment of consecutive items that are blocked across threads
-   *     int thread_data[4];
-   *     ...
-   *
-   *     // Store items to linear memory
-   *     WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data);
-   * @endcode
-   * @par
-   * Suppose the set of @p thread_data across the warp threads is
-   * { [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }.
-   * The output @p d_data will be 0, 1, 2, 3, 4, 5, ....
-   *
-   * @param[out] block_itr The thread block's base output iterator for storing to
-   * @param[in] items Data to store
-   */
+  //! @} end member group
+  //! @name Data movement
+  //! @{
 
+  //! @rst
+  //! Store items into a linear segment of memory.
+  //!
+  //! @smemwarpreuse
+  //!
+  //! Snippet
+  //! +++++++
+  //!
+  //! The code snippet below illustrates the storing of a "blocked" arrangement
+  //! of 64 integers across 16 threads (where each thread owns 4 consecutive items)
+  //! into a linear segment of memory. The store is specialized for
+  //! ``WARP_STORE_TRANSPOSE``, meaning items are locally reordered among threads so
+  //! that memory references will be efficiently coalesced using a warp-striped
+  //! access pattern.
+  //!
+  //! .. code-block:: c++
+  //!
+  //!    #include <cub/warp/warp_store.cuh> // or equivalently <cub/cub.cuh>
+  //!
+  //!    __global__ void ExampleKernel(int *d_data, ...)
+  //!    {
+  //!        constexpr int warp_threads = 16;
  //!        constexpr int block_threads = 256;
  //!        constexpr int items_per_thread = 4;
  //!
  //!        // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
  //!        using WarpStoreT = WarpStore<int, items_per_thread, WARP_STORE_TRANSPOSE, warp_threads>;
  //!
  //!        constexpr int warps_in_block = block_threads / warp_threads;
  //!        constexpr int tile_size = items_per_thread * warp_threads;
  //!        const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
  //!
  //!        // Allocate shared memory for WarpStore
  //!        __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
  //!
  //!        // Obtain a segment of consecutive items that are blocked across threads
  //!        int thread_data[4];
  //!        ...
  //!
  //!        // Store items to linear memory
  //!        WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data);
  //!
  //! Suppose the set of ``thread_data`` across the warp threads is
  //! ``{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }``.
  //! The output ``d_data`` will be ``0, 1, 2, 3, 4, 5, ...``.
  //! @endrst
  //!
  //! @param[out] block_itr The warp's base output iterator for storing to
  //! @param[in] items Data to store
  template <typename OutputIteratorT>
  __device__ __forceinline__ void Store(OutputIteratorT block_itr,
                                        T (&items)[ITEMS_PER_THREAD])
  {
    InternalStore(temp_storage, linear_tid).Store(block_itr, items);
  }
 
-  /**
-   * @brief Store items into a linear segment of memory, guarded by range.
-   *
-   * @par
-   * @smemwarpreuse
-   *
-   * @par Snippet
-   * @par
-   * The code snippet below illustrates the storing of a "blocked" arrangement
-   * of 64 integers across 16 threads (where each thread owns 4 consecutive items)
-   * into a linear segment of memory. The store is specialized for
-   * @p WARP_STORE_TRANSPOSE, meaning items are locally reordered among threads so
-   * that memory references will be efficiently coalesced using a warp-striped
-   * access pattern.
-   * @code
-   * #include // or equivalently
-   *
-   * __global__ void ExampleKernel(int *d_data, int valid_items ...)
-   * {
-   *     constexpr int warp_threads = 16;
-   *     constexpr int block_threads = 256;
-   *     constexpr int items_per_thread = 4;
-   *
-   *     // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
-   *     using WarpStoreT = WarpStore;
-   *
-   *     constexpr int warps_in_block = block_threads / warp_threads;
-   *     constexpr int tile_size = items_per_thread * warp_threads;
-   *     const int warp_id = static_cast(threadIdx.x) / warp_threads;
-   *
-   *     // Allocate shared memory for WarpStore
-   *     __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
-   *
-   *     // Obtain a segment of consecutive items that are blocked across threads
-   *     int thread_data[4];
-   *     ...
-   *
-   *     // Store items to linear memory
-   *     WarpStoreT(temp_storage[warp_id]).Store(
-   *       d_data + warp_id * tile_size, thread_data, valid_items);
-   * @endcode
-   * @par
-   * Suppose the set of @p thread_data across the warp threads is
-   * { [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] } and @p valid_items
-   * is @p 5.. The output @p d_data will be 0, 1, 2, 3, 4, ?, ?, ...,
-   * with only the first two threads being unmasked to store portions of valid
-   * data.
-   *
-   * @param[out] block_itr The thread block's base output iterator for storing to
-   * @param[in] items Data to store
-   * @param[in] valid_items Number of valid items to write
-   */
+  //! @rst
+  //! Store items into a linear segment of memory, guarded by range.
+  //!
+  //! @smemwarpreuse
+  //!
+  //! Snippet
+  //! +++++++
+  //!
  //! The code snippet below illustrates the storing of a "blocked" arrangement
  //! of 64 integers across 16 threads (where each thread owns 4 consecutive items)
  //! into a linear segment of memory. The store is specialized for
  //! ``WARP_STORE_TRANSPOSE``, meaning items are locally reordered among threads so
  //! that memory references will be efficiently coalesced using a warp-striped
  //! access pattern.
  //!
  //! .. code-block:: c++
  //!
  //!    #include <cub/warp/warp_store.cuh> // or equivalently <cub/cub.cuh>
  //!
  //!    __global__ void ExampleKernel(int *d_data, int valid_items, ...)
  //!    {
  //!        constexpr int warp_threads = 16;
  //!        constexpr int block_threads = 256;
  //!        constexpr int items_per_thread = 4;
  //!
  //!        // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
  //!        using WarpStoreT = WarpStore<int, items_per_thread, WARP_STORE_TRANSPOSE, warp_threads>;
  //!
  //!        constexpr int warps_in_block = block_threads / warp_threads;
  //!        constexpr int tile_size = items_per_thread * warp_threads;
  //!        const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
  //!
  //!        // Allocate shared memory for WarpStore
  //!        __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
  //!
  //!        // Obtain a segment of consecutive items that are blocked across threads
  //!        int thread_data[4];
  //!        ...
  //!
  //!        // Store items to linear memory
  //!        WarpStoreT(temp_storage[warp_id]).Store(
  //!          d_data + warp_id * tile_size, thread_data, valid_items);
  //!
  //! Suppose the set of ``thread_data`` across the warp threads is
  //! ``{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }`` and ``valid_items``
  //! is ``5``. The output ``d_data`` will be ``0, 1, 2, 3, 4, ?, ?, ...``,
  //! with only the first two threads being unmasked to store portions of valid
  //! data.
  //! @endrst
  //!
  //! @param[out] block_itr The warp's base output iterator for storing to
  //! @param[in] items Data to store
  //! @param[in] valid_items Number of valid items to write
  template <typename OutputIteratorT>
  __device__ __forceinline__ void Store(OutputIteratorT block_itr,
                                        T (&items)[ITEMS_PER_THREAD],
                                        int valid_items)
  {
    InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items);
  }
 
-  //@} end member group
+  //! @} end member group
 };
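Illustrative sketch (not part of the patch above): the kernel below contrasts the memory layouts produced by ``WARP_STORE_DIRECT`` and ``WARP_STORE_STRIPED`` discussed in the algorithm enumeration, assuming a single 16-thread logical warp owning 4 items per thread. The kernel name, buffer names, and launch configuration are assumptions made for this example only.

.. code-block:: c++

    #include <cub/warp/warp_store.cuh>

    // Assumed launch: StoreLayoutsKernel<<<1, 16>>>(d_blocked, d_striped);
    // d_blocked and d_striped must each hold 64 ints.
    __global__ void StoreLayoutsKernel(int *d_blocked, int *d_striped)
    {
        constexpr int warp_threads     = 16;
        constexpr int items_per_thread = 4;

        // Neither algorithm needs shared memory, so default construction suffices
        using DirectStoreT  = cub::WarpStore<int, items_per_thread, cub::WARP_STORE_DIRECT, warp_threads>;
        using StripedStoreT = cub::WarpStore<int, items_per_thread, cub::WARP_STORE_STRIPED, warp_threads>;

        // Each thread owns items_per_thread consecutive values (a blocked arrangement)
        int thread_data[items_per_thread];
        for (int i = 0; i < items_per_thread; ++i)
        {
            thread_data[i] = static_cast<int>(threadIdx.x) * items_per_thread + i;
        }

        // Blocked store: d_blocked receives 0, 1, 2, 3, ..., 63 in order
        DirectStoreT().Store(d_blocked, thread_data);

        // Striped store: item i of lane t lands at offset i * warp_threads + t,
        // so d_striped receives 0, 4, 8, ..., 60, 1, 5, 9, ..., 61, 2, 6, ...
        StripedStoreT().Store(d_striped, thread_data);
    }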
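A second illustrative sketch (also not part of the patch): one way the ``valid_items`` argument of the guarded ``Store()`` overload might be computed per warp tile when the element count is not a multiple of the tile size. The kernel name, the single-block launch, and the fabricated input values are assumptions.

.. code-block:: c++

    #include <cub/warp/warp_store.cuh>

    // Assumed launch: GuardedStoreKernel<<<1, 256>>>(d_data, num_items) with
    // num_items <= 1024 (16 logical warps * 64-item tiles).
    __global__ void GuardedStoreKernel(int *d_data, int num_items)
    {
        constexpr int warp_threads     = 16;
        constexpr int block_threads    = 256;
        constexpr int items_per_thread = 4;
        constexpr int tile_size        = warp_threads * items_per_thread;
        constexpr int warps_in_block   = block_threads / warp_threads;

        using WarpStoreT = cub::WarpStore<int, items_per_thread, cub::WARP_STORE_TRANSPOSE, warp_threads>;

        // One TempStorage slot per logical warp in the block
        __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];

        const int warp_id     = static_cast<int>(threadIdx.x) / warp_threads;
        const int lane_id     = static_cast<int>(threadIdx.x) % warp_threads;
        const int warp_offset = warp_id * tile_size;

        // Fabricate a blocked segment of items; a real kernel would compute these
        int thread_data[items_per_thread];
        for (int i = 0; i < items_per_thread; ++i)
        {
            thread_data[i] = warp_offset + lane_id * items_per_thread + i;
        }

        // Clamp the last (partial) tile; elements at or beyond valid_items are not written.
        // The branch is uniform within each logical warp, so the collective call is safe.
        const int remaining = num_items - warp_offset;
        if (remaining > 0)
        {
            const int valid_items = remaining < tile_size ? remaining : tile_size;
            WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_offset, thread_data, valid_items);
        }
    }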