From b7bfd7f98c85a2439b9711413a2dae0ff6c2c271 Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko
Date: Wed, 21 Jun 2023 19:45:56 +0400
Subject: [PATCH] Modernize warp store docs

---
 cub/warp/warp_store.cuh | 564 ++++++++++++++++++++--------------------
 1 file changed, 289 insertions(+), 275 deletions(-)

diff --git a/cub/warp/warp_store.cuh b/cub/warp/warp_store.cuh
index acee3d01c..c125e6d12 100644
--- a/cub/warp/warp_store.cuh
+++ b/cub/warp/warp_store.cuh
@@ -25,10 +25,7 @@
  *
  ******************************************************************************/
 
-/**
- * @file
- * Operations for writing linear segments of data from the CUDA warp
- */
+//! @file Operations for writing linear segments of data from the CUDA warp
 
 #pragma once
 
@@ -45,161 +42,183 @@ CUB_NAMESPACE_BEGIN
 
-/**
- * @brief cub::WarpStoreAlgorithm enumerates alternative algorithms for
- * cub::WarpStore to write a blocked arrangement of items across a CUDA
- * warp to a linear segment of memory.
- */
+//! @rst
+//! ``cub::WarpStoreAlgorithm`` enumerates alternative algorithms for :cpp:struct:`cub::WarpStore`
+//! to write a blocked arrangement of items across a CUDA warp to a linear segment of memory.
+//! @endrst
 enum WarpStoreAlgorithm
 {
- /**
- * @par Overview
- * A [blocked arrangement](index.html#sec5sec3) of data is written
- * directly to memory.
- *
- * @par Performance Considerations
- * The utilization of memory transactions (coalescing) decreases as the
- * access stride between threads increases (i.e., the number items per thread).
- */
+ //! @rst
+ //! Overview
+ //! ++++++++++++++++++++++++++
+ //!
+ //! A :ref:`blocked arrangement ` of data is written directly
+ //! to memory.
+ //!
+ //! Performance Considerations
+ //! ++++++++++++++++++++++++++
+ //!
+ //! The utilization of memory transactions (coalescing) decreases as the
+ //! access stride between threads increases (i.e., the number of items per thread).
+ //! @endrst
 WARP_STORE_DIRECT,
 
- /**
- * @par Overview
- * A [striped arrangement](index.html#sec5sec3) of data is written
- * directly to memory.
- *
- * @par Performance Considerations
- * The utilization of memory transactions (coalescing) remains high regardless
- * of items written per thread.
- */
+ //! @rst
+ //! Overview
+ //! ++++++++++++++++++++++++++
+ //!
+ //! A :ref:`striped arrangement ` of data is written
+ //! directly to memory.
+ //!
+ //! Performance Considerations
+ //! ++++++++++++++++++++++++++
+ //!
+ //! The utilization of memory transactions (coalescing) remains high regardless
+ //! of items written per thread.
+ //! @endrst
 WARP_STORE_STRIPED,
 
- /**
- * @par Overview
- *
- * A [blocked arrangement](index.html#sec5sec3) of data is written
- * directly to memory using CUDA's built-in vectorized stores as a coalescing
- * optimization. For example, st.global.v4.s32 instructions will be
- * generated when @p T = @p int and @p ITEMS_PER_THREAD % 4 == 0.
- *
- * @par Performance Considerations
- * - The utilization of memory transactions (coalescing) remains high until
- *   the the access stride between threads (i.e., the number items per thread)
- *   exceeds the maximum vector store width (typically 4 items or 64B,
- *   whichever is lower).
- * - The following conditions will prevent vectorization and writing will fall
- *   back to cub::WARP_STORE_DIRECT:
- *   - @p ITEMS_PER_THREAD is odd
- *   - The @p OutputIteratorT is not a simple pointer type
- *   - The block output offset is not quadword-aligned
- *   - The data type @p T is not a built-in primitive or CUDA vector type
- *     (e.g., @p short, @p int2, @p double, @p float2, etc.)
- */
+ //! @rst
+ //! Overview
+ //! ++++++++++++++++++++++++++
+ //!
+ //! A :ref:`blocked arrangement ` of data is written
+ //! directly to memory using CUDA's built-in vectorized stores as a coalescing
+ //! optimization. For example, ``st.global.v4.s32`` instructions will be
+ //! generated when ``T = int`` and ``ITEMS_PER_THREAD % 4 == 0``.
+ //!
+ //! Performance Considerations
+ //! ++++++++++++++++++++++++++
+ //!
+ //! * The utilization of memory transactions (coalescing) remains high until
+ //!   the access stride between threads (i.e., the number of items per thread)
+ //!   exceeds the maximum vector store width (typically 4 items or 64B,
+ //!   whichever is lower).
+ //! * The following conditions will prevent vectorization and writing will fall
+ //!   back to ``cub::WARP_STORE_DIRECT``:
+ //!
+ //!   * ``ITEMS_PER_THREAD`` is odd
+ //!   * The ``OutputIteratorT`` is not a simple pointer type
+ //!   * The block output offset is not quadword-aligned
+ //!   * The data type ``T`` is not a built-in primitive or CUDA vector type
+ //!     (e.g., ``short``, ``int2``, ``double``, ``float2``, etc.)
+ //!
+ //! @endrst
 WARP_STORE_VECTORIZE,
 
- /**
- * @par Overview
- * A [blocked arrangement](index.html#sec5sec3) is locally
- * transposed and then efficiently written to memory as a
- * [striped arrangement](index.html#sec5sec3).
- *
- * @par Performance Considerations
- * - The utilization of memory transactions (coalescing) remains high
- *   regardless of items written per thread.
- * - The local reordering incurs slightly longer latencies and throughput than the
- *   direct cub::WARP_STORE_DIRECT and cub::WARP_STORE_VECTORIZE alternatives.
- */
+ //! @rst
+ //! Overview
+ //! ++++++++++++++++++++++++++
+ //!
+ //! A :ref:`blocked arrangement ` is locally
+ //! transposed and then efficiently written to memory as a
+ //! :ref:`striped arrangement `.
+ //!
+ //! Performance Considerations
+ //! ++++++++++++++++++++++++++
+ //!
+ //! * The utilization of memory transactions (coalescing) remains high
+ //!   regardless of items written per thread.
+ //! * The local reordering incurs slightly longer latencies and lower throughput than the
+ //!   direct ``cub::WARP_STORE_DIRECT`` and ``cub::WARP_STORE_VECTORIZE`` alternatives.
+ //!
+ //! @endrst
 WARP_STORE_TRANSPOSE
 };
 
-/**
- * @brief The WarpStore class provides [collective](index.html#sec0)
- * data movement methods for writing a [blocked arrangement](index.html#sec5sec3)
- * of items partitioned across a CUDA warp to a linear segment of memory.
- * @ingroup WarpModule
- * @ingroup UtilIo
- *
- * @tparam T
- * The type of data to be written.
- *
- * @tparam ITEMS_PER_THREAD
- * The number of consecutive items partitioned onto each thread.
- *
- * @tparam ALGORITHM
- * [optional] cub::WarpStoreAlgorithm tuning policy enumeration.
- * default: cub::WARP_STORE_DIRECT.
- *
- * @tparam LOGICAL_WARP_THREADS
- * [optional] The number of threads per "logical" warp (may be less
- * than the number of hardware warp threads). Default is the warp size of the
- * targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a
- * power of two.
- *
- * @tparam LEGACY_PTX_ARCH
- * Unused.
- *
- * @par Overview
- * - The WarpStore class provides a single data movement abstraction that can be
- *   specialized to implement different cub::WarpStoreAlgorithm strategies. This
- *   facilitates different performance policies for different architectures,
- *   data types, granularity sizes, etc.
- * - WarpStore can be optionally specialized by different data movement strategies:
- *   -# cub::WARP_STORE_DIRECT. A [blocked arrangement](index.html#sec5sec3)
- *      of data is written directly to memory. [More...](@ref cub::WarpStoreAlgorithm)
- *   -# cub::WARP_STORE_STRIPED. A [striped arrangement](index.html#sec5sec3)
- *      of data is written directly to memory. [More...](@ref cub::WarpStoreAlgorithm)
- *   -# cub::WARP_STORE_VECTORIZE. A [blocked arrangement](index.html#sec5sec3)
- *      of data is written directly to memory using CUDA's built-in vectorized
- *      stores as a coalescing optimization. [More...](@ref cub::WarpStoreAlgorithm)
- *   -# cub::WARP_STORE_TRANSPOSE. A [blocked arrangement](index.html#sec5sec3)
- *      is locally transposed into a [striped arrangement](index.html#sec5sec3)
- *      which is then written to memory. [More...](@ref cub::WarpStoreAlgorithm)
- * - \rowmajor
- *
- * @par A Simple Example
- * @par
- * The code snippet below illustrates the storing of a "blocked" arrangement
- * of 64 integers across 16 threads (where each thread owns 4 consecutive items)
- * into a linear segment of memory. The store is specialized for
- * @p WARP_STORE_TRANSPOSE, meaning items are locally reordered among threads so
- * that memory references will be efficiently coalesced using a warp-striped
- * access pattern.
- * @par
- * @code
- * #include // or equivalently
- *
- * __global__ void ExampleKernel(int *d_data, ...)
- * {
- *     constexpr int warp_threads = 16;
- *     constexpr int block_threads = 256;
- *     constexpr int items_per_thread = 4;
- *
- *     // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
- *     using WarpStoreT = WarpStore;
- *
- *     constexpr int warps_in_block = block_threads / warp_threads;
- *     constexpr int tile_size = items_per_thread * warp_threads;
- *     const int warp_id = static_cast(threadIdx.x) / warp_threads;
- *
- *     // Allocate shared memory for WarpStore
- *     __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
- *
- *     // Obtain a segment of consecutive items that are blocked across threads
- *     int thread_data[4];
- *     ...
- *
- *     // Store items to linear memory
- *     WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data);
- * @endcode
- * @par
- * Suppose the set of @p thread_data across the warp threads is
- * { [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }.
- * The output @p d_data will be 0, 1, 2, 3, 4, 5, ....
- */
+//! @rst
+//! The WarpStore class provides :ref:`collective `
+//! data movement methods for writing a :ref:`blocked arrangement `
+//! of items partitioned across a CUDA warp to a linear segment of memory.
+//!
+//! Overview
+//! ++++++++++++++++
+//!
+//! * The WarpStore class provides a single data movement abstraction that can be
+//!   specialized to implement different cub::WarpStoreAlgorithm strategies. This
+//!   facilitates different performance policies for different architectures,
+//!   data types, granularity sizes, etc.
+//! * WarpStore can be optionally specialized by different data movement strategies:
+//!
+//!   #. :cpp:enumerator:`cub::WARP_STORE_DIRECT`:
+//!      a :ref:`blocked arrangement ` of data is written directly to
+//!      memory.
+//!   #. :cpp:enumerator:`cub::WARP_STORE_STRIPED`:
+//!      a :ref:`striped arrangement ` of data is written directly to
+//!      memory.
+//!   #. :cpp:enumerator:`cub::WARP_STORE_VECTORIZE`:
+//!      a :ref:`blocked arrangement ` of data is written directly to
+//!      memory using CUDA's built-in vectorized stores as a coalescing optimization.
+//!   #. :cpp:enumerator:`cub::WARP_STORE_TRANSPOSE`:
+//!      a :ref:`blocked arrangement ` is locally transposed into a
+//!      :ref:`striped arrangement ` which is then written to memory.
+//!
+//! * @rowmajor
+//!
+//! A Simple Example
+//! ++++++++++++++++
+//!
+//! The code snippet below illustrates the storing of a "blocked" arrangement
+//! of 64 integers across 16 threads (where each thread owns 4 consecutive items)
+//! into a linear segment of memory. The store is specialized for
+//! ``WARP_STORE_TRANSPOSE``, meaning items are locally reordered among threads so
+//! that memory references will be efficiently coalesced using a warp-striped
+//! access pattern.
+//!
+//! .. code-block:: c++
+//!
+//!    #include <cub/warp/warp_store.cuh> // or equivalently <cub/cub.cuh>
+//!
+//!    __global__ void ExampleKernel(int *d_data, ...)
+//!    {
+//!        constexpr int warp_threads = 16;
+//!        constexpr int block_threads = 256;
+//!        constexpr int items_per_thread = 4;
+//!
+//!        // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
+//!        using WarpStoreT = WarpStore<int, items_per_thread, WARP_STORE_TRANSPOSE, warp_threads>;
+//!
+//!        constexpr int warps_in_block = block_threads / warp_threads;
+//!        constexpr int tile_size = items_per_thread * warp_threads;
+//!        const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
+//!
+//!        // Allocate shared memory for WarpStore
+//!        __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
+//!
+//!        // Obtain a segment of consecutive items that are blocked across threads
+//!        int thread_data[4];
+//!        ...
+//!
+//!        // Store items to linear memory
+//!        WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data);
+//!
+//! Suppose the set of ``thread_data`` across the warp threads is
+//! ``{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }``.
+//! The output ``d_data`` will be ``0, 1, 2, 3, 4, 5, ...``.
+//! @endrst
+//!
+//! @tparam T
+//! The type of data to be written.
+//!
+//! @tparam ITEMS_PER_THREAD
+//! The number of consecutive items partitioned onto each thread.
+//!
+//! @tparam ALGORITHM
+//! [optional] cub::WarpStoreAlgorithm tuning policy enumeration.
+//! Default: cub::WARP_STORE_DIRECT.
+//!
+//! @tparam LOGICAL_WARP_THREADS
+//! [optional] The number of threads per "logical" warp (may be less
+//! than the number of hardware warp threads). Default is the warp size of the
+//! targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a
+//! power of two.
+//!
+//! @tparam LEGACY_PTX_ARCH
+//! Unused.
 template struct StoreInternal
 {
-  typedef NullType TempStorage;
+  using TempStorage = NullType;
 
   int linear_tid;
 
@@ -250,7 +269,7 @@ private:
 template struct StoreInternal
 {
-  typedef NullType TempStorage;
+  using TempStorage = NullType;
 
   int linear_tid;
 
@@ -282,7 +301,7 @@ private:
 template struct StoreInternal
 {
-  typedef NullType TempStorage;
+  using TempStorage = NullType;
 
   int linear_tid;
 
@@ -379,86 +398,79 @@ public:
   struct TempStorage : Uninitialized<_TempStorage> {};
 
-  /*************************************************************************//**
-   * @name Collective constructors
-   ****************************************************************************/
-  //@{
+  //! @name Collective constructors
+  //! @{
 
-  /**
-   * @brief Collective constructor using a private static allocation of shared
-   * memory as temporary storage.
-   */
+  //! @brief Collective constructor using a private static allocation of shared
+  //! memory as temporary storage.
   __device__ __forceinline__ WarpStore()
       : temp_storage(PrivateStorage())
       , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS))
   {}
 
-  /**
-   * @brief Collective constructor using the specified memory allocation as
-   * temporary storage.
-   */
+  //! @brief Collective constructor using the specified memory allocation as
+  //! temporary storage.
   __device__ __forceinline__ WarpStore(TempStorage &temp_storage)
       : temp_storage(temp_storage.Alias())
       , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS))
   {}
 
-  //@} end member group
-  /*************************************************************************//**
-   * @name Data movement
-   ****************************************************************************/
-  //@{
-
-  /**
-   * @brief Store items into a linear segment of memory.
-   *
-   * @par
-   * @smemwarpreuse
-   *
-   * @par Snippet
-   * @par
-   * The code snippet below illustrates the storing of a "blocked" arrangement
-   * of 64 integers across 16 threads (where each thread owns 4 consecutive items)
-   * into a linear segment of memory. The store is specialized for
-   * @p WARP_STORE_TRANSPOSE, meaning items are locally reordered among threads so
-   * that memory references will be efficiently coalesced using a warp-striped
-   * access pattern.
-   * @code
-   * #include // or equivalently
-   *
-   * __global__ void ExampleKernel(int *d_data, ...)
-   * {
-   *     constexpr int warp_threads = 16;
-   *     constexpr int block_threads = 256;
-   *     constexpr int items_per_thread = 4;
-   *
-   *     // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
-   *     using WarpStoreT = WarpStore;
-   *
-   *     constexpr int warps_in_block = block_threads / warp_threads;
-   *     constexpr int tile_size = items_per_thread * warp_threads;
-   *     const int warp_id = static_cast(threadIdx.x) / warp_threads;
-   *
-   *     // Allocate shared memory for WarpStore
-   *     __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
-   *
-   *     // Obtain a segment of consecutive items that are blocked across threads
-   *     int thread_data[4];
-   *     ...
-   *
-   *     // Store items to linear memory
-   *     WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data);
-   * @endcode
-   * @par
-   * Suppose the set of @p thread_data across the warp threads is
-   * { [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }.
-   * The output @p d_data will be 0, 1, 2, 3, 4, 5, ....
-   *
-   * @param[out] block_itr The thread block's base output iterator for storing to
-   * @param[in] items Data to store
-   */
+  //! @} end member group
+  //! @name Data movement
+  //! @{
 
+  //! @rst
+  //! Store items into a linear segment of memory.
+  //!
+  //! @smemwarpreuse
+  //!
+  //! Snippet
+  //! +++++++
+  //!
+  //! The code snippet below illustrates the storing of a "blocked" arrangement
+  //! of 64 integers across 16 threads (where each thread owns 4 consecutive items)
+  //! into a linear segment of memory. The store is specialized for
+  //! ``WARP_STORE_TRANSPOSE``, meaning items are locally reordered among threads so
+  //! that memory references will be efficiently coalesced using a warp-striped
+  //! access pattern.
+  //!
+  //! .. code-block:: c++
+  //!
+  //!    #include <cub/warp/warp_store.cuh> // or equivalently <cub/cub.cuh>
+  //!
+  //!    __global__ void ExampleKernel(int *d_data, ...)
+  //!    {
+  //!        constexpr int warp_threads = 16;
  //!        constexpr int block_threads = 256;
  //!        constexpr int items_per_thread = 4;
  //!
  //!        // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
  //!        using WarpStoreT = WarpStore<int, items_per_thread, WARP_STORE_TRANSPOSE, warp_threads>;
  //!
  //!        constexpr int warps_in_block = block_threads / warp_threads;
  //!        constexpr int tile_size = items_per_thread * warp_threads;
  //!        const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
  //!
  //!        // Allocate shared memory for WarpStore
  //!        __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
  //!
  //!        // Obtain a segment of consecutive items that are blocked across threads
  //!        int thread_data[4];
  //!        ...
  //!
  //!        // Store items to linear memory
  //!        WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data);
  //!
  //! Suppose the set of ``thread_data`` across the warp threads is
  //! ``{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }``.
  //! The output ``d_data`` will be ``0, 1, 2, 3, 4, 5, ...``.
  //! @endrst
  //!
  //! @param[out] block_itr The warp's base output iterator for storing to
  //! @param[in] items Data to store
  template <typename OutputIteratorT>
  __device__ __forceinline__ void Store(OutputIteratorT block_itr,
                                        T (&items)[ITEMS_PER_THREAD])
  {
    InternalStore(temp_storage, linear_tid).Store(block_itr, items);
  }
 
-  /**
-   * @brief Store items into a linear segment of memory, guarded by range.
-   *
-   * @par
-   * @smemwarpreuse
-   *
-   * @par Snippet
-   * @par
-   * The code snippet below illustrates the storing of a "blocked" arrangement
-   * of 64 integers across 16 threads (where each thread owns 4 consecutive items)
-   * into a linear segment of memory. The store is specialized for
-   * @p WARP_STORE_TRANSPOSE, meaning items are locally reordered among threads so
-   * that memory references will be efficiently coalesced using a warp-striped
-   * access pattern.
-   * @code
-   * #include // or equivalently
-   *
-   * __global__ void ExampleKernel(int *d_data, int valid_items ...)
-   * {
-   *     constexpr int warp_threads = 16;
-   *     constexpr int block_threads = 256;
-   *     constexpr int items_per_thread = 4;
-   *
-   *     // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
-   *     using WarpStoreT = WarpStore;
-   *
-   *     constexpr int warps_in_block = block_threads / warp_threads;
-   *     constexpr int tile_size = items_per_thread * warp_threads;
-   *     const int warp_id = static_cast(threadIdx.x) / warp_threads;
-   *
-   *     // Allocate shared memory for WarpStore
-   *     __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
-   *
-   *     // Obtain a segment of consecutive items that are blocked across threads
-   *     int thread_data[4];
-   *     ...
-   *
-   *     // Store items to linear memory
-   *     WarpStoreT(temp_storage[warp_id]).Store(
-   *       d_data + warp_id * tile_size, thread_data, valid_items);
-   * @endcode
-   * @par
-   * Suppose the set of @p thread_data across the warp threads is
-   * { [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] } and @p valid_items
-   * is @p 5.. The output @p d_data will be 0, 1, 2, 3, 4, ?, ?, ...,
-   * with only the first two threads being unmasked to store portions of valid
-   * data.
-   *
-   * @param[out] block_itr The thread block's base output iterator for storing to
-   * @param[in] items Data to store
-   * @param[in] valid_items Number of valid items to write
-   */
+  //! @rst
+  //! Store items into a linear segment of memory, guarded by range.
+  //!
+  //! @smemwarpreuse
+  //!
+  //! Snippet
+  //! +++++++
+  //!
  //! The code snippet below illustrates the storing of a "blocked" arrangement
  //! of 64 integers across 16 threads (where each thread owns 4 consecutive items)
  //! into a linear segment of memory. The store is specialized for
  //! ``WARP_STORE_TRANSPOSE``, meaning items are locally reordered among threads so
  //! that memory references will be efficiently coalesced using a warp-striped
  //! access pattern.
  //!
  //! .. code-block:: c++
  //!
  //!    #include <cub/warp/warp_store.cuh> // or equivalently <cub/cub.cuh>
  //!
  //!    __global__ void ExampleKernel(int *d_data, int valid_items, ...)
  //!    {
  //!        constexpr int warp_threads = 16;
  //!        constexpr int block_threads = 256;
  //!        constexpr int items_per_thread = 4;
  //!
  //!        // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
  //!        using WarpStoreT = WarpStore<int, items_per_thread, WARP_STORE_TRANSPOSE, warp_threads>;
  //!
  //!        constexpr int warps_in_block = block_threads / warp_threads;
  //!        constexpr int tile_size = items_per_thread * warp_threads;
  //!        const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
  //!
  //!        // Allocate shared memory for WarpStore
  //!        __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
  //!
  //!        // Obtain a segment of consecutive items that are blocked across threads
  //!        int thread_data[4];
  //!        ...
  //!
  //!        // Store items to linear memory
  //!        WarpStoreT(temp_storage[warp_id]).Store(
  //!          d_data + warp_id * tile_size, thread_data, valid_items);
  //!
  //! Suppose the set of ``thread_data`` across the warp threads is
  //! ``{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }`` and ``valid_items``
  //! is ``5``. The output ``d_data`` will be ``0, 1, 2, 3, 4, ?, ?, ...``,
  //! with only the first two threads being unmasked to store portions of valid
  //! data.
  //! @endrst
  //!
  //! @param[out] block_itr The warp's base output iterator for storing to
  //! @param[in] items Data to store
  //! @param[in] valid_items Number of valid items to write
  template <typename OutputIteratorT>
  __device__ __forceinline__ void Store(OutputIteratorT block_itr,
                                        T (&items)[ITEMS_PER_THREAD],
                                        int valid_items)
  {
    InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items);
  }
 
-  //@} end member group
+  //! @} end member group
 };
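Illustrative sketch (not part of the patch above): the kernel below contrasts the memory layouts produced by ``WARP_STORE_DIRECT`` and ``WARP_STORE_STRIPED`` discussed in the algorithm enumeration, assuming a single 16-thread logical warp owning 4 items per thread. The kernel name, buffer names, and launch configuration are assumptions made for this example only.

.. code-block:: c++

    #include <cub/warp/warp_store.cuh>

    // Assumed launch: StoreLayoutsKernel<<<1, 16>>>(d_blocked, d_striped);
    // d_blocked and d_striped must each hold 64 ints.
    __global__ void StoreLayoutsKernel(int *d_blocked, int *d_striped)
    {
        constexpr int warp_threads     = 16;
        constexpr int items_per_thread = 4;

        // Neither algorithm needs shared memory, so default construction suffices
        using DirectStoreT  = cub::WarpStore<int, items_per_thread, cub::WARP_STORE_DIRECT, warp_threads>;
        using StripedStoreT = cub::WarpStore<int, items_per_thread, cub::WARP_STORE_STRIPED, warp_threads>;

        // Each thread owns items_per_thread consecutive values (a blocked arrangement)
        int thread_data[items_per_thread];
        for (int i = 0; i < items_per_thread; ++i)
        {
            thread_data[i] = static_cast<int>(threadIdx.x) * items_per_thread + i;
        }

        // Blocked store: d_blocked receives 0, 1, 2, 3, ..., 63 in order
        DirectStoreT().Store(d_blocked, thread_data);

        // Striped store: item i of lane t lands at offset i * warp_threads + t,
        // so d_striped receives 0, 4, 8, ..., 60, 1, 5, 9, ..., 61, 2, 6, ...
        StripedStoreT().Store(d_striped, thread_data);
    }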
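A second illustrative sketch (also not part of the patch): one way the ``valid_items`` argument of the guarded ``Store()`` overload might be computed per warp tile when the element count is not a multiple of the tile size. The kernel name, the single-block launch, and the fabricated input values are assumptions.

.. code-block:: c++

    #include <cub/warp/warp_store.cuh>

    // Assumed launch: GuardedStoreKernel<<<1, 256>>>(d_data, num_items) with
    // num_items <= 1024 (16 logical warps * 64-item tiles).
    __global__ void GuardedStoreKernel(int *d_data, int num_items)
    {
        constexpr int warp_threads     = 16;
        constexpr int block_threads    = 256;
        constexpr int items_per_thread = 4;
        constexpr int tile_size        = warp_threads * items_per_thread;
        constexpr int warps_in_block   = block_threads / warp_threads;

        using WarpStoreT = cub::WarpStore<int, items_per_thread, cub::WARP_STORE_TRANSPOSE, warp_threads>;

        // One TempStorage slot per logical warp in the block
        __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];

        const int warp_id     = static_cast<int>(threadIdx.x) / warp_threads;
        const int lane_id     = static_cast<int>(threadIdx.x) % warp_threads;
        const int warp_offset = warp_id * tile_size;

        // Fabricate a blocked segment of items; a real kernel would compute these
        int thread_data[items_per_thread];
        for (int i = 0; i < items_per_thread; ++i)
        {
            thread_data[i] = warp_offset + lane_id * items_per_thread + i;
        }

        // Clamp the last (partial) tile; elements at or beyond valid_items are not written.
        // The branch is uniform within each logical warp, so the collective call is safe.
        const int remaining = num_items - warp_offset;
        if (remaining > 0)
        {
            const int valid_items = remaining < tile_size ? remaining : tile_size;
            WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_offset, thread_data, valid_items);
        }
    }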