diff --git a/cub/warp/warp_store.cuh b/cub/warp/warp_store.cuh
index c125e6d12..37a9d6519 100644
--- a/cub/warp/warp_store.cuh
+++ b/cub/warp/warp_store.cuh
@@ -29,21 +29,19 @@
 #pragma once
 
-#include <iterator>
-#include <type_traits>
-
 #include <cub/config.cuh>
 #include <cub/block/block_store.cuh>
 #include <cub/util_ptx.cuh>
 #include <cub/util_type.cuh>
 #include <cub/warp/warp_exchange.cuh>
+#include <iterator>
+#include <type_traits>
 
 CUB_NAMESPACE_BEGIN
 
-
 //! @rst
-//! ``cub::WarpStoreAlgorithm`` enumerates alternative algorithms for :cpp:struct:`cub::WarpStore` 
+//! ``cub::WarpStoreAlgorithm`` enumerates alternative algorithms for :cpp:struct:`cub::WarpStore`
 //! to write a blocked arrangement of items across a CUDA warp to a linear segment of memory.
 //! @endrst
 enum WarpStoreAlgorithm
@@ -52,7 +50,7 @@ enum WarpStoreAlgorithm
   //! Overview
   //! ++++++++++++++++++++++++++
   //!
-  //! A :ref:`blocked arrangement <flexible-data-arrangement>` of data is written directly 
+  //! A :ref:`blocked arrangement <flexible-data-arrangement>` of data is written directly
   //! to memory.
   //!
   //! Performance Considerations
@@ -81,15 +79,15 @@ enum WarpStoreAlgorithm
   //! @rst
   //! Overview
   //! ++++++++++++++++++++++++++
-  //! 
+  //!
   //! A :ref:`blocked arrangement <flexible-data-arrangement>` of data is written
   //! directly to memory using CUDA's built-in vectorized stores as a coalescing
   //! optimization. For example, ``st.global.v4.s32`` instructions will be
   //! generated when ``T = int`` and ``ITEMS_PER_THREAD % 4 == 0``.
-  //! 
+  //!
   //! Performance Considerations
   //! ++++++++++++++++++++++++++
-  //! 
+  //!
   //! * The utilization of memory transactions (coalescing) remains high until
   //!   the access stride between threads (i.e., the number of items per thread)
   //!   exceeds the maximum vector store width (typically 4 items or 64B,
@@ -113,7 +111,7 @@ enum WarpStoreAlgorithm
   //! A :ref:`blocked arrangement <flexible-data-arrangement>` is locally
   //! transposed and then efficiently written to memory as a
   //! :ref:`striped arrangement <flexible-data-arrangement>`.
-  //! 
+  //!
   //! Performance Considerations
   //! ++++++++++++++++++++++++++
   //!
@@ -121,17 +119,16 @@ enum WarpStoreAlgorithm
   //!   regardless of items written per thread.
   //! * The local reordering incurs slightly longer latencies and throughput than the
   //!   direct ``cub::WARP_STORE_DIRECT`` and ``cub::WARP_STORE_VECTORIZE`` alternatives.
-  //! 
+  //!
   //! @endrst
   WARP_STORE_TRANSPOSE
 };
 
-
 //! @rst
-//! The WarpStore class provides :ref:`collective <collective-primitives>` 
+//! The WarpStore class provides :ref:`collective <collective-primitives>`
 //! data movement methods for writing a :ref:`blocked arrangement <flexible-data-arrangement>`
 //! of items partitioned across a CUDA warp to a linear segment of memory.
-//! 
+//!
 //! Overview
 //! ++++++++++++++++
 //!
@@ -141,21 +138,21 @@ enum WarpStoreAlgorithm
 //!   data types, granularity sizes, etc.
 //! * WarpStore can be optionally specialized by different data movement strategies:
 //!
-//!   #. :cpp:enumerator:`cub::WARP_STORE_DIRECT`: 
-//!      a :ref:`blocked arrangement <flexible-data-arrangement>` of data is written directly to 
-//!      memory. 
-//!   #. :cpp:enumerator:`cub::WARP_STORE_STRIPED`: 
-//!      a :ref:`striped arrangement <flexible-data-arrangement>` of data is written directly to 
-//!      memory. 
-//!   #. :cpp:enumerator:`cub::WARP_STORE_VECTORIZE`: 
-//!      a :ref:`blocked arrangement <flexible-data-arrangement>` of data is written directly to 
-//!      memory using CUDA's built-in vectorized stores as a coalescing optimization. 
-//!   #. :cpp:enumerator:`cub::WARP_STORE_TRANSPOSE`: 
-//!      a :ref:`blocked arrangement <flexible-data-arrangement>` is locally transposed into a 
-//!      :ref:`striped arrangement <flexible-data-arrangement>` which is then written to memory. 
+//!   #. :cpp:enumerator:`cub::WARP_STORE_DIRECT`:
+//!      a :ref:`blocked arrangement <flexible-data-arrangement>` of data is written directly to
+//!      memory.
+//!   #. :cpp:enumerator:`cub::WARP_STORE_STRIPED`:
+//!      a :ref:`striped arrangement <flexible-data-arrangement>` of data is written directly to
+//!      memory.
+//!   #. :cpp:enumerator:`cub::WARP_STORE_VECTORIZE`:
+//!      a :ref:`blocked arrangement <flexible-data-arrangement>` of data is written directly to
+//!      memory using CUDA's built-in vectorized stores as a coalescing optimization.
+//!   #. :cpp:enumerator:`cub::WARP_STORE_TRANSPOSE`:
+//!      a :ref:`blocked arrangement <flexible-data-arrangement>` is locally transposed into a
+//!      :ref:`striped arrangement <flexible-data-arrangement>` which is then written to memory.
 //!
 //! * @rowmajor
-//! 
+//!
 //! A Simple Example
 //! ++++++++++++++++
 //!
@@ -169,30 +166,30 @@ enum WarpStoreAlgorithm
 //! .. code-block:: c++
 //!
 //!    #include <cub/cub.cuh>   // or equivalently <cub/warp/warp_store.cuh>
-//! 
+//!
 //!    __global__ void ExampleKernel(int *d_data, ...)
 //!    {
 //!        constexpr int warp_threads = 16;
 //!        constexpr int block_threads = 256;
 //!        constexpr int items_per_thread = 4;
-//! 
+//!
 //!        // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
 //!        using WarpStoreT = WarpStore<int,
 //!                                     items_per_thread,
 //!                                     cub::WARP_STORE_TRANSPOSE,
 //!                                     warp_threads>;
-//! 
+//!
 //!        constexpr int warps_in_block = block_threads / warp_threads;
 //!        constexpr int tile_size = items_per_thread * warp_threads;
 //!        const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
-//! 
+//!
 //!        // Allocate shared memory for WarpStore
 //!        __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
-//! 
+//!
 //!        // Obtain a segment of consecutive items that are blocked across threads
 //!        int thread_data[4];
 //!        ...
-//! 
+//!
 //!        // Store items to linear memory
 //!        WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data);
 //!
@@ -200,30 +197,30 @@ enum WarpStoreAlgorithm
 //! ``{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }``.
 //! The output ``d_data`` will be ``0, 1, 2, 3, 4, 5, ...``.
 //! @endrst
-//! 
+//!
 //! @tparam T
 //!   The type of data to be written.
-//! 
+//!
 //! @tparam ITEMS_PER_THREAD
 //!   The number of consecutive items partitioned onto each thread.
-//! 
+//!
 //! @tparam ALGORITHM
 //!   [optional] cub::WarpStoreAlgorithm tuning policy enumeration.
 //!   default: cub::WARP_STORE_DIRECT.
-//! 
+//!
 //! @tparam LOGICAL_WARP_THREADS
 //!   [optional] The number of threads per "logical" warp (may be less
 //!   than the number of hardware warp threads). Default is the warp size of the
 //!   targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a
 //!   power of two.
-//! 
+//!
 //! @tparam LEGACY_PTX_ARCH
 //!   Unused.
-template <typename T,
-          int ITEMS_PER_THREAD,
-          WarpStoreAlgorithm ALGORITHM = WARP_STORE_DIRECT,
-          int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS,
-          int LEGACY_PTX_ARCH = 0>
+template <typename T,
+          int ITEMS_PER_THREAD,
+          WarpStoreAlgorithm ALGORITHM = WARP_STORE_DIRECT,
+          int LOGICAL_WARP_THREADS     = CUB_PTX_WARP_THREADS,
+          int LEGACY_PTX_ARCH          = 0>
 class WarpStore
 {
   static_assert(PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
@@ -232,7 +229,6 @@ class WarpStore
   constexpr static bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0);
 
 private:
-
   /// Store helper
   template <WarpStoreAlgorithm _POLICY, int DUMMY>
   struct StoreInternal;
@@ -244,14 +240,12 @@ private:
 
     int linear_tid;
 
-    __device__ __forceinline__ StoreInternal(TempStorage &/*temp_storage*/,
-                                             int linear_tid)
-      : linear_tid(linear_tid)
+    __device__ __forceinline__ StoreInternal(TempStorage & /*temp_storage*/, int linear_tid)
+        : linear_tid(linear_tid)
     {}
 
     template <typename OutputIteratorT>
-    __device__ __forceinline__ void Store(OutputIteratorT block_itr,
-                                          T (&items)[ITEMS_PER_THREAD])
+    __device__ __forceinline__ void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD])
     {
       StoreDirectBlocked(linear_tid, block_itr, items);
     }
@@ -265,7 +259,6 @@ private:
     }
   };
 
-
   template <int DUMMY>
   struct StoreInternal<WARP_STORE_STRIPED, DUMMY>
   {
@@ -273,14 +266,12 @@ private:
 
     int linear_tid;
 
-    __device__ __forceinline__ StoreInternal(TempStorage & /*temp_storage*/,
-                                             int linear_tid)
+    __device__ __forceinline__ StoreInternal(TempStorage & /*temp_storage*/, int linear_tid)
       : linear_tid(linear_tid)
     {}
 
     template <typename OutputIteratorT>
-    __device__ __forceinline__ void Store(OutputIteratorT block_itr,
-                                          T (&items)[ITEMS_PER_THREAD])
+    __device__ __forceinline__ void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD])
     {
       StoreDirectStriped<LOGICAL_WARP_THREADS>(linear_tid, block_itr, items);
     }
@@ -290,14 +281,10 @@ private:
                                           T (&items)[ITEMS_PER_THREAD],
                                           int valid_items)
     {
-      StoreDirectStriped<LOGICAL_WARP_THREADS>(linear_tid,
-                                               block_itr,
-                                               items,
-                                               valid_items);
+      StoreDirectStriped<LOGICAL_WARP_THREADS>(linear_tid, block_itr, items, valid_items);
     }
   };
 
-
   template <int DUMMY>
   struct StoreInternal<WARP_STORE_VECTORIZE, DUMMY>
   {
@@ -305,20 +292,17 @@ private:
 
     int linear_tid;
 
-    __device__ __forceinline__ StoreInternal(TempStorage & /*temp_storage*/,
-                                             int linear_tid)
+    __device__ __forceinline__ StoreInternal(TempStorage & /*temp_storage*/, int linear_tid)
      : linear_tid(linear_tid)
     {}
 
-    __device__ __forceinline__ void Store(T *block_ptr,
-                                          T (&items)[ITEMS_PER_THREAD])
+    __device__ __forceinline__ void Store(T *block_ptr, T (&items)[ITEMS_PER_THREAD])
    {
       StoreDirectBlockedVectorized(linear_tid, block_ptr, items);
     }
 
     template <typename OutputIteratorT>
-    __device__ __forceinline__ void Store(OutputIteratorT block_itr,
-                                          T (&items)[ITEMS_PER_THREAD])
+    __device__ __forceinline__ void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD])
     {
       StoreDirectBlocked(linear_tid, block_itr, items);
     }
@@ -332,31 +316,28 @@ private:
     }
   };
 
-
   template <int DUMMY>
   struct StoreInternal<WARP_STORE_TRANSPOSE, DUMMY>
   {
-    using WarpExchangeT =
-      WarpExchange<T, ITEMS_PER_THREAD, LOGICAL_WARP_THREADS>;
+    using WarpExchangeT = WarpExchange<T, ITEMS_PER_THREAD, LOGICAL_WARP_THREADS>;
 
     struct _TempStorage : WarpExchangeT::TempStorage
     {};
 
-    struct TempStorage : Uninitialized<_TempStorage> {};
+    struct TempStorage : Uninitialized<_TempStorage>
+    {};
 
     _TempStorage &temp_storage;
 
     int linear_tid;
 
-    __device__ __forceinline__ StoreInternal(TempStorage &temp_storage,
-                                             int linear_tid)
+    __device__ __forceinline__ StoreInternal(TempStorage &temp_storage, int linear_tid)
      : temp_storage(temp_storage.Alias())
      , linear_tid(linear_tid)
    {}
 
     template <typename OutputIteratorT>
-    __device__ __forceinline__ void Store(OutputIteratorT block_itr,
-                                          T (&items)[ITEMS_PER_THREAD])
+    __device__ __forceinline__ void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD])
     {
       WarpExchangeT(temp_storage).BlockedToStriped(items, items);
       StoreDirectStriped<LOGICAL_WARP_THREADS>(linear_tid, block_itr, items);
@@ -368,35 +349,29 @@ private:
                                           int valid_items)
     {
       WarpExchangeT(temp_storage).BlockedToStriped(items, items);
-      StoreDirectStriped<LOGICAL_WARP_THREADS>(linear_tid,
-                                               block_itr,
-                                               items,
-                                               valid_items);
+      StoreDirectStriped<LOGICAL_WARP_THREADS>(linear_tid, block_itr, items, valid_items);
     }
   };
 
-
   /// Internal store implementation to use
   using InternalStore = StoreInternal<ALGORITHM, 0>;
 
   /// Shared memory storage layout type
   using _TempStorage = typename InternalStore::TempStorage;
 
-
-  __device__ __forceinline__ _TempStorage& PrivateStorage()
+  __device__ __forceinline__ _TempStorage &PrivateStorage()
   {
     __shared__ _TempStorage private_storage;
     return private_storage;
   }
 
-
   _TempStorage &temp_storage;
 
   int linear_tid;
 
 public:
-
-  struct TempStorage : Uninitialized<_TempStorage> {};
+  struct TempStorage : Uninitialized<_TempStorage>
+  {};
 
   //! @name Collective constructors
   //! @{
@@ -421,9 +396,9 @@ public:
 
   //! @rst
   //! Store items into a linear segment of memory.
-  //! 
+  //!
   //! @smemwarpreuse
-  //! 
+  //!
   //! Snippet
   //! +++++++
   //!
@@ -437,30 +412,30 @@ public:
   //! .. code-block:: c++
   //!
   //!    #include <cub/cub.cuh>   // or equivalently <cub/warp/warp_store.cuh>
-  //! 
+  //!
   //!    __global__ void ExampleKernel(int *d_data, ...)
   //!    {
   //!        constexpr int warp_threads = 16;
   //!        constexpr int block_threads = 256;
   //!        constexpr int items_per_thread = 4;
-  //! 
+  //!
   //!        // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
   //!        using WarpStoreT = WarpStore<int,
   //!                                     items_per_thread,
   //!                                     cub::WARP_STORE_TRANSPOSE,
   //!                                     warp_threads>;
-  //! 
+  //!
   //!        constexpr int warps_in_block = block_threads / warp_threads;
   //!        constexpr int tile_size = items_per_thread * warp_threads;
   //!        const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
-  //! 
+  //!
   //!        // Allocate shared memory for WarpStore
   //!        __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
-  //! 
+  //!
   //!        // Obtain a segment of consecutive items that are blocked across threads
   //!        int thread_data[4];
   //!        ...
-  //! 
+  //!
   //!        // Store items to linear memory
   //!        WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data);
   //!
@@ -468,21 +443,20 @@ public:
   //! ``{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }``.
   //! The output ``d_data`` will be ``0, 1, 2, 3, 4, 5, ...``.
   //! @endrst
-  //! 
+  //!
   //! @param[out] block_itr The thread block's base output iterator for storing to
   //! @param[in] items Data to store
   template <typename OutputIteratorT>
-  __device__ __forceinline__ void Store(OutputIteratorT block_itr,
-                                        T (&items)[ITEMS_PER_THREAD])
+  __device__ __forceinline__ void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD])
   {
     InternalStore(temp_storage, linear_tid).Store(block_itr, items);
   }
 
   //! @rst
   //! Store items into a linear segment of memory, guarded by range.
-  //! 
+  //!
   //! @smemwarpreuse
-  //! 
+  //!
   //! Snippet
   //! +++++++
   //!
@@ -496,30 +470,30 @@ public:
   //! .. code-block:: c++
   //!
   //!    #include <cub/cub.cuh>   // or equivalently <cub/warp/warp_store.cuh>
-  //! 
+  //!
   //!    __global__ void ExampleKernel(int *d_data, int valid_items ...)
   //!    {
   //!        constexpr int warp_threads = 16;
   //!        constexpr int block_threads = 256;
   //!        constexpr int items_per_thread = 4;
-  //! 
+  //!
   //!        // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
   //!        using WarpStoreT = WarpStore<int,
   //!                                     items_per_thread,
   //!                                     cub::WARP_STORE_TRANSPOSE,
   //!                                     warp_threads>;
-  //! 
+  //!
   //!        constexpr int warps_in_block = block_threads / warp_threads;
   //!        constexpr int tile_size = items_per_thread * warp_threads;
   //!        const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
-  //! 
+  //!
   //!        // Allocate shared memory for WarpStore
   //!        __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
-  //! 
+  //!
   //!        // Obtain a segment of consecutive items that are blocked across threads
   //!        int thread_data[4];
   //!        ...
-  //! 
+  //!
   //!        // Store items to linear memory
   //!        WarpStoreT(temp_storage[warp_id]).Store(
   //!          d_data + warp_id * tile_size, thread_data, valid_items);
@@ -530,11 +504,11 @@ public:
   //! with only the first two threads being unmasked to store portions of valid
   //! data.
   //! @endrst
-  //! 
+  //!
   //! @param[out] block_itr The thread block's base output iterator for storing to
   //! @param[in] items Data to store
   //! @param[in] valid_items Number of valid items to write
-  //! 
+  //!
   template <typename OutputIteratorT>
   __device__ __forceinline__ void Store(OutputIteratorT block_itr,
                                         T (&items)[ITEMS_PER_THREAD],
@@ -546,5 +520,4 @@ public:
 
   //! @} end member group
 };
 
-
 CUB_NAMESPACE_END
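
Reviewer note (not part of the patch): for anyone who wants to exercise the guarded `Store()` overload reformatted above, here is a minimal, compilable sketch. It specializes `cub::WarpStore` with `WARP_STORE_TRANSPOSE` for a single 32-thread logical warp; the kernel name `StorePartialTile`, the `d_out` buffer, and the host-side setup are illustrative assumptions, not part of the CUB API.

// Sketch: guarded warp-wide store with cub::WarpStore (assumes one
// 32-thread logical warp per block; names other than CUB's are made up).
#include <cub/warp/warp_store.cuh>

#include <cuda_runtime.h>
#include <cstdio>

constexpr int items_per_thread = 4;
constexpr int warp_threads     = 32;
constexpr int tile_size        = warp_threads * items_per_thread;

__global__ void StorePartialTile(int *d_out, int valid_items)
{
  // Transpose the blocked arrangement to striped in shared memory,
  // then issue coalesced, range-guarded stores.
  using WarpStoreT =
    cub::WarpStore<int, items_per_thread, cub::WARP_STORE_TRANSPOSE, warp_threads>;

  __shared__ typename WarpStoreT::TempStorage temp_storage;

  // Blocked arrangement: thread 0 owns {0,1,2,3}, thread 1 owns {4,5,6,7}, ...
  int thread_data[items_per_thread];
  for (int i = 0; i < items_per_thread; ++i)
  {
    thread_data[i] = static_cast<int>(threadIdx.x) * items_per_thread + i;
  }

  // Only the first valid_items outputs are written; the rest of d_out is untouched.
  WarpStoreT(temp_storage).Store(d_out, thread_data, valid_items);
}

int main()
{
  int *d_out{};
  cudaMalloc(&d_out, tile_size * sizeof(int));
  cudaMemset(d_out, 0, tile_size * sizeof(int));

  StorePartialTile<<<1, warp_threads>>>(d_out, 5); // store items 0..4 only

  int h_out[tile_size]{};
  cudaMemcpy(h_out, d_out, tile_size * sizeof(int), cudaMemcpyDeviceToHost);
  printf("h_out[0..7] = %d %d %d %d %d %d %d %d\n",
         h_out[0], h_out[1], h_out[2], h_out[3],
         h_out[4], h_out[5], h_out[6], h_out[7]);

  cudaFree(d_out);
  return 0;
}

With valid_items = 5, only h_out[0..4] receive 0 1 2 3 4 and the zero-initialized tail stays untouched, which is the behavior the "first two threads being unmasked" passage in the docs describes for a 16-thread logical warp.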