diff --git a/cub/cub/block/block_exchange.cuh b/cub/cub/block/block_exchange.cuh index 256c7fb488..a781d68e68 100644 --- a/cub/cub/block/block_exchange.cuh +++ b/cub/cub/block/block_exchange.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2024, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,8 +26,9 @@ * ******************************************************************************/ -//! @file The cub::BlockExchange class provides :ref:`collective ` methods for -//! rearranging data partitioned across a CUDA thread block. +//! @file +//! The cub::BlockExchange class provides :ref:`collective ` methods for +//! rearranging data partitioned across a CUDA thread block. #pragma once @@ -55,11 +56,10 @@ CUB_NAMESPACE_BEGIN //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! -//! - It is commonplace for blocks of threads to rearrange data items between -//! threads. For example, the device-accessible memory subsystem prefers access patterns -//! where data items are "striped" across threads (where consecutive threads access consecutive items), -//! yet most block-wide operations prefer a "blocked" partitioning of items across threads -//! (where consecutive items belong to a single thread). +//! - It is commonplace for blocks of threads to rearrange data items between threads. For example, the +//! device-accessible memory subsystem prefers access patterns where data items are "striped" across threads (where +//! consecutive threads access consecutive items), yet most block-wide operations prefer a "blocked" partitioning of +//! items across threads (where consecutive items belong to a single thread). //! - BlockExchange supports the following types of data exchanges: //! //! - Transposing between :ref:`blocked ` and :ref:`striped ` @@ -76,8 +76,8 @@ CUB_NAMESPACE_BEGIN //! //! @blockcollective{BlockExchange} //! -//! The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement -//! of 512 integer items partitioned across 128 threads where each thread owns 4 items. +//! The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement of 512 integer items +//! partitioned across 128 threads where each thread owns 4 items. //! //! .. code-block:: c++ //! @@ -98,9 +98,8 @@ CUB_NAMESPACE_BEGIN //! // Collectively exchange data into a blocked arrangement across threads //! BlockExchange(temp_storage).StripedToBlocked(thread_data); //! -//! Suppose the set of striped input ``thread_data`` across the block of threads is -//! ``{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }``. -//! The corresponding output ``thread_data`` in those threads will be +//! Suppose the set of striped input ``thread_data`` across the block of threads is ``{ [0,128,256,384], +//! [1,129,257,385], ..., [127,255,383,511] }``. The corresponding output ``thread_data`` in those threads will be //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. //! //! Performance Considerations @@ -112,33 +111,33 @@ CUB_NAMESPACE_BEGIN //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! 
The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically shared memory with -//! BlockReduce and how to re-purpose the same memory region. This example can be easily adapted to -//! the storage required by BlockExchange. +//! BlockReduce and how to re-purpose the same memory region. This example can be easily adapted to the storage required +//! by BlockExchange. //! @endrst //! //! @tparam T -//! The data type to be exchanged +//! The data type to be exchanged //! //! @tparam BLOCK_DIM_X -//! The thread block length in threads along the X dimension +//! The thread block length in threads along the X dimension //! //! @tparam ITEMS_PER_THREAD -//! The number of items partitioned onto each thread. +//! The number of items partitioned onto each thread. //! //! @tparam WARP_TIME_SLICING -//! **[optional]** When `true`, only use enough shared memory for a single warp's worth of tile data, -//! time-slicing the block-wide exchange over multiple synchronized rounds. -//! Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false) +//! **[optional]** When `true`, only use enough shared memory for a single warp's worth of +//! tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint +//! at the expense of decreased parallelism. (Default: false) //! //! @tparam BLOCK_DIM_Y -//! **[optional]** The thread block length in threads along the Y dimension (default: 1) +//! **[optional]** The thread block length in threads along the Y dimension (default: 1) //! //! @tparam BLOCK_DIM_Z -//! **[optional]** The thread block length in threads along the Z dimension (default: 1) +//! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! //! @tparam LEGACY_PTX_ARCH -//! [optional] Unused. -template [optional] Unused. +template class BlockExchange { -private: - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - - LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0), - WARP_THREADS = 1 << LOG_WARP_THREADS, - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - - LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(0), - SMEM_BANKS = 1 << LOG_SMEM_BANKS, - - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - - TIME_SLICES = (WARP_TIME_SLICING) ? WARPS : 1, - - TIME_SLICED_THREADS = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS, - TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD, - - WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS), - WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD, - - // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise - // we can typically use 128b loads) - INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo::VALUE), - PADDING_ITEMS = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0, - }; + static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; ///< The thread block size in threads + static constexpr int WARP_THREADS = CUB_WARP_THREADS(0); + static constexpr int WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS; // TODO(bgruber): use ceil_div in + // C++14 + static constexpr int LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(0); + + static constexpr int TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD; + static constexpr int TIME_SLICES = WARP_TIME_SLICING ? 
WARPS : 1; + static constexpr int TIME_SLICED_THREADS = WARP_TIME_SLICING ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS; + static constexpr int TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD; + static constexpr int WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS); + static constexpr int WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD; + + // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise + // we can typically use 128b loads) + static constexpr bool INSERT_PADDING = ITEMS_PER_THREAD > 4 && PowerOfTwo::VALUE; + static constexpr int PADDING_ITEMS = INSERT_PADDING ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0; /// Shared memory storage layout type - struct __align__(16) _TempStorage + struct alignas(16) _TempStorage { - InputT buff[TIME_SLICED_ITEMS + PADDING_ITEMS]; + T buff[TIME_SLICED_ITEMS + PADDING_ITEMS]; }; public: /// @smemstorage{BlockExchange} - struct TempStorage : Uninitialized<_TempStorage> - {}; + using TempStorage = Uninitialized<_TempStorage>; private: - /// Shared storage reference _TempStorage& temp_storage; - /// Linear thread-id - unsigned int linear_tid; - unsigned int lane_id; - unsigned int warp_id; - unsigned int warp_offset; + // TODO(bgruber): can we use signed int here? Only these variables are unsigned: + unsigned int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z); + unsigned int lane_id = LaneId(); + unsigned int warp_id = WARPS == 1 ? 0 : linear_tid / WARP_THREADS; + unsigned int warp_offset = warp_id * WARP_TIME_SLICED_ITEMS; /// Internal storage allocator _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() @@ -205,8 +190,8 @@ private: return private_storage; } - //! @brief Transposes data items from **blocked** arrangement to **striped** arrangement. - //! Specialized for no timeslicing. + //! @brief Transposes data items from **blocked** arrangement to **striped** arrangement. Specialized for no + //! timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -215,35 +200,37 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) + int item_offset = linear_tid * ITEMS_PER_THREAD + i; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) + int item_offset = i * BLOCK_THREADS + linear_tid; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } - //! 
@brief Transposes data items from **blocked** arrangement to **striped** - //! arrangement. Specialized for warp-timeslicing. + //! @brief Transposes data items from **blocked** arrangement to **striped** arrangement. Specialized for + //! warp-timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -252,51 +239,51 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { - InputT temp_items[ITEMS_PER_THREAD]; + T temp_items[ITEMS_PER_THREAD]; #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + for (int slice = 0; slice < TIME_SLICES; slice++) { - const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; - const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + const int slice_offset = slice * TIME_SLICED_ITEMS; + const int slice_oob = slice_offset + TIME_SLICED_ITEMS; CTA_SYNC(); - if (warp_id == SLICE) + if (warp_id == slice) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) + int item_offset = lane_id * ITEMS_PER_THREAD + i; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { // Read a strip of items - const int STRIP_OFFSET = ITEM * BLOCK_THREADS; - const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + const int strip_offset = i * BLOCK_THREADS; + const int strip_oob = strip_offset + BLOCK_THREADS; - if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + if (slice_offset < strip_oob && slice_oob > strip_offset) { - int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + int item_offset = strip_offset + linear_tid - slice_offset; + if (item_offset >= 0 && item_offset < TIME_SLICED_ITEMS) { - if (INSERT_PADDING) + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - temp_items[ITEM] = temp_storage.buff[item_offset]; + temp_items[i] = temp_storage.buff[item_offset]; } } } @@ -304,14 +291,14 @@ private: // Copy #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - output_items[ITEM] = temp_items[ITEM]; + output_items[i] = temp_items[i]; } } - //! @brief Transposes data items from **blocked** arrangement to **warp-striped** arrangement. - //! Specialized for no timeslicing + //! @brief Transposes data items from **blocked** arrangement to **warp-striped** arrangement. Specialized for no + //! timeslicing //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -320,35 +307,37 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. 
template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) + int item_offset = warp_offset + i + (lane_id * ITEMS_PER_THREAD); + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } WARP_SYNC(0xffffffff); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) + int item_offset = warp_offset + (i * WARP_TIME_SLICED_THREADS) + lane_id; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } - //! @brief Transposes data items from **blocked** arrangement to **warp-striped** arrangement. - //! Specialized for warp-timeslicing + //! @brief Transposes data items from **blocked** arrangement to **warp-striped** arrangement. Specialized for + //! warp-timeslicing //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -357,71 +346,71 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. 
template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { if (warp_id == 0) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) + int item_offset = i + lane_id * ITEMS_PER_THREAD; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } WARP_SYNC(0xffffffff); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) + int item_offset = i * WARP_TIME_SLICED_THREADS + lane_id; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } #pragma unroll - for (unsigned int SLICE = 1; SLICE < TIME_SLICES; ++SLICE) + for (int slice = 1; slice < TIME_SLICES; ++slice) { CTA_SYNC(); - if (warp_id == SLICE) + if (warp_id == slice) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) + int item_offset = i + lane_id * ITEMS_PER_THREAD; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } WARP_SYNC(0xffffffff); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) + int item_offset = i * WARP_TIME_SLICED_THREADS + lane_id; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } } } - //! @brief Transposes data items from **striped** arrangement to **blocked** arrangement. - //! Specialized for no timeslicing. + //! @brief Transposes data items from **striped** arrangement to **blocked** arrangement. Specialized for no + //! timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -430,36 +419,38 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. 
template _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) + int item_offset = i * BLOCK_THREADS + linear_tid; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } CTA_SYNC(); // No timeslicing #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) + int item_offset = linear_tid * ITEMS_PER_THREAD + i; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } - //! @brief Transposes data items from **striped** arrangement to **blocked** arrangement. - //! Specialized for warp-timeslicing. + //! @brief Transposes data items from **striped** arrangement to **blocked** arrangement. Specialized for + //! warp-timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -468,67 +459,67 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. 
template _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { // Warp time-slicing - InputT temp_items[ITEMS_PER_THREAD]; + T temp_items[ITEMS_PER_THREAD]; #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + for (int slice = 0; slice < TIME_SLICES; slice++) { - const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; - const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + const int slice_offset = slice * TIME_SLICED_ITEMS; + const int slice_oob = slice_offset + TIME_SLICED_ITEMS; CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { // Write a strip of items - const int STRIP_OFFSET = ITEM * BLOCK_THREADS; - const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + const int strip_offset = i * BLOCK_THREADS; + const int strip_oob = strip_offset + BLOCK_THREADS; - if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + if (slice_offset < strip_oob && slice_oob > strip_offset) { - int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + int item_offset = strip_offset + linear_tid - slice_offset; + if (item_offset >= 0 && item_offset < TIME_SLICED_ITEMS) { - if (INSERT_PADDING) + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } } } CTA_SYNC(); - if (warp_id == SLICE) + if (warp_id == slice) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) + int item_offset = lane_id * ITEMS_PER_THREAD + i; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - temp_items[ITEM] = temp_storage.buff[item_offset]; + temp_items[i] = temp_storage.buff[item_offset]; } } } // Copy #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - output_items[ITEM] = temp_items[ITEM]; + output_items[i] = temp_items[i]; } } - //! @brief Transposes data items from **warp-striped** arrangement to **blocked** arrangement. - //! Specialized for no timeslicing + //! @brief Transposes data items from **warp-striped** arrangement to **blocked** arrangement. Specialized for no + //! timeslicing //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -537,35 +528,37 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. 
template _CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) + int item_offset = warp_offset + (i * WARP_TIME_SLICED_THREADS) + lane_id; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } WARP_SYNC(0xffffffff); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) + int item_offset = warp_offset + i + (lane_id * ITEMS_PER_THREAD); + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(output_items + ITEM, temp_storage.buff[item_offset]); + detail::uninitialized_copy_single(output_items + i, temp_storage.buff[item_offset]); } } - //! @brief Transposes data items from **warp-striped** arrangement to **blocked** arrangement. - //! Specialized for warp-timeslicing + //! @brief Transposes data items from **warp-striped** arrangement to **blocked** arrangement. Specialized for + //! warp-timeslicing //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -574,44 +567,43 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { #pragma unroll - for (unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE) + for (int slice = 0; slice < TIME_SLICES; ++slice) { CTA_SYNC(); - if (warp_id == SLICE) + if (warp_id == slice) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) + int item_offset = i * WARP_TIME_SLICED_THREADS + lane_id; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } WARP_SYNC(0xffffffff); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) + int item_offset = i + lane_id * ITEMS_PER_THREAD; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } } } - //! @brief Exchanges data items annotated by rank into **blocked** arrangement. - //! Specialized for no timeslicing. + //! 
@brief Exchanges data items annotated by rank into **blocked** arrangement. Specialized for no timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -623,38 +615,37 @@ private: //! Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ranks[ITEM]; - if (INSERT_PADDING) + int item_offset = ranks[i]; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) + int item_offset = linear_tid * ITEMS_PER_THREAD + i; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } - //! @brief Exchanges data items annotated by rank into **blocked** arrangement. - //! Specialized for warp-timeslicing. + //! @brief Exchanges data items annotated by rank into **blocked** arrangement. Specialized for warp-timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -666,61 +657,60 @@ private: //! 
Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { - InputT temp_items[ITEMS_PER_THREAD]; + T temp_items[ITEMS_PER_THREAD]; #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + for (int slice = 0; slice < TIME_SLICES; slice++) { CTA_SYNC(); - const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE; + const int slice_offset = TIME_SLICED_ITEMS * slice; #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ranks[ITEM] - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) + int item_offset = ranks[i] - slice_offset; + if (item_offset >= 0 && item_offset < WARP_TIME_SLICED_ITEMS) { - if (INSERT_PADDING) + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } } CTA_SYNC(); - if (warp_id == SLICE) + if (warp_id == slice) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) + int item_offset = lane_id * ITEMS_PER_THREAD + i; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - temp_items[ITEM] = temp_storage.buff[item_offset]; + temp_items[i] = temp_storage.buff[item_offset]; } } } // Copy #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - output_items[ITEM] = temp_items[ITEM]; + output_items[i] = temp_items[i]; } } - //! @brief Exchanges data items annotated by rank into **striped** arrangement. - //! Specialized for no timeslicing. + //! @brief Exchanges data items annotated by rank into **striped** arrangement. Specialized for no timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -732,38 +722,37 @@ private: //! 
Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ranks[ITEM]; - if (INSERT_PADDING) + int item_offset = ranks[i]; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) + int item_offset = i * BLOCK_THREADS + linear_tid; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } - //! @brief Exchanges data items annotated by rank into **striped** arrangement. - //! Specialized for warp-timeslicing. + //! @brief Exchanges data items annotated by rank into **striped** arrangement. Specialized for warp-timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -775,54 +764,54 @@ private: //! Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { - InputT temp_items[ITEMS_PER_THREAD]; + T temp_items[ITEMS_PER_THREAD]; #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + for (int slice = 0; slice < TIME_SLICES; slice++) { - const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; - const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + const int slice_offset = slice * TIME_SLICED_ITEMS; + const int slice_oob = slice_offset + TIME_SLICED_ITEMS; CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ranks[ITEM] - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) + int item_offset = ranks[i] - slice_offset; + if (item_offset >= 0 && item_offset < WARP_TIME_SLICED_ITEMS) { - if (INSERT_PADDING) + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { // Read a strip of items - const int STRIP_OFFSET = ITEM * BLOCK_THREADS; - const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + const int strip_offset = i * BLOCK_THREADS; + const int strip_oob = strip_offset + BLOCK_THREADS; - if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + if (slice_offset < strip_oob && slice_oob > strip_offset) { - int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; - if 
((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + int item_offset = strip_offset + linear_tid - slice_offset; + if (item_offset >= 0 && item_offset < TIME_SLICED_ITEMS) { - if (INSERT_PADDING) + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - temp_items[ITEM] = temp_storage.buff[item_offset]; + temp_items[i] = temp_storage.buff[item_offset]; } } } @@ -830,9 +819,9 @@ private: // Copy #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - output_items[ITEM] = temp_items[ITEM]; + output_items[i] = temp_items[i]; } } @@ -840,29 +829,15 @@ public: //! @name Collective constructors //! @{ - /** - * @brief Collective constructor using a private static allocation of shared memory as temporary storage. - */ + //! @brief Collective constructor using a private static allocation of shared memory as temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE BlockExchange() : temp_storage(PrivateStorage()) - , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - , lane_id(LaneId()) - , warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS) - , warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) {} - /** - * @brief Collective constructor using the specified memory allocation as temporary storage. - * - * @param[in] temp_storage - * Reference to memory allocation having layout type TempStorage - */ + //! @brief Collective constructor using the specified memory allocation as temporary storage. + //! @param[in] temp_storage Reference to memory allocation having layout type TempStorage _CCCL_DEVICE _CCCL_FORCEINLINE BlockExchange(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - , lane_id(LaneId()) - , warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS) - , warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) {} //! @} end member group @@ -899,10 +874,9 @@ public: //! // Collectively exchange data into a blocked arrangement across threads //! BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data); //! - //! Suppose the set of striped input ``thread_data`` across the block of threads is - //! ``{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }`` after loading from - //! device-accessible memory. The corresponding output ``thread_data`` in those threads will be - //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. + //! Suppose the set of striped input ``thread_data`` across the block of threads is ``{ [0,128,256,384], + //! [1,129,257,385], ..., [127,255,383,511] }`` after loading from device-accessible memory. The corresponding output + //! ``thread_data`` in those threads will be ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. //! @endrst //! //! @param[in] input_items @@ -912,7 +886,7 @@ public: //! Items from exchange, converting between **striped** and **blocked** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void - StripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) + StripedToBlocked(const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { StripedToBlocked(input_items, output_items, Int2Type()); } @@ -950,11 +924,10 @@ public: //! // Store data striped across block threads into an ordered tile //! cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); //! - //! Suppose the set of blocked input ``thread_data`` across the block of threads is - //! 
``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. - //! The corresponding output ``thread_data`` in those threads will be - //! ``{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }`` in - //! preparation for storing to device-accessible memory. + //! Suppose the set of blocked input ``thread_data`` across the block of threads is ``{ [0,1,2,3], [4,5,6,7], + //! [8,9,10,11], ..., [508,509,510,511] }``. The corresponding output ``thread_data`` in those threads will be + //! ``{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }`` in preparation for storing to device-accessible + //! memory. //! @endrst //! //! @param[in] input_items @@ -964,7 +937,7 @@ public: //! Items from exchange, converting between **striped** and **blocked** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void - BlockedToStriped(InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) + BlockedToStriped(const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { BlockedToStriped(input_items, output_items, Int2Type()); } @@ -1001,12 +974,11 @@ public: //! // Collectively exchange data into a blocked arrangement across threads //! BlockExchange(temp_storage).WarpStripedToBlocked(thread_data); //! - //! Suppose the set of warp-striped input ``thread_data`` across the block of threads is - //! ``{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }`` - //! after loading from device-accessible memory. (The first 128 items are striped across - //! the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) - //! The corresponding output ``thread_data`` in those threads will be - //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. + //! Suppose the set of warp-striped input ``thread_data`` across the block of threads is ``{ [0,32,64,96], + //! [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }`` after loading from device-accessible memory. (The first 128 + //! items are striped across the first warp of 32 threads, the second 128 items are striped across the second warp, + //! etc.) The corresponding output ``thread_data`` in those threads will be ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], + //! ..., [508,509,510,511] }``. //! @endrst //! //! @param[in] input_items @@ -1016,7 +988,7 @@ public: //! Items from exchange, converting between **striped** and **blocked** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void - WarpStripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) + WarpStripedToBlocked(const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { WarpStripedToBlocked(input_items, output_items, Int2Type()); } @@ -1056,12 +1028,11 @@ public: //! // Store data striped across warp threads into an ordered tile //! cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); //! - //! Suppose the set of blocked input ``thread_data`` across the block of threads is - //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. - //! The corresponding output ``thread_data`` in those threads will be - //! ``{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }`` - //! in preparation for storing to device-accessible memory. (The first 128 items are striped - //! across the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) + //! 
Suppose the set of blocked input ``thread_data`` across the block of threads is ``{ [0,1,2,3], [4,5,6,7], + //! [8,9,10,11], ..., [508,509,510,511] }``. The corresponding output ``thread_data`` in those threads will be + //! ``{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }`` in preparation for storing to + //! device-accessible memory. (The first 128 items are striped across the first warp of 32 threads, the second 128 + //! items are striped across the second warp, etc.) //! @endrst //! //! @param[in] input_items @@ -1071,7 +1042,7 @@ public: //! Items from exchange, converting between **striped** and **blocked** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void - BlockedToWarpStriped(InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) + BlockedToWarpStriped(const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { BlockedToWarpStriped(input_items, output_items, Int2Type()); } @@ -1099,7 +1070,7 @@ public: //! Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { @@ -1126,7 +1097,7 @@ public: //! Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { @@ -1153,35 +1124,35 @@ public: //! Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStripedGuarded( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ranks[ITEM]; - if (INSERT_PADDING) + int item_offset = ranks[i]; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - if (ranks[ITEM] >= 0) + if (ranks[i] >= 0) { - temp_storage.buff[item_offset] = input_items[ITEM]; + temp_storage.buff[item_offset] = input_items[i]; } } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) + int item_offset = i * BLOCK_THREADS + linear_tid; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } @@ -1211,36 +1182,36 @@ public: //! 
Corresponding flag denoting item validity template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStripedFlagged( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], ValidFlag (&is_valid)[ITEMS_PER_THREAD]) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ranks[ITEM]; - if (INSERT_PADDING) + int item_offset = ranks[i]; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - if (is_valid[ITEM]) + if (is_valid[i]) { - temp_storage.buff[item_offset] = input_items[ITEM]; + temp_storage.buff[item_offset] = input_items[i]; } } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) + int item_offset = i * BLOCK_THREADS + linear_tid; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } @@ -1248,97 +1219,75 @@ public: #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - */ - _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked(InputT (&items)[ITEMS_PER_THREAD]) + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked(T (&items)[ITEMS_PER_THREAD]) { StripedToBlocked(items, items); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - */ - _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped(InputT (&items)[ITEMS_PER_THREAD]) + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped(T (&items)[ITEMS_PER_THREAD]) { BlockedToStriped(items, items); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - */ - _CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked(InputT (&items)[ITEMS_PER_THREAD]) + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + _CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked(T (&items)[ITEMS_PER_THREAD]) { WarpStripedToBlocked(items, items); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - */ - _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped(InputT (&items)[ITEMS_PER_THREAD]) + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped(T (&items)[ITEMS_PER_THREAD]) { BlockedToWarpStriped(items, items); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - * - * @param[in] ranks - * Corresponding scatter ranks - */ + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. 
+ /// + /// @param[in] ranks + /// Corresponding scatter ranks template - _CCCL_DEVICE _CCCL_FORCEINLINE void - ScatterToBlocked(InputT (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) + _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked(T (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToBlocked(items, items, ranks); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - * - * @param[in] ranks - * Corresponding scatter ranks - */ + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + /// @param[in] ranks + /// Corresponding scatter ranks template - _CCCL_DEVICE _CCCL_FORCEINLINE void - ScatterToStriped(InputT (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) + _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped(T (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToStriped(items, items, ranks); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - * - * @param[in] ranks - * Corresponding scatter ranks - */ + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + /// @param[in] ranks + /// Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void - ScatterToStripedGuarded(InputT (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) + ScatterToStripedGuarded(T (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToStripedGuarded(items, items, ranks); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - * - * @param[in] ranks - * Corresponding scatter ranks - * - * @param[in] is_valid - * Corresponding flag denoting item validity - */ + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + /// @param[in] ranks + /// Corresponding scatter ranks + /// @param[in] is_valid + /// Corresponding flag denoting item validity template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStripedFlagged( - InputT (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], ValidFlag (&is_valid)[ITEMS_PER_THREAD]) + T (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], ValidFlag (&is_valid)[ITEMS_PER_THREAD]) { ScatterToStriped(items, items, ranks, is_valid); } diff --git a/cub/cub/block/block_load.cuh b/cub/cub/block/block_load.cuh index 87adeb5451..76c073f1b5 100644 --- a/cub/cub/block/block_load.cuh +++ b/cub/cub/block/block_load.cuh @@ -26,7 +26,7 @@ * ******************************************************************************/ -//! @file Operations for reading linear tiles of data into the CUDA thread block. +//! @file block_load.cuh Operations for reading linear tiles of data into the CUDA thread block. #pragma once @@ -54,7 +54,6 @@ CUB_NAMESPACE_BEGIN //! Load a linear segment of items into a blocked arrangement across the thread block. //! //! @blocked -//! //! @endrst //! //! @tparam T @@ -63,27 +62,27 @@ CUB_NAMESPACE_BEGIN //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **[inferred]** The random-access iterator type for input iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! 
(e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D +//! thread blocks) //! -//! @param[in] block_itr +//! @param[in] block_src_it //! The thread block's base input iterator for loading from //! -//! @param[out] items -//! Data to load -template +//! @param[out] dst_items +//! Destination to load data into +template _CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectBlocked(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) +LoadDirectBlocked(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { // Load directly in thread-blocked order #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM]; + dst_items[i] = block_src_it[linear_tid * ITEMS_PER_THREAD + i]; } } @@ -100,31 +99,32 @@ LoadDirectBlocked(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEM //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **[inferred]** The random-access iterator type for input iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D +//! thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into //! -//! @param[in] valid_items -//! Number of valid items to load -template -_CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectBlocked(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) +//! @param[in] block_items_end +//! First out-of-bounds index when loading from block_src_it +template +_CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked( + int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - if ((linear_tid * ITEMS_PER_THREAD) + ITEM < valid_items) + const auto src_pos = linear_tid * ITEMS_PER_THREAD + i; + if (src_pos < block_items_end) { - items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM]; + dst_items[i] = block_src_it[src_pos]; } } } @@ -143,35 +143,39 @@ LoadDirectBlocked(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEM //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **[inferred]** The random-access iterator type for input \iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D +//! thread blocks) //! -//! @param[in] block_itr +//! @param[in] block_src_it //! 
The thread block's base input iterator for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into //! -//! @param[in] valid_items -//! Number of valid items to load +//! @param[in] block_items_end +//! First out-of-bounds index when loading from block_src_it //! //! @param[in] oob_default //! Default value to assign out-of-bound items -template +template _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked( - int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + int linear_tid, + RandomAccessIterator block_src_it, + T (&dst_items)[ITEMS_PER_THREAD], + int block_items_end, + DefaultT oob_default) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = oob_default; + dst_items[i] = oob_default; } - LoadDirectBlocked(linear_tid, block_itr, items, valid_items); + LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document @@ -179,58 +183,44 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked( //! @brief Internal implementation for load vectorization //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D +//! thread blocks) //! -//! @param[in] block_ptr +//! @param[in] block_src_ptr //! Input pointer for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into template _CCCL_DEVICE _CCCL_FORCEINLINE void -InternalLoadDirectBlockedVectorized(int linear_tid, T* block_ptr, T (&items)[ITEMS_PER_THREAD]) +InternalLoadDirectBlockedVectorized(int linear_tid, const T* block_src_ptr, T (&dst_items)[ITEMS_PER_THREAD]) { - // Biggest memory access word that T is a whole multiple of - using DeviceWord = typename UnitWord::DeviceWord; - + // Find biggest memory access word that T is a whole multiple of + using device_word_t = typename UnitWord::DeviceWord; _CCCL_DIAG_PUSH # if defined(CUB_CLANG_VERSION) && CUB_CLANG_VERSION >= 100000 _CCCL_DIAG_SUPPRESS_CLANG("-Wsizeof-array-div") # endif // defined(CUB_CLANG_VERSION) && CUB_CLANG_VERSION >= 100000 - enum - { - TOTAL_WORDS = sizeof(items) / sizeof(DeviceWord), - - VECTOR_SIZE = (TOTAL_WORDS % 4 == 0) ? 4 - : (TOTAL_WORDS % 2 == 0) ? 2 - : 1, - - VECTORS_PER_THREAD = TOTAL_WORDS / VECTOR_SIZE, - }; + constexpr int total_words = static_cast(sizeof(dst_items) / sizeof(device_word_t)); _CCCL_DIAG_POP + constexpr int vector_size = (total_words % 4 == 0) ? 4 : (total_words % 2 == 0) ? 
2 : 1; + constexpr int vectors_per_thread = total_words / vector_size; + using vector_t = typename CubVector::Type; - // Vector type - using Vector = typename CubVector::Type; - - // Vector items - Vector vec_items[VECTORS_PER_THREAD]; - - // Aliased input ptr - Vector* vec_ptr = reinterpret_cast(block_ptr) + (linear_tid * VECTORS_PER_THREAD); - -// Load directly in thread-blocked order + // Load into an array of vectors in thread-blocked order + vector_t vec_items[vectors_per_thread]; + const vector_t* vec_ptr = reinterpret_cast(block_src_ptr) + linear_tid * vectors_per_thread; # pragma unroll - for (int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++) + for (int i = 0; i < vectors_per_thread; i++) { - vec_items[ITEM] = ThreadLoad(vec_ptr + ITEM); + vec_items[i] = ThreadLoad(vec_ptr + i); } -// Copy +// Copy to destination # pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = *(reinterpret_cast(vec_items) + ITEM); + dst_items[i] = *(reinterpret_cast(vec_items) + i); } } @@ -258,19 +248,19 @@ InternalLoadDirectBlockedVectorized(int linear_tid, T* block_ptr, T (&items)[ITE //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + +//! linear_tid` for 2D thread blocks) //! -//! @param[in] block_ptr -//! Input pointer for loading from +//! @param[in] block_src_ptr +//! The thread block's base pointer for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! destination to load data into template _CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectBlockedVectorized(int linear_tid, T* block_ptr, T (&items)[ITEMS_PER_THREAD]) +LoadDirectBlockedVectorized(int linear_tid, T* block_src_ptr, T (&dst_items)[ITEMS_PER_THREAD]) { - InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); + InternalLoadDirectBlockedVectorized(linear_tid, block_src_ptr, dst_items); } //! @} end member group @@ -293,43 +283,41 @@ LoadDirectBlockedVectorized(int linear_tid, T* block_ptr, T (&items)[ITEMS_PER_T //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **[inferred]** The random-access iterator type for input iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D +//! thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load -template +//! @param[out] dst_items +//! 
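As a rough illustration of the vectorization above: with ``T = int`` and four items per thread, ``total_words`` is 4 and ``vector_size`` is 4, so each thread issues a single 16-byte vector load. The public entry point can be exercised as sketched below (hypothetical kernel name; the base pointer must be aligned to the vector word, otherwise callers should prefer ``LoadDirectBlocked``):

.. code-block:: c++

    #include <cub/cub.cuh>

    __global__ void VectorizedBlockedLoadKernel(int* d_in)
    {
      constexpr int ITEMS_PER_THREAD = 4;
      int thread_data[ITEMS_PER_THREAD];

      // This overload only accepts raw pointers; within BlockLoad's BLOCK_LOAD_VECTORIZE
      // policy, non-pointer iterators fall back to the element-wise path.
      cub::LoadDirectBlockedVectorized(threadIdx.x, d_in, thread_data);
    }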
Destination to load data into +template _CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) +LoadDirectStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = block_itr[linear_tid + ITEM * BLOCK_THREADS]; + dst_items[i] = block_src_it[linear_tid + i * BLOCK_THREADS]; } } namespace detail { - -template +template _CCCL_DEVICE _CCCL_FORCEINLINE void load_transform_direct_striped( - int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], TransformOpT transform_op) + int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], TransformOpT transform_op) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = transform_op(block_itr[linear_tid + ITEM * BLOCK_THREADS]); + dst_items[i] = transform_op(block_src_it[linear_tid + i * BLOCK_THREADS]); } } - } // namespace detail //! @rst @@ -348,32 +336,32 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void load_transform_direct_striped( //! @tparam ITEMS_PER_THREAD //! **inferred** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT -//! **inferred** The random-access iterator type for input \iterator. +//! @tparam RandomAccessIterator +//! **inferred** The random-access iterator type for input iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + +//! linear_tid for 2D thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into //! -//! @param[in] valid_items +//! @param[in] block_items_end //! Number of valid items to load -//! -template -_CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) +template +_CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped( + int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - if (linear_tid + (ITEM * BLOCK_THREADS) < valid_items) + const auto src_pos = linear_tid + i * BLOCK_THREADS; + if (src_pos < block_items_end) { - items[ITEM] = block_itr[linear_tid + ITEM * BLOCK_THREADS]; + dst_items[i] = block_src_it[src_pos]; } } } @@ -395,35 +383,39 @@ LoadDirectStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEM //! @tparam ITEMS_PER_THREAD //! **inferred** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **inferred** The random-access iterator type for input \iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! 
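To make the striped indexing concrete: thread ``t`` of a 128-thread block receives items ``t``, ``t + 128``, ``t + 256``, and ``t + 384`` of its segment, so every iteration of the unrolled loop is a coalesced access. A minimal sketch (illustrative names):

.. code-block:: c++

    #include <cub/cub.cuh>

    __global__ void StripedLoadKernel(const int* d_in)
    {
      constexpr int BLOCK_THREADS    = 128;
      constexpr int ITEMS_PER_THREAD = 4;

      int thread_data[ITEMS_PER_THREAD];
      // BLOCK_THREADS is supplied explicitly; it is the stride between a thread's items.
      cub::LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in, thread_data);
      // For t = threadIdx.x: thread_data = { d_in[t], d_in[t + 128], d_in[t + 256], d_in[t + 384] }
    }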
A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + +//! linear_tid` for 2D thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into //! -//! @param[in] valid_items +//! @param[in] block_items_end //! Number of valid items to load //! //! @param[in] oob_default //! Default value to assign out-of-bound items -template +template _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped( - int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + int linear_tid, + RandomAccessIterator block_src_it, + T (&dst_items)[ITEMS_PER_THREAD], + int block_items_end, + DefaultT oob_default) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = oob_default; + dst_items[i] = oob_default; } - LoadDirectStriped(linear_tid, block_itr, items, valid_items); + LoadDirectStriped(linear_tid, block_src_it, dst_items, block_items_end); } //! @} end member group @@ -448,31 +440,31 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped( //! @tparam ITEMS_PER_THREAD //! **inferred** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **inferred** The random-access iterator type for input iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + +//! linear_tid` for 2D thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load -template +//! @param[out] dst_items +//! Destination to load data into +template _CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectWarpStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) +LoadDirectWarpStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); - int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; - int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + const int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + const int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + const int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; // Load directly in warp-striped order #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - new (&items[ITEM]) InputT(block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)]); + new (&dst_items[i]) T(block_src_it[warp_offset + tid + (i * CUB_PTX_WARP_THREADS)]); } } @@ -494,36 +486,37 @@ LoadDirectWarpStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ //! @tparam ITEMS_PER_THREAD //! **inferred** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **inferred** The random-access iterator type for input \iterator. //! //! @param[in] linear_tid -//! 
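The warp-striped variant partitions the tile per warp rather than per block: lane ``l`` of warp ``w`` reads item ``w * 32 * ITEMS_PER_THREAD + l + i * 32`` on iteration ``i``, so each warp consumes a contiguous slice with coalesced accesses. A brief sketch (illustrative kernel name, 32-thread warps assumed):

.. code-block:: c++

    #include <cub/cub.cuh>

    __global__ void WarpStripedLoadKernel(const int* d_in)
    {
      constexpr int ITEMS_PER_THREAD = 4;
      int thread_data[ITEMS_PER_THREAD];

      // Each warp reads a contiguous 32 * ITEMS_PER_THREAD slice of the block's tile.
      cub::LoadDirectWarpStriped(threadIdx.x, d_in, thread_data);
    }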
A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + +//! linear_tid` for 2D thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into //! -//! @param[in] valid_items +//! @param[in] block_items_end //! Number of valid items to load -template -_CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectWarpStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) +template +_CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectWarpStriped( + int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); - int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; - int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + const int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + const int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + const int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; // Load directly in warp-striped order #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items) + const auto src_pos = warp_offset + tid + (i * CUB_PTX_WARP_THREADS); + if (src_pos < block_items_end) { - new (&items[ITEM]) InputT(block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)]); + new (&dst_items[i]) T(block_src_it[src_pos]); } } } @@ -547,42 +540,46 @@ LoadDirectWarpStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ //! @tparam ITEMS_PER_THREAD //! **inferred** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **inferred** The random-access iterator type for input \iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + +//! linear_tid` for 2D thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into //! -//! @param[in] valid_items +//! @param[in] block_items_end //! Number of valid items to load //! //! @param[in] oob_default //! 
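The overload documented next additionally takes ``oob_default``: it first fills every slot with the default and then performs the guarded load, so downstream code always sees a defined value. A minimal sketch with a sentinel of ``-1`` (illustrative names):

.. code-block:: c++

    #include <cub/cub.cuh>

    __global__ void PaddedWarpStripedLoadKernel(const int* d_in, int num_items)
    {
      constexpr int BLOCK_THREADS    = 128;
      constexpr int ITEMS_PER_THREAD = 4;

      const int tile_base = blockIdx.x * BLOCK_THREADS * ITEMS_PER_THREAD;
      int thread_data[ITEMS_PER_THREAD];

      // In-range slots hold the corresponding input item; out-of-range slots come back as -1.
      cub::LoadDirectWarpStriped(threadIdx.x, d_in + tile_base, thread_data, num_items - tile_base, -1);
    }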
Default value to assign out-of-bound items -template +template _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectWarpStriped( - int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + int linear_tid, + RandomAccessIterator block_src_it, + T (&dst_items)[ITEMS_PER_THREAD], + int block_items_end, + DefaultT oob_default) { // Load directly in warp-striped order #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = oob_default; + dst_items[i] = oob_default; } - LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end); } //! @} end member group -//! @brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a -//! linear segment of data from memory into a blocked arrangement across a CUDA thread block. +//! @brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data +//! from memory into a blocked arrangement across a CUDA thread block. enum BlockLoadAlgorithm { //! @rst @@ -594,8 +591,8 @@ enum BlockLoadAlgorithm //! Performance Considerations //! ++++++++++++++++++++++++++ //! - //! The utilization of memory transactions (coalescing) decreases as the - //! access stride between threads increases (i.e., the number items per thread). + //! The utilization of memory transactions (coalescing) decreases as the access stride between threads increases + //! (i.e., the number items per thread). //! @endrst BLOCK_LOAD_DIRECT, @@ -608,8 +605,7 @@ enum BlockLoadAlgorithm //! Performance Considerations //! ++++++++++++++++++++++++++ //! - //! The utilization of memory transactions (coalescing) doesn't depend on - //! the number of items per thread. + //! The utilization of memory transactions (coalescing) doesn't depend on the number of items per thread. //! //! @endrst BLOCK_LOAD_STRIPED, @@ -618,22 +614,20 @@ enum BlockLoadAlgorithm //! Overview //! ++++++++++++++++++++++++++ //! - //! A :ref:`blocked arrangement ` of data is read - //! from memory using CUDA's built-in vectorized loads as a coalescing optimization. - //! For example, ``ld.global.v4.s32`` instructions will be generated + //! A :ref:`blocked arrangement ` of data is read from memory using CUDA's built-in + //! vectorized loads as a coalescing optimization. For example, ``ld.global.v4.s32`` instructions will be generated //! when ``T = int`` and ``ITEMS_PER_THREAD % 4 == 0``. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! - //! - The utilization of memory transactions (coalescing) remains high until the the - //! access stride between threads (i.e., the number items per thread) exceeds the - //! maximum vector load width (typically 4 items or 64B, whichever is lower). - //! - The following conditions will prevent vectorization and loading will fall - //! back to cub::BLOCK_LOAD_DIRECT: + //! - The utilization of memory transactions (coalescing) remains high until the the access stride between threads + //! (i.e., the number items per thread) exceeds the maximum vector load width (typically 4 items or 64B, whichever + //! is lower). + //! - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT: //! //! - ``ITEMS_PER_THREAD`` is odd - //! - The ``InputIteratorT`` is not a simple pointer type + //! - The ``RandomAccessIterator`` is not a simple pointer type //! 
- The block input offset is not quadword-aligned //! - The data type ``T`` is not a built-in primitive or CUDA vector type //! (e.g., ``short``, ``int2``, ``double``, ``float2``, etc.) @@ -645,16 +639,15 @@ enum BlockLoadAlgorithm //! Overview //! ++++++++++++++++++++++++++ //! - //! A :ref:`striped arrangement ` of data is read efficiently from memory and then - //! locally transposed into a :ref:`blocked arrangement `. + //! A :ref:`striped arrangement ` of data is read efficiently from memory and then locally + //! transposed into a :ref:`blocked arrangement `. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! - //! - The utilization of memory transactions (coalescing) remains high regardless - //! of items loaded per thread. - //! - The local reordering incurs slightly longer latencies and throughput than the - //! direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. + //! - The utilization of memory transactions (coalescing) remains high regardless of items loaded per thread. + //! - The local reordering incurs slightly longer latencies and throughput than the direct cub::BLOCK_LOAD_DIRECT and + //! cub::BLOCK_LOAD_VECTORIZE alternatives. //! //! @endrst BLOCK_LOAD_TRANSPOSE, @@ -675,8 +668,8 @@ enum BlockLoadAlgorithm //! ++++++++++++++++++++++++++ //! //! - The utilization of memory transactions (coalescing) remains high regardless of items loaded per thread. - //! - The local reordering incurs slightly larger latencies than the - //! direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. + //! - The local reordering incurs slightly larger latencies than the direct cub::BLOCK_LOAD_DIRECT and + //! cub::BLOCK_LOAD_VECTORIZE alternatives. //! - Provisions more shared storage, but incurs smaller latencies than the //! BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED alternative. //! @@ -687,10 +680,10 @@ enum BlockLoadAlgorithm //! Overview //! ++++++++++++++++++++++++++ //! - //! Like ``BLOCK_LOAD_WARP_TRANSPOSE``, a :ref:`warp-striped arrangement ` - //! of data is read directly from memory and then is locally transposed into a - //! :ref:`blocked arrangement `. To reduce the shared memory requirement, only one - //! warp's worth of shared memory is provisioned and is subsequently time-sliced among warps. + //! Like ``BLOCK_LOAD_WARP_TRANSPOSE``, a :ref:`warp-striped arrangement ` of data is read + //! directly from memory and then is locally transposed into a :ref:`blocked arrangement `. + //! To reduce the shared memory requirement, only one warp's worth of shared memory is provisioned and is subsequently + //! time-sliced among warps. //! //! Usage Considerations //! ++++++++++++++++++++++++++ @@ -700,10 +693,9 @@ enum BlockLoadAlgorithm //! Performance Considerations //! ++++++++++++++++++++++++++ //! - //! - The utilization of memory transactions (coalescing) remains high regardless - //! of items loaded per thread. - //! - Provisions less shared memory temporary storage, but incurs larger - //! latencies than the BLOCK_LOAD_WARP_TRANSPOSE alternative. + //! - The utilization of memory transactions (coalescing) remains high regardless of items loaded per thread. + //! - Provisions less shared memory temporary storage, but incurs larger latencies than the BLOCK_LOAD_WARP_TRANSPOSE + //! alternative. //! //! @endrst BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, @@ -711,15 +703,15 @@ enum BlockLoadAlgorithm //! @rst //! The BlockLoad class provides :ref:`collective ` data movement methods for loading a linear -//! 
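Conceptually, the transpose-based policies are a coalesced striped (or warp-striped) read followed by a shared-memory exchange. A hand-rolled equivalent of ``BLOCK_LOAD_TRANSPOSE`` looks roughly like the sketch below (illustrative kernel name); this mirrors what the corresponding internal load specialization does:

.. code-block:: c++

    #include <cub/cub.cuh>

    __global__ void ManualTransposeLoadKernel(const int* d_in)
    {
      constexpr int BLOCK_THREADS    = 128;
      constexpr int ITEMS_PER_THREAD = 4;
      using BlockExchange = cub::BlockExchange<int, BLOCK_THREADS, ITEMS_PER_THREAD>;
      __shared__ typename BlockExchange::TempStorage temp_storage;

      int thread_data[ITEMS_PER_THREAD];
      cub::LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in, thread_data);  // coalesced read
      BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data); // local reorder
      // thread_data is now blocked: thread t holds items 4*t .. 4*t+3 of the tile.
    }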
segment of items from memory into a :ref:`blocked arrangement ` across a -//! CUDA thread block. +//! segment of items from memory into a :ref:`blocked arrangement ` across a CUDA thread +//! block. //! //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! -//! - The BlockLoad class provides a single data movement abstraction that can be specialized -//! to implement different cub::BlockLoadAlgorithm strategies. This facilitates different -//! performance policies for different architectures, data types, granularity sizes, etc. +//! - The BlockLoad class provides a single data movement abstraction that can be specialized to implement different +//! cub::BlockLoadAlgorithm strategies. This facilitates different performance policies for different architectures, +//! data types, granularity sizes, etc. //! - BlockLoad can be optionally specialized by different data movement strategies: //! //! #. :cpp:enumerator:`cub::BLOCK_LOAD_DIRECT`: @@ -746,10 +738,9 @@ enum BlockLoadAlgorithm //! //! @blockcollective{BlockLoad} //! -//! The code snippet below illustrates the loading of a linear -//! segment of 512 integers into a "blocked" arrangement across 128 threads where each -//! thread owns 4 consecutive items. The load is specialized for ``BLOCK_LOAD_WARP_TRANSPOSE``, -//! meaning memory references are efficiently coalesced using a warp-striped access +//! The code snippet below illustrates the loading of a linear segment of 512 integers into a "blocked" arrangement +//! across 128 threads where each thread owns 4 consecutive items. The load is specialized for +//! ``BLOCK_LOAD_WARP_TRANSPOSE``, meaning memory references are efficiently coalesced using a warp-striped access //! pattern (after which items are locally reordered among threads). //! //! .. code-block:: c++ @@ -768,21 +759,20 @@ enum BlockLoadAlgorithm //! int thread_data[4]; //! BlockLoad(temp_storage).Load(d_data, thread_data); //! -//! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. -//! The set of ``thread_data`` across the block of threads in those threads will be -//! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``. +//! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. The set of ``thread_data`` across the block of threads in +//! those threads will be ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``. //! //! Re-using dynamically allocating shared memory //! +++++++++++++++++++++++++++++++++++++++++++++ //! -//! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of -//! dynamically shared memory with BlockReduce and how to re-purpose the same memory region. -//! This example can be easily adapted to the storage required by BlockLoad. +//! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically shared memory with +//! BlockReduce and how to re-purpose the same memory region. This example can be easily adapted to the storage required +//! by BlockLoad. //! //! @endrst //! -//! @tparam InputT -//! The data type to read into (which must be convertible from the input iterator's value type). +//! @tparam T +// The data type to read into (which must be convertible from the input iterator's value type). //! //! @tparam BLOCK_DIM_X //! The thread block length in threads along the X dimension @@ -793,20 +783,15 @@ enum BlockLoadAlgorithm //! @tparam ALGORITHM //! **[optional]** cub::BlockLoadAlgorithm tuning policy. default: ``cub::BLOCK_LOAD_DIRECT``. //! -//! @tparam WARP_TIME_SLICING -//! 
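Putting the pieces together, the following self-contained sketch shows a guarded, padded BlockLoad over an arbitrarily sized input, including the host-side grid sizing that the snippets in this header leave implicit (all names here are illustrative):

.. code-block:: c++

    #include <cub/cub.cuh>

    constexpr int BLOCK_THREADS    = 128;
    constexpr int ITEMS_PER_THREAD = 4;
    constexpr int TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD;

    __global__ void TileLoadKernel(const int* d_in, int num_items)
    {
      using BlockLoad = cub::BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
      __shared__ typename BlockLoad::TempStorage temp_storage;

      const int tile_base = blockIdx.x * TILE_ITEMS;
      int thread_data[ITEMS_PER_THREAD];

      // Out-of-range slots are filled with 0, so downstream code needs no bounds checks.
      BlockLoad(temp_storage).Load(d_in + tile_base, thread_data, num_items - tile_base, 0);
      // ... consume thread_data ...
    }

    void LaunchTileLoad(const int* d_in, int num_items)
    {
      const int grid_size = (num_items + TILE_ITEMS - 1) / TILE_ITEMS; // one block per tile
      TileLoadKernel<<<grid_size, BLOCK_THREADS>>>(d_in, num_items);
    }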
**[optional]** Whether or not only one warp's worth of shared memory should be -//! allocated and time-sliced among block-warps during any load-related data transpositions -//! (versus each warp having its own storage). (default: false) -//! //! @tparam BLOCK_DIM_Y //! **[optional]** The thread block length in threads along the Y dimension (default: 1) //! //! @tparam BLOCK_DIM_Z -//! **[optional]** The thread block length in threads along the Z dimension (default: 1) +//! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! //! @tparam LEGACY_PTX_ARCH -//! **[optional]** Unused. -template class BlockLoad { -private: - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - }; + static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; // total threads in the block - /// Load helper template - struct LoadInternal; + struct LoadInternal; // helper to dispatch the load algorithm - /** - * BLOCK_LOAD_DIRECT specialization of load helper - */ template struct LoadInternal { - /// Shared memory storage layout type using TempStorage = NullType; - - /// Linear thread-id int linear_tid; - /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} - /** - * @brief Load a linear segment of items from memory - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - LoadDirectBlocked(linear_tid, block_itr, items); + LoadDirectBlocked(linear_tid, block_src_it, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - LoadDirectBlocked(linear_tid, block_itr, items, valid_items); + LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end); } - /** - * @brief Load a linear segment of items from memory, guarded by range, with a fall-back - * assignment of out-of-bound elements - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - * - * @param[in] oob_default - * Default value to assign out-of-bound items - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); + LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end, oob_default); } }; - /** - * BLOCK_LOAD_STRIPED specialization of load helper - */ template struct LoadInternal { - /// Shared memory 
storage layout type using TempStorage = NullType; - - /// Linear thread-id int linear_tid; - /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} - /** - * @brief Load a linear segment of items from memory - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - LoadDirectStriped(linear_tid, block_itr, items); + LoadDirectStriped(linear_tid, block_src_it, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - LoadDirectStriped(linear_tid, block_itr, items, valid_items); + LoadDirectStriped(linear_tid, block_src_it, dst_items, block_items_end); } - /** - * @brief Load a linear segment of items from memory, guarded by range, with a fall-back - * assignment of out-of-bound elements - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - * - * @param[in] oob_default - * Default value to assign out-of-bound items - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); + LoadDirectStriped(linear_tid, block_src_it, dst_items, block_items_end, oob_default); } }; - /** - * BLOCK_LOAD_VECTORIZE specialization of load helper - */ template struct LoadInternal { - /// Shared memory storage layout type using TempStorage = NullType; - - /// Linear thread-id int linear_tid; - /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} - /** - * @brief Load a linear segment of items from memory, specialized for native pointer types - * (attempts vectorization) - * - * @param[in] block_ptr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputT* block_ptr, InputT (&items)[ITEMS_PER_THREAD]) + // attempts vectorization (pointer) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(const T* block_ptr, T (&dst_items)[ITEMS_PER_THREAD]) { - InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); + InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, dst_items); } - /** - * @brief Load a linear segment of items from memory, specialized for native pointer types - * (attempts vectorization) - * - * @param[in] block_ptr - * The thread block's base input iterator for loading from - * - * 
@param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(const InputT* block_ptr, InputT (&items)[ITEMS_PER_THREAD]) + // any other iterator, no vectorization + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); + LoadDirectBlocked(linear_tid, block_src_it, dst_items); } - /** - * @brief Load a linear segment of items from memory, specialized for native pointer types - * (attempts vectorization) - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ + // attempts vectorization (cache modified iterator) template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(CacheModifiedInputIterator block_itr, InputT (&items)[ITEMS_PER_THREAD]) + Load(CacheModifiedInputIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - InternalLoadDirectBlockedVectorized(linear_tid, block_itr.ptr, items); + InternalLoadDirectBlockedVectorized(linear_tid, block_src_it.ptr, dst_items); } - /** - * @brief Load a linear segment of items from memory, specialized for opaque input iterators - * (skips vectorization) - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(_InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) - { - LoadDirectBlocked(linear_tid, block_itr, items); - } - - /** - * @brief Load a linear segment of items from memory, guarded by range (skips vectorization) - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - */ - template + // skips vectorization + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - LoadDirectBlocked(linear_tid, block_itr, items, valid_items); + LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end); } - /** - * @brief Load a linear segment of items from memory, guarded by range, with a fall-back - * assignment of out-of-bound elements (skips vectorization) - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - * - * @param[in] oob_default - * Default value to assign out-of-bound items - */ - template + // skips vectorization + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); + LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end, oob_default); } }; - /** - * BLOCK_LOAD_TRANSPOSE specialization of load helper - */ template struct LoadInternal { - // BlockExchange utility type for keys - using BlockExchange = BlockExchange; - - /// Shared memory storage layout type - struct _TempStorage : BlockExchange::TempStorage - {}; + using BlockExchange = BlockExchange; + using _TempStorage = typename 
BlockExchange::TempStorage; + using TempStorage = Uninitialized<_TempStorage>; - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> - {}; - - /// Thread reference to shared storage _TempStorage& temp_storage; - - /// Linear thread-id int linear_tid; - /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()) , linear_tid(linear_tid) {} - /** - * @brief Load a linear segment of items from memory - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - LoadDirectStriped(linear_tid, block_itr, items); - BlockExchange(temp_storage).StripedToBlocked(items, items); + LoadDirectStriped(linear_tid, block_src_it, dst_items); + BlockExchange(temp_storage).StripedToBlocked(dst_items, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - LoadDirectStriped(linear_tid, block_itr, items, valid_items); - BlockExchange(temp_storage).StripedToBlocked(items, items); + LoadDirectStriped(linear_tid, block_src_it, dst_items, block_items_end); + BlockExchange(temp_storage).StripedToBlocked(dst_items, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range, with a fall-back - * assignment of out-of-bound elements - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - * - * @param[in] oob_default - * Default value to assign out-of-bound items - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); - BlockExchange(temp_storage).StripedToBlocked(items, items); + LoadDirectStriped(linear_tid, block_src_it, dst_items, block_items_end, oob_default); + BlockExchange(temp_storage).StripedToBlocked(dst_items, dst_items); } }; - /** - * BLOCK_LOAD_WARP_TRANSPOSE specialization of load helper - */ template struct LoadInternal { - enum - { - WARP_THREADS = CUB_WARP_THREADS(0) - }; - - // Assert BLOCK_THREADS must be a multiple of WARP_THREADS - static_assert(int(BLOCK_THREADS) % int(WARP_THREADS) == 0, "BLOCK_THREADS must be a multiple of WARP_THREADS"); + static constexpr int WARP_THREADS = CUB_WARP_THREADS(0); + static_assert(BLOCK_THREADS % WARP_THREADS == 0, "BLOCK_THREADS must be a multiple of WARP_THREADS"); - // BlockExchange utility type for keys - using BlockExchange = BlockExchange; + using BlockExchange = 
BlockExchange; + using _TempStorage = typename BlockExchange::TempStorage; + using TempStorage = Uninitialized<_TempStorage>; - /// Shared memory storage layout type - struct _TempStorage : BlockExchange::TempStorage - {}; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> - {}; - - /// Thread reference to shared storage _TempStorage& temp_storage; - - /// Linear thread-id int linear_tid; - /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()) , linear_tid(linear_tid) {} - /** - * @brief Load a linear segment of items from memory - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - LoadDirectWarpStriped(linear_tid, block_itr, items); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items); + BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end); + BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range, with a fall-back - * assignment of out-of-bound elements - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - * - * @param[in] oob_default - * Default value to assign out-of-bound items - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end, oob_default); + BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items); } }; - /** - * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED specialization of load helper - */ template struct LoadInternal { - enum - { - WARP_THREADS = CUB_WARP_THREADS(0) - }; + static constexpr int WARP_THREADS = CUB_WARP_THREADS(0); + static_assert(BLOCK_THREADS % WARP_THREADS == 0, "BLOCK_THREADS must be a multiple of WARP_THREADS"); - // Assert BLOCK_THREADS must be a multiple of WARP_THREADS - 
static_assert(int(BLOCK_THREADS) % int(WARP_THREADS) == 0, "BLOCK_THREADS must be a multiple of WARP_THREADS"); + using BlockExchange = BlockExchange; + using _TempStorage = typename BlockExchange::TempStorage; + using TempStorage = Uninitialized<_TempStorage>; - // BlockExchange utility type for keys - using BlockExchange = BlockExchange; - - /// Shared memory storage layout type - struct _TempStorage : BlockExchange::TempStorage - {}; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> - {}; - - /// Thread reference to shared storage _TempStorage& temp_storage; - - /// Linear thread-id int linear_tid; - /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()) , linear_tid(linear_tid) {} - /** - * @brief Load a linear segment of items from memory - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - LoadDirectWarpStriped(linear_tid, block_itr, items); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items); + BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end); + BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range, with a fall-back - * assignment of out-of-bound elements - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - * - * @param[in] oob_default - * Default value to assign out-of-bound items - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end, oob_default); + BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items); } }; - /// Internal load implementation to use - using InternalLoad = LoadInternal; - - /// Shared memory storage layout type + using InternalLoad = LoadInternal; // load implementation to use using 
_TempStorage = typename InternalLoad::TempStorage; - /// Internal storage allocator + // Internal storage allocator _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } - /// Thread reference to shared storage _TempStorage& temp_storage; - - /// Linear thread-id int linear_tid; public: /// @smemstorage{BlockLoad} - struct TempStorage : Uninitialized<_TempStorage> - {}; + using TempStorage = Uninitialized<_TempStorage>; //! @name Collective constructors //! @{ - /** - * @brief Collective constructor using a private static allocation of shared memory as temporary - * storage. - */ + /// @brief Collective constructor using a private static allocation of shared memory as temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE BlockLoad() : temp_storage(PrivateStorage()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - /** - * @brief Collective constructor using the specified memory allocation as temporary storage. - * - * @param[in] temp_storage - * Reference to memory allocation having layout type TempStorage - */ + /// @brief Collective constructor using the specified memory allocation as temporary storage. + /// @param[in] temp_storage Reference to memory allocation having layout type TempStorage _CCCL_DEVICE _CCCL_FORCEINLINE BlockLoad(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) @@ -1448,10 +1085,9 @@ public: //! Snippet //! +++++++ //! - //! The code snippet below illustrates the loading of a linear - //! segment of 512 integers into a "blocked" arrangement across 128 threads where each - //! thread owns 4 consecutive items. The load is specialized for ``BLOCK_LOAD_WARP_TRANSPOSE``, - //! meaning memory references are efficiently coalesced using a warp-striped access + //! The code snippet below illustrates the loading of a linear segment of 512 integers into a "blocked" arrangement + //! across 128 threads where each thread owns 4 consecutive items. The load is specialized for + //! ``BLOCK_LOAD_WARP_TRANSPOSE``, meaning memory references are efficiently coalesced using a warp-striped access //! pattern (after which items are locally reordered among threads). //! //! .. code-block:: c++ @@ -1470,21 +1106,20 @@ public: //! int thread_data[4]; //! BlockLoad(temp_storage).Load(d_data, thread_data); //! - //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. - //! The set of ``thread_data`` across the block of threads in those threads will be - //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``. + //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. The set of ``thread_data`` across the block of threads + //! in those threads will be ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``. //! //! @endrst //! - //! @param[in] block_itr - //! The thread block's base input iterator for loading from + //! @param[in] block_src_it + //! The thread block's base iterator for loading from //! - //! @param[out] items - //! Data to load - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) + //! @param[out] dst_items + //! Destination to load data into + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - InternalLoad(temp_storage, linear_tid).Load(block_itr, items); + InternalLoad(temp_storage, linear_tid).Load(block_src_it, dst_items); } //! 
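The two collective constructors differ only in where ``TempStorage`` lives. When the storage is carved out of dynamically allocated shared memory, as in the ``block/example_block_reduce_dyn_smem.cu`` example referenced earlier, it can be handed to the second constructor roughly as follows (sketch with illustrative names; the kernel must be launched with at least ``sizeof(TempStorage)`` bytes of dynamic shared memory):

.. code-block:: c++

    #include <cub/cub.cuh>

    __global__ void DynSmemLoadKernel(const int* d_in)
    {
      using BlockLoad = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE>;

      // Launched as DynSmemLoadKernel<<<grid, 128, sizeof(typename BlockLoad::TempStorage)>>>(...).
      extern __shared__ char smem[];
      auto& temp_storage = reinterpret_cast<typename BlockLoad::TempStorage&>(*smem);

      int thread_data[4];
      BlockLoad(temp_storage).Load(d_in, thread_data);

      // After a __syncthreads(), the same smem region can be re-purposed by another collective.
    }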
@rst @@ -1497,17 +1132,16 @@ public: //! Snippet //! +++++++ //! - //! The code snippet below illustrates the guarded loading of a linear - //! segment of 512 integers into a "blocked" arrangement across 128 threads where each - //! thread owns 4 consecutive items. The load is specialized for ``BLOCK_LOAD_WARP_TRANSPOSE``, - //! meaning memory references are efficiently coalesced using a warp-striped access + //! The code snippet below illustrates the guarded loading of a linear segment of 512 integers into a "blocked" + //! arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for + //! ``BLOCK_LOAD_WARP_TRANSPOSE``, meaning memory references are efficiently coalesced using a warp-striped access //! pattern (after which items are locally reordered among threads). //! //! .. code-block:: c++ //! //! #include // or equivalently //! - //! __global__ void ExampleKernel(int *d_data, int valid_items, ...) + //! __global__ void ExampleKernel(int *d_data, int block_items_end, ...) //! { //! // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each //! using BlockLoad = cub::BlockLoad; @@ -1517,32 +1151,32 @@ public: //! //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; - //! BlockLoad(temp_storage).Load(d_data, thread_data, valid_items); + //! BlockLoad(temp_storage).Load(d_data, thread_data, block_items_end); //! - //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...`` and ``valid_items`` is ``5``. - //! The set of ``thread_data`` across the block of threads in those threads will be - //! ``{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }``, with only the first two threads - //! being unmasked to load portions of valid data (and other items remaining unassigned). + //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...`` and ``block_items_end`` is ``5``. The set of + //! ``thread_data`` across the block of threads in those threads will be ``{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }``, + //! with only the first two threads being unmasked to load portions of valid data (and other items remaining + //! unassigned). //! //! @endrst //! - //! @param[in] block_itr - //! The thread block's base input iterator for loading from + //! @param[in] block_src_it + //! The thread block's base iterator for loading from //! - //! @param[out] items - //! Data to load + //! @param[out] dst_items + //! Destination to load data into //! - //! @param[in] valid_items + //! @param[in] block_items_end //! Number of valid items to load - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items); + InternalLoad(temp_storage, linear_tid).Load(block_src_it, dst_items, block_items_end); } //! @rst - //! Load a linear segment of items from memory, guarded by range, with a fall-back - //! assignment of out-of-bound elements + //! Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements //! //! - @blocked //! - @smemreuse @@ -1550,17 +1184,16 @@ public: //! Snippet //! +++++++ //! - //! The code snippet below illustrates the guarded loading of a linear - //! segment of 512 integers into a "blocked" arrangement across 128 threads where each - //! 
thread owns 4 consecutive items. The load is specialized for ``BLOCK_LOAD_WARP_TRANSPOSE``, - //! meaning memory references are efficiently coalesced using a warp-striped access + //! The code snippet below illustrates the guarded loading of a linear segment of 512 integers into a "blocked" + //! arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for + //! ``BLOCK_LOAD_WARP_TRANSPOSE``, meaning memory references are efficiently coalesced using a warp-striped access //! pattern (after which items are locally reordered among threads). //! //! .. code-block:: c++ //! //! #include // or equivalently //! - //! __global__ void ExampleKernel(int *d_data, int valid_items, ...) + //! __global__ void ExampleKernel(int *d_data, int block_items_end, ...) //! { //! // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each //! using BlockLoad = cub::BlockLoad; @@ -1570,35 +1203,34 @@ public: //! //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; - //! BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1); + //! BlockLoad(temp_storage).Load(d_data, thread_data, block_items_end, -1); //! - //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...`` - //! ``valid_items`` is ``5``, and the out-of-bounds default is ``-1``. - //! The set of ``thread_data`` across the block of threads in those threads will be - //! ``{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }``, with only the first two threads - //! being unmasked to load portions of valid data (and other items are assigned ``-1``) + //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...``, ``block_items_end`` is ``5``, and the out-of-bounds + //! default is ``-1``. The set of ``thread_data`` across the block of threads in those threads will be + //! ``{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }``, with only the first two threads being unmasked to load + //! portions of valid data (and other items are assigned ``-1``) //! //! @endrst //! - //! @param[in] block_itr - //! The thread block's base input iterator for loading from + //! @param[in] block_src_it + //! The thread block's base iterator for loading from //! - //! @param[out] items - //! Data to load + //! @param[out] dst_items + //! Destination to load data into //! - //! @param[in] valid_items + //! @param[in] block_items_end //! Number of valid items to load //! //! @param[in] oob_default //! Default value to assign out-of-bound items - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default); + InternalLoad(temp_storage, linear_tid).Load(block_src_it, dst_items, block_items_end, oob_default); } - //@} end member group + //! 
@} end member group }; template > diff --git a/cub/cub/detail/uninitialized_copy.cuh b/cub/cub/detail/uninitialized_copy.cuh index 9a3f01e2c0..326826c0d1 100644 --- a/cub/cub/detail/uninitialized_copy.cuh +++ b/cub/cub/detail/uninitialized_copy.cuh @@ -58,6 +58,7 @@ template ::value, int>::type = 0> _CCCL_HOST_DEVICE void uninitialized_copy_single(T* ptr, U&& val) { + // gevtushenko: placement new should work here as well, but the code generated for copy assignment is sometimes better *ptr = ::cuda::std::forward(val); } diff --git a/cub/cub/thread/thread_load.cuh b/cub/cub/thread/thread_load.cuh index a572fa5056..3db8d73031 100644 --- a/cub/cub/thread/thread_load.cuh +++ b/cub/cub/thread/thread_load.cuh @@ -102,11 +102,11 @@ enum CacheLoadModifier * @tparam MODIFIER * [inferred] CacheLoadModifier enumeration * - * @tparam InputIteratorT - * [inferred] Input iterator type \iterator + * @tparam RandomAccessIterator + * [inferred] The input's iterator type \iterator */ -template -_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t ThreadLoad(InputIteratorT itr); +template +_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t ThreadLoad(RandomAccessIterator itr); //@} end member group @@ -125,9 +125,9 @@ struct IterateThreadLoad IterateThreadLoad::template Load(ptr, vals); } - template + template CUB_DEPRECATED_BECAUSE("Use UnrolledCopy() instead") - static _CCCL_DEVICE _CCCL_FORCEINLINE void Dereference(InputIteratorT itr, T* vals) + static _CCCL_DEVICE _CCCL_FORCEINLINE void Dereference(RandomAccessIterator itr, T* vals) { vals[COUNT] = itr[COUNT]; IterateThreadLoad::Dereference(itr, vals); @@ -142,8 +142,8 @@ struct IterateThreadLoad static _CCCL_DEVICE _CCCL_FORCEINLINE void Load(T const* /*ptr*/, T* /*vals*/) {} - template - static _CCCL_DEVICE _CCCL_FORCEINLINE void Dereference(InputIteratorT /*itr*/, T* /*vals*/) + template + static _CCCL_DEVICE _CCCL_FORCEINLINE void Dereference(RandomAccessIterator /*itr*/, T* /*vals*/) {} }; @@ -309,9 +309,9 @@ _CUB_LOAD_ALL(LOAD_LDG, global.nc) /** * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types */ -template -_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t -ThreadLoad(InputIteratorT itr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) +template +_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t +ThreadLoad(RandomAccessIterator itr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { return *itr; } @@ -320,7 +320,8 @@ ThreadLoad(InputIteratorT itr, Int2Type /*modifier*/, Int2Type -_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoad(T* ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) +_CCCL_DEVICE _CCCL_FORCEINLINE T +ThreadLoad(const T* ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { return *ptr; } @@ -329,9 +330,9 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoad(T* ptr, Int2Type /*mod * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types */ template -_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(T* ptr, Int2Type /*is_primitive*/) +_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(const T* ptr, Int2Type /*is_primitive*/) { - T retval = *reinterpret_cast(ptr); + T retval = *reinterpret_cast(ptr); return retval; } @@ -339,16 +340,15 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(T* ptr, Int2Type -_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(T* ptr, Int2Type /*is_primitive*/) +_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(const T* ptr, Int2Type /*is_primitive*/) { - // Word type for memcopying - using VolatileWord = typename 
UnitWord::VolatileWord; - + // Word type for memcpying + using VolatileWord = typename UnitWord::VolatileWord; constexpr int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); T retval; VolatileWord* words = reinterpret_cast(&retval); - UnrolledCopy(reinterpret_cast(ptr), words); + UnrolledCopy(reinterpret_cast(ptr), words); return retval; } @@ -356,9 +356,9 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(T* ptr, Int2Type -_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoad(T* ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) +_CCCL_DEVICE _CCCL_FORCEINLINE T +ThreadLoad(const T* ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { - // Apply tags for partial-specialization return ThreadLoadVolatilePointer(ptr, Int2Type::PRIMITIVE>()); } @@ -368,25 +368,18 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoad(T* ptr, Int2Type /*mo template _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoad(T const* ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { - using DeviceWord = typename UnitWord::DeviceWord; - + using DeviceWord = typename UnitWord::DeviceWord; constexpr int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); DeviceWord words[DEVICE_MULTIPLE]; - UnrolledThreadLoad( - reinterpret_cast(const_cast(ptr)), words); - + UnrolledThreadLoad(reinterpret_cast(ptr), words); return *reinterpret_cast(words); } -/** - * ThreadLoad definition for generic modifiers - */ -template -_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t ThreadLoad(InputIteratorT itr) +template +_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t ThreadLoad(RandomAccessIterator itr) { - // Apply tags for partial-specialization - return ThreadLoad(itr, Int2Type(), Int2Type<::cuda::std::is_pointer::value>()); + return ThreadLoad(itr, Int2Type(), Int2Type<::cuda::std::is_pointer::value>()); } #endif // DOXYGEN_SHOULD_SKIP_THIS
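For reference, this dispatching entry point is what user code reaches either through a direct ``ThreadLoad`` call or through a ``CacheModifiedInputIterator`` wrapping a raw pointer; a minimal sketch of both (illustrative kernel name):

.. code-block:: c++

    #include <cub/cub.cuh>

    __global__ void ThreadLoadExampleKernel(const int* d_in, int* d_out)
    {
      // Direct call: load through the read-only data cache.
      const int a = cub::ThreadLoad<cub::LOAD_LDG>(d_in + threadIdx.x);

      // The same modifier applied through an iterator, which also composes with BlockLoad.
      cub::CacheModifiedInputIterator<cub::LOAD_LDG, int> ldg_it(d_in);
      const int b = ldg_it[threadIdx.x];

      d_out[threadIdx.x] = a + b;
    }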