diff --git a/cub/cub/block/block_exchange.cuh b/cub/cub/block/block_exchange.cuh index 256c7fb488..a781d68e68 100644 --- a/cub/cub/block/block_exchange.cuh +++ b/cub/cub/block/block_exchange.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2024, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,8 +26,9 @@ * ******************************************************************************/ -//! @file The cub::BlockExchange class provides :ref:`collective ` methods for -//! rearranging data partitioned across a CUDA thread block. +//! @file +//! The cub::BlockExchange class provides :ref:`collective ` methods for +//! rearranging data partitioned across a CUDA thread block. #pragma once @@ -55,11 +56,10 @@ CUB_NAMESPACE_BEGIN //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! -//! - It is commonplace for blocks of threads to rearrange data items between -//! threads. For example, the device-accessible memory subsystem prefers access patterns -//! where data items are "striped" across threads (where consecutive threads access consecutive items), -//! yet most block-wide operations prefer a "blocked" partitioning of items across threads -//! (where consecutive items belong to a single thread). +//! - It is commonplace for blocks of threads to rearrange data items between threads. For example, the +//! device-accessible memory subsystem prefers access patterns where data items are "striped" across threads (where +//! consecutive threads access consecutive items), yet most block-wide operations prefer a "blocked" partitioning of +//! items across threads (where consecutive items belong to a single thread). //! - BlockExchange supports the following types of data exchanges: //! //! - Transposing between :ref:`blocked ` and :ref:`striped ` @@ -76,8 +76,8 @@ CUB_NAMESPACE_BEGIN //! //! @blockcollective{BlockExchange} //! -//! The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement -//! of 512 integer items partitioned across 128 threads where each thread owns 4 items. +//! The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement of 512 integer items +//! partitioned across 128 threads where each thread owns 4 items. //! //! .. code-block:: c++ //! @@ -98,9 +98,8 @@ CUB_NAMESPACE_BEGIN //! // Collectively exchange data into a blocked arrangement across threads //! BlockExchange(temp_storage).StripedToBlocked(thread_data); //! -//! Suppose the set of striped input ``thread_data`` across the block of threads is -//! ``{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }``. -//! The corresponding output ``thread_data`` in those threads will be +//! Suppose the set of striped input ``thread_data`` across the block of threads is ``{ [0,128,256,384], +//! [1,129,257,385], ..., [127,255,383,511] }``. The corresponding output ``thread_data`` in those threads will be //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. //! //! Performance Considerations @@ -112,33 +111,33 @@ CUB_NAMESPACE_BEGIN //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! 
The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically shared memory with -//! BlockReduce and how to re-purpose the same memory region. This example can be easily adapted to -//! the storage required by BlockExchange. +//! BlockReduce and how to re-purpose the same memory region. This example can be easily adapted to the storage required +//! by BlockExchange. //! @endrst //! //! @tparam T -//! The data type to be exchanged +//! The data type to be exchanged //! //! @tparam BLOCK_DIM_X -//! The thread block length in threads along the X dimension +//! The thread block length in threads along the X dimension //! //! @tparam ITEMS_PER_THREAD -//! The number of items partitioned onto each thread. +//! The number of items partitioned onto each thread. //! //! @tparam WARP_TIME_SLICING -//! **[optional]** When `true`, only use enough shared memory for a single warp's worth of tile data, -//! time-slicing the block-wide exchange over multiple synchronized rounds. -//! Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false) +//! **[optional]** When `true`, only use enough shared memory for a single warp's worth of +//! tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint +//! at the expense of decreased parallelism. (Default: false) //! //! @tparam BLOCK_DIM_Y -//! **[optional]** The thread block length in threads along the Y dimension (default: 1) +//! **[optional]** The thread block length in threads along the Y dimension (default: 1) //! //! @tparam BLOCK_DIM_Z -//! **[optional]** The thread block length in threads along the Z dimension (default: 1) +//! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! //! @tparam LEGACY_PTX_ARCH -//! [optional] Unused. -template [optional] Unused. +template class BlockExchange { -private: - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - - LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0), - WARP_THREADS = 1 << LOG_WARP_THREADS, - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - - LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(0), - SMEM_BANKS = 1 << LOG_SMEM_BANKS, - - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - - TIME_SLICES = (WARP_TIME_SLICING) ? WARPS : 1, - - TIME_SLICED_THREADS = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS, - TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD, - - WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS), - WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD, - - // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise - // we can typically use 128b loads) - INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo::VALUE), - PADDING_ITEMS = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0, - }; + static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; ///< The thread block size in threads + static constexpr int WARP_THREADS = CUB_WARP_THREADS(0); + static constexpr int WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS; // TODO(bgruber): use ceil_div in + // C++14 + static constexpr int LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(0); + + static constexpr int TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD; + static constexpr int TIME_SLICES = WARP_TIME_SLICING ? 
WARPS : 1; + static constexpr int TIME_SLICED_THREADS = WARP_TIME_SLICING ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS; + static constexpr int TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD; + static constexpr int WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS); + static constexpr int WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD; + + // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise + // we can typically use 128b loads) + static constexpr bool INSERT_PADDING = ITEMS_PER_THREAD > 4 && PowerOfTwo::VALUE; + static constexpr int PADDING_ITEMS = INSERT_PADDING ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0; /// Shared memory storage layout type - struct __align__(16) _TempStorage + struct alignas(16) _TempStorage { - InputT buff[TIME_SLICED_ITEMS + PADDING_ITEMS]; + T buff[TIME_SLICED_ITEMS + PADDING_ITEMS]; }; public: /// @smemstorage{BlockExchange} - struct TempStorage : Uninitialized<_TempStorage> - {}; + using TempStorage = Uninitialized<_TempStorage>; private: - /// Shared storage reference _TempStorage& temp_storage; - /// Linear thread-id - unsigned int linear_tid; - unsigned int lane_id; - unsigned int warp_id; - unsigned int warp_offset; + // TODO(bgruber): can we use signed int here? Only these variables are unsigned: + unsigned int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z); + unsigned int lane_id = LaneId(); + unsigned int warp_id = WARPS == 1 ? 0 : linear_tid / WARP_THREADS; + unsigned int warp_offset = warp_id * WARP_TIME_SLICED_ITEMS; /// Internal storage allocator _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() @@ -205,8 +190,8 @@ private: return private_storage; } - //! @brief Transposes data items from **blocked** arrangement to **striped** arrangement. - //! Specialized for no timeslicing. + //! @brief Transposes data items from **blocked** arrangement to **striped** arrangement. Specialized for no + //! timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -215,35 +200,37 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) + int item_offset = linear_tid * ITEMS_PER_THREAD + i; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) + int item_offset = i * BLOCK_THREADS + linear_tid; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } - //! 
@brief Transposes data items from **blocked** arrangement to **striped** - //! arrangement. Specialized for warp-timeslicing. + //! @brief Transposes data items from **blocked** arrangement to **striped** arrangement. Specialized for + //! warp-timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -252,51 +239,51 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { - InputT temp_items[ITEMS_PER_THREAD]; + T temp_items[ITEMS_PER_THREAD]; #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + for (int slice = 0; slice < TIME_SLICES; slice++) { - const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; - const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + const int slice_offset = slice * TIME_SLICED_ITEMS; + const int slice_oob = slice_offset + TIME_SLICED_ITEMS; CTA_SYNC(); - if (warp_id == SLICE) + if (warp_id == slice) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) + int item_offset = lane_id * ITEMS_PER_THREAD + i; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { // Read a strip of items - const int STRIP_OFFSET = ITEM * BLOCK_THREADS; - const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + const int strip_offset = i * BLOCK_THREADS; + const int strip_oob = strip_offset + BLOCK_THREADS; - if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + if (slice_offset < strip_oob && slice_oob > strip_offset) { - int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + int item_offset = strip_offset + linear_tid - slice_offset; + if (item_offset >= 0 && item_offset < TIME_SLICED_ITEMS) { - if (INSERT_PADDING) + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - temp_items[ITEM] = temp_storage.buff[item_offset]; + temp_items[i] = temp_storage.buff[item_offset]; } } } @@ -304,14 +291,14 @@ private: // Copy #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - output_items[ITEM] = temp_items[ITEM]; + output_items[i] = temp_items[i]; } } - //! @brief Transposes data items from **blocked** arrangement to **warp-striped** arrangement. - //! Specialized for no timeslicing + //! @brief Transposes data items from **blocked** arrangement to **warp-striped** arrangement. Specialized for no + //! timeslicing //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -320,35 +307,37 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. 
template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) + int item_offset = warp_offset + i + (lane_id * ITEMS_PER_THREAD); + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } WARP_SYNC(0xffffffff); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) + int item_offset = warp_offset + (i * WARP_TIME_SLICED_THREADS) + lane_id; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } - //! @brief Transposes data items from **blocked** arrangement to **warp-striped** arrangement. - //! Specialized for warp-timeslicing + //! @brief Transposes data items from **blocked** arrangement to **warp-striped** arrangement. Specialized for + //! warp-timeslicing //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -357,71 +346,71 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. 
template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { if (warp_id == 0) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) + int item_offset = i + lane_id * ITEMS_PER_THREAD; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } WARP_SYNC(0xffffffff); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) + int item_offset = i * WARP_TIME_SLICED_THREADS + lane_id; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } #pragma unroll - for (unsigned int SLICE = 1; SLICE < TIME_SLICES; ++SLICE) + for (int slice = 1; slice < TIME_SLICES; ++slice) { CTA_SYNC(); - if (warp_id == SLICE) + if (warp_id == slice) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) + int item_offset = i + lane_id * ITEMS_PER_THREAD; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } WARP_SYNC(0xffffffff); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) + int item_offset = i * WARP_TIME_SLICED_THREADS + lane_id; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } } } - //! @brief Transposes data items from **striped** arrangement to **blocked** arrangement. - //! Specialized for no timeslicing. + //! @brief Transposes data items from **striped** arrangement to **blocked** arrangement. Specialized for no + //! timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -430,36 +419,38 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. 
template _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) + int item_offset = i * BLOCK_THREADS + linear_tid; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } CTA_SYNC(); // No timeslicing #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) + int item_offset = linear_tid * ITEMS_PER_THREAD + i; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } - //! @brief Transposes data items from **striped** arrangement to **blocked** arrangement. - //! Specialized for warp-timeslicing. + //! @brief Transposes data items from **striped** arrangement to **blocked** arrangement. Specialized for + //! warp-timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -468,67 +459,67 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. 
template _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { // Warp time-slicing - InputT temp_items[ITEMS_PER_THREAD]; + T temp_items[ITEMS_PER_THREAD]; #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + for (int slice = 0; slice < TIME_SLICES; slice++) { - const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; - const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + const int slice_offset = slice * TIME_SLICED_ITEMS; + const int slice_oob = slice_offset + TIME_SLICED_ITEMS; CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { // Write a strip of items - const int STRIP_OFFSET = ITEM * BLOCK_THREADS; - const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + const int strip_offset = i * BLOCK_THREADS; + const int strip_oob = strip_offset + BLOCK_THREADS; - if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + if (slice_offset < strip_oob && slice_oob > strip_offset) { - int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + int item_offset = strip_offset + linear_tid - slice_offset; + if (item_offset >= 0 && item_offset < TIME_SLICED_ITEMS) { - if (INSERT_PADDING) + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } } } CTA_SYNC(); - if (warp_id == SLICE) + if (warp_id == slice) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) + int item_offset = lane_id * ITEMS_PER_THREAD + i; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - temp_items[ITEM] = temp_storage.buff[item_offset]; + temp_items[i] = temp_storage.buff[item_offset]; } } } // Copy #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - output_items[ITEM] = temp_items[ITEM]; + output_items[i] = temp_items[i]; } } - //! @brief Transposes data items from **warp-striped** arrangement to **blocked** arrangement. - //! Specialized for no timeslicing + //! @brief Transposes data items from **warp-striped** arrangement to **blocked** arrangement. Specialized for no + //! timeslicing //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -537,35 +528,37 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. 
template _CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) + int item_offset = warp_offset + (i * WARP_TIME_SLICED_THREADS) + lane_id; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } WARP_SYNC(0xffffffff); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) + int item_offset = warp_offset + i + (lane_id * ITEMS_PER_THREAD); + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(output_items + ITEM, temp_storage.buff[item_offset]); + detail::uninitialized_copy_single(output_items + i, temp_storage.buff[item_offset]); } } - //! @brief Transposes data items from **warp-striped** arrangement to **blocked** arrangement. - //! Specialized for warp-timeslicing + //! @brief Transposes data items from **warp-striped** arrangement to **blocked** arrangement. Specialized for + //! warp-timeslicing //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -574,44 +567,43 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { #pragma unroll - for (unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE) + for (int slice = 0; slice < TIME_SLICES; ++slice) { CTA_SYNC(); - if (warp_id == SLICE) + if (warp_id == slice) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) + int item_offset = i * WARP_TIME_SLICED_THREADS + lane_id; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } WARP_SYNC(0xffffffff); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) + int item_offset = i + lane_id * ITEMS_PER_THREAD; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } } } - //! @brief Exchanges data items annotated by rank into **blocked** arrangement. - //! Specialized for no timeslicing. + //! 
@brief Exchanges data items annotated by rank into **blocked** arrangement. Specialized for no timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -623,38 +615,37 @@ private: //! Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ranks[ITEM]; - if (INSERT_PADDING) + int item_offset = ranks[i]; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) + int item_offset = linear_tid * ITEMS_PER_THREAD + i; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } - //! @brief Exchanges data items annotated by rank into **blocked** arrangement. - //! Specialized for warp-timeslicing. + //! @brief Exchanges data items annotated by rank into **blocked** arrangement. Specialized for warp-timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -666,61 +657,60 @@ private: //! 
Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { - InputT temp_items[ITEMS_PER_THREAD]; + T temp_items[ITEMS_PER_THREAD]; #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + for (int slice = 0; slice < TIME_SLICES; slice++) { CTA_SYNC(); - const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE; + const int slice_offset = TIME_SLICED_ITEMS * slice; #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ranks[ITEM] - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) + int item_offset = ranks[i] - slice_offset; + if (item_offset >= 0 && item_offset < WARP_TIME_SLICED_ITEMS) { - if (INSERT_PADDING) + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } } CTA_SYNC(); - if (warp_id == SLICE) + if (warp_id == slice) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) + int item_offset = lane_id * ITEMS_PER_THREAD + i; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - temp_items[ITEM] = temp_storage.buff[item_offset]; + temp_items[i] = temp_storage.buff[item_offset]; } } } // Copy #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - output_items[ITEM] = temp_items[ITEM]; + output_items[i] = temp_items[i]; } } - //! @brief Exchanges data items annotated by rank into **striped** arrangement. - //! Specialized for no timeslicing. + //! @brief Exchanges data items annotated by rank into **striped** arrangement. Specialized for no timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -732,38 +722,37 @@ private: //! 
Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ranks[ITEM]; - if (INSERT_PADDING) + int item_offset = ranks[i]; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) + int item_offset = i * BLOCK_THREADS + linear_tid; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } - //! @brief Exchanges data items annotated by rank into **striped** arrangement. - //! Specialized for warp-timeslicing. + //! @brief Exchanges data items annotated by rank into **striped** arrangement. Specialized for warp-timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -775,54 +764,54 @@ private: //! Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { - InputT temp_items[ITEMS_PER_THREAD]; + T temp_items[ITEMS_PER_THREAD]; #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + for (int slice = 0; slice < TIME_SLICES; slice++) { - const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; - const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + const int slice_offset = slice * TIME_SLICED_ITEMS; + const int slice_oob = slice_offset + TIME_SLICED_ITEMS; CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ranks[ITEM] - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) + int item_offset = ranks[i] - slice_offset; + if (item_offset >= 0 && item_offset < WARP_TIME_SLICED_ITEMS) { - if (INSERT_PADDING) + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { // Read a strip of items - const int STRIP_OFFSET = ITEM * BLOCK_THREADS; - const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + const int strip_offset = i * BLOCK_THREADS; + const int strip_oob = strip_offset + BLOCK_THREADS; - if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + if (slice_offset < strip_oob && slice_oob > strip_offset) { - int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; - if 
((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + int item_offset = strip_offset + linear_tid - slice_offset; + if (item_offset >= 0 && item_offset < TIME_SLICED_ITEMS) { - if (INSERT_PADDING) + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - temp_items[ITEM] = temp_storage.buff[item_offset]; + temp_items[i] = temp_storage.buff[item_offset]; } } } @@ -830,9 +819,9 @@ private: // Copy #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - output_items[ITEM] = temp_items[ITEM]; + output_items[i] = temp_items[i]; } } @@ -840,29 +829,15 @@ public: //! @name Collective constructors //! @{ - /** - * @brief Collective constructor using a private static allocation of shared memory as temporary storage. - */ + //! @brief Collective constructor using a private static allocation of shared memory as temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE BlockExchange() : temp_storage(PrivateStorage()) - , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - , lane_id(LaneId()) - , warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS) - , warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) {} - /** - * @brief Collective constructor using the specified memory allocation as temporary storage. - * - * @param[in] temp_storage - * Reference to memory allocation having layout type TempStorage - */ + //! @brief Collective constructor using the specified memory allocation as temporary storage. + //! @param[in] temp_storage Reference to memory allocation having layout type TempStorage _CCCL_DEVICE _CCCL_FORCEINLINE BlockExchange(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - , lane_id(LaneId()) - , warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS) - , warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) {} //! @} end member group @@ -899,10 +874,9 @@ public: //! // Collectively exchange data into a blocked arrangement across threads //! BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data); //! - //! Suppose the set of striped input ``thread_data`` across the block of threads is - //! ``{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }`` after loading from - //! device-accessible memory. The corresponding output ``thread_data`` in those threads will be - //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. + //! Suppose the set of striped input ``thread_data`` across the block of threads is ``{ [0,128,256,384], + //! [1,129,257,385], ..., [127,255,383,511] }`` after loading from device-accessible memory. The corresponding output + //! ``thread_data`` in those threads will be ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. //! @endrst //! //! @param[in] input_items @@ -912,7 +886,7 @@ public: //! Items from exchange, converting between **striped** and **blocked** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void - StripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) + StripedToBlocked(const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { StripedToBlocked(input_items, output_items, Int2Type()); } @@ -950,11 +924,10 @@ public: //! // Store data striped across block threads into an ordered tile //! cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); //! - //! Suppose the set of blocked input ``thread_data`` across the block of threads is - //! 
``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. - //! The corresponding output ``thread_data`` in those threads will be - //! ``{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }`` in - //! preparation for storing to device-accessible memory. + //! Suppose the set of blocked input ``thread_data`` across the block of threads is ``{ [0,1,2,3], [4,5,6,7], + //! [8,9,10,11], ..., [508,509,510,511] }``. The corresponding output ``thread_data`` in those threads will be + //! ``{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }`` in preparation for storing to device-accessible + //! memory. //! @endrst //! //! @param[in] input_items @@ -964,7 +937,7 @@ public: //! Items from exchange, converting between **striped** and **blocked** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void - BlockedToStriped(InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) + BlockedToStriped(const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { BlockedToStriped(input_items, output_items, Int2Type()); } @@ -1001,12 +974,11 @@ public: //! // Collectively exchange data into a blocked arrangement across threads //! BlockExchange(temp_storage).WarpStripedToBlocked(thread_data); //! - //! Suppose the set of warp-striped input ``thread_data`` across the block of threads is - //! ``{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }`` - //! after loading from device-accessible memory. (The first 128 items are striped across - //! the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) - //! The corresponding output ``thread_data`` in those threads will be - //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. + //! Suppose the set of warp-striped input ``thread_data`` across the block of threads is ``{ [0,32,64,96], + //! [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }`` after loading from device-accessible memory. (The first 128 + //! items are striped across the first warp of 32 threads, the second 128 items are striped across the second warp, + //! etc.) The corresponding output ``thread_data`` in those threads will be ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], + //! ..., [508,509,510,511] }``. //! @endrst //! //! @param[in] input_items @@ -1016,7 +988,7 @@ public: //! Items from exchange, converting between **striped** and **blocked** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void - WarpStripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) + WarpStripedToBlocked(const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { WarpStripedToBlocked(input_items, output_items, Int2Type()); } @@ -1056,12 +1028,11 @@ public: //! // Store data striped across warp threads into an ordered tile //! cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); //! - //! Suppose the set of blocked input ``thread_data`` across the block of threads is - //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. - //! The corresponding output ``thread_data`` in those threads will be - //! ``{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }`` - //! in preparation for storing to device-accessible memory. (The first 128 items are striped - //! across the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) + //! 
Suppose the set of blocked input ``thread_data`` across the block of threads is ``{ [0,1,2,3], [4,5,6,7], + //! [8,9,10,11], ..., [508,509,510,511] }``. The corresponding output ``thread_data`` in those threads will be + //! ``{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }`` in preparation for storing to + //! device-accessible memory. (The first 128 items are striped across the first warp of 32 threads, the second 128 + //! items are striped across the second warp, etc.) //! @endrst //! //! @param[in] input_items @@ -1071,7 +1042,7 @@ public: //! Items from exchange, converting between **striped** and **blocked** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void - BlockedToWarpStriped(InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) + BlockedToWarpStriped(const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { BlockedToWarpStriped(input_items, output_items, Int2Type()); } @@ -1099,7 +1070,7 @@ public: //! Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { @@ -1126,7 +1097,7 @@ public: //! Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { @@ -1153,35 +1124,35 @@ public: //! Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStripedGuarded( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ranks[ITEM]; - if (INSERT_PADDING) + int item_offset = ranks[i]; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - if (ranks[ITEM] >= 0) + if (ranks[i] >= 0) { - temp_storage.buff[item_offset] = input_items[ITEM]; + temp_storage.buff[item_offset] = input_items[i]; } } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) + int item_offset = i * BLOCK_THREADS + linear_tid; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } @@ -1211,36 +1182,36 @@ public: //! 
Corresponding flag denoting item validity template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStripedFlagged( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], ValidFlag (&is_valid)[ITEMS_PER_THREAD]) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ranks[ITEM]; - if (INSERT_PADDING) + int item_offset = ranks[i]; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - if (is_valid[ITEM]) + if (is_valid[i]) { - temp_storage.buff[item_offset] = input_items[ITEM]; + temp_storage.buff[item_offset] = input_items[i]; } } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) + int item_offset = i * BLOCK_THREADS + linear_tid; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } @@ -1248,97 +1219,75 @@ public: #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - */ - _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked(InputT (&items)[ITEMS_PER_THREAD]) + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked(T (&items)[ITEMS_PER_THREAD]) { StripedToBlocked(items, items); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - */ - _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped(InputT (&items)[ITEMS_PER_THREAD]) + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped(T (&items)[ITEMS_PER_THREAD]) { BlockedToStriped(items, items); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - */ - _CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked(InputT (&items)[ITEMS_PER_THREAD]) + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + _CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked(T (&items)[ITEMS_PER_THREAD]) { WarpStripedToBlocked(items, items); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - */ - _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped(InputT (&items)[ITEMS_PER_THREAD]) + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped(T (&items)[ITEMS_PER_THREAD]) { BlockedToWarpStriped(items, items); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - * - * @param[in] ranks - * Corresponding scatter ranks - */ + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. 
+ /// + /// @param[in] ranks + /// Corresponding scatter ranks template - _CCCL_DEVICE _CCCL_FORCEINLINE void - ScatterToBlocked(InputT (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) + _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked(T (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToBlocked(items, items, ranks); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - * - * @param[in] ranks - * Corresponding scatter ranks - */ + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + /// @param[in] ranks + /// Corresponding scatter ranks template - _CCCL_DEVICE _CCCL_FORCEINLINE void - ScatterToStriped(InputT (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) + _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped(T (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToStriped(items, items, ranks); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - * - * @param[in] ranks - * Corresponding scatter ranks - */ + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + /// @param[in] ranks + /// Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void - ScatterToStripedGuarded(InputT (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) + ScatterToStripedGuarded(T (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToStripedGuarded(items, items, ranks); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - * - * @param[in] ranks - * Corresponding scatter ranks - * - * @param[in] is_valid - * Corresponding flag denoting item validity - */ + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + /// @param[in] ranks + /// Corresponding scatter ranks + /// @param[in] is_valid + /// Corresponding flag denoting item validity template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStripedFlagged( - InputT (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], ValidFlag (&is_valid)[ITEMS_PER_THREAD]) + T (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], ValidFlag (&is_valid)[ITEMS_PER_THREAD]) { ScatterToStriped(items, items, ranks, is_valid); } diff --git a/cub/cub/block/block_load.cuh b/cub/cub/block/block_load.cuh index 87adeb5451..76c073f1b5 100644 --- a/cub/cub/block/block_load.cuh +++ b/cub/cub/block/block_load.cuh @@ -26,7 +26,7 @@ * ******************************************************************************/ -//! @file Operations for reading linear tiles of data into the CUDA thread block. +//! @file block_load.cuh Operations for reading linear tiles of data into the CUDA thread block. #pragma once @@ -54,7 +54,6 @@ CUB_NAMESPACE_BEGIN //! Load a linear segment of items into a blocked arrangement across the thread block. //! //! @blocked -//! //! @endrst //! //! @tparam T @@ -63,27 +62,27 @@ CUB_NAMESPACE_BEGIN //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **[inferred]** The random-access iterator type for input iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! 
(e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D +//! thread blocks) //! -//! @param[in] block_itr +//! @param[in] block_src_it //! The thread block's base input iterator for loading from //! -//! @param[out] items -//! Data to load -template +//! @param[out] dst_items +//! Destination to load data into +template _CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectBlocked(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) +LoadDirectBlocked(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { // Load directly in thread-blocked order #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM]; + dst_items[i] = block_src_it[linear_tid * ITEMS_PER_THREAD + i]; } } @@ -100,31 +99,32 @@ LoadDirectBlocked(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEM //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **[inferred]** The random-access iterator type for input iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D +//! thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into //! -//! @param[in] valid_items -//! Number of valid items to load -template -_CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectBlocked(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) +//! @param[in] block_items_end +//! First out-of-bounds index when loading from block_src_it +template +_CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked( + int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - if ((linear_tid * ITEMS_PER_THREAD) + ITEM < valid_items) + const auto src_pos = linear_tid * ITEMS_PER_THREAD + i; + if (src_pos < block_items_end) { - items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM]; + dst_items[i] = block_src_it[src_pos]; } } } @@ -143,35 +143,39 @@ LoadDirectBlocked(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEM //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **[inferred]** The random-access iterator type for input \iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D +//! thread blocks) //! -//! @param[in] block_itr +//! @param[in] block_src_it //! 
The thread block's base input iterator for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into //! -//! @param[in] valid_items -//! Number of valid items to load +//! @param[in] block_items_end +//! First out-of-bounds index when loading from block_src_it //! //! @param[in] oob_default //! Default value to assign out-of-bound items -template +template _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked( - int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + int linear_tid, + RandomAccessIterator block_src_it, + T (&dst_items)[ITEMS_PER_THREAD], + int block_items_end, + DefaultT oob_default) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = oob_default; + dst_items[i] = oob_default; } - LoadDirectBlocked(linear_tid, block_itr, items, valid_items); + LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document @@ -179,58 +183,44 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked( //! @brief Internal implementation for load vectorization //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D +//! thread blocks) //! -//! @param[in] block_ptr +//! @param[in] block_src_ptr //! Input pointer for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into template _CCCL_DEVICE _CCCL_FORCEINLINE void -InternalLoadDirectBlockedVectorized(int linear_tid, T* block_ptr, T (&items)[ITEMS_PER_THREAD]) +InternalLoadDirectBlockedVectorized(int linear_tid, const T* block_src_ptr, T (&dst_items)[ITEMS_PER_THREAD]) { - // Biggest memory access word that T is a whole multiple of - using DeviceWord = typename UnitWord::DeviceWord; - + // Find biggest memory access word that T is a whole multiple of + using device_word_t = typename UnitWord::DeviceWord; _CCCL_DIAG_PUSH # if defined(CUB_CLANG_VERSION) && CUB_CLANG_VERSION >= 100000 _CCCL_DIAG_SUPPRESS_CLANG("-Wsizeof-array-div") # endif // defined(CUB_CLANG_VERSION) && CUB_CLANG_VERSION >= 100000 - enum - { - TOTAL_WORDS = sizeof(items) / sizeof(DeviceWord), - - VECTOR_SIZE = (TOTAL_WORDS % 4 == 0) ? 4 - : (TOTAL_WORDS % 2 == 0) ? 2 - : 1, - - VECTORS_PER_THREAD = TOTAL_WORDS / VECTOR_SIZE, - }; + constexpr int total_words = static_cast(sizeof(dst_items) / sizeof(device_word_t)); _CCCL_DIAG_POP + constexpr int vector_size = (total_words % 4 == 0) ? 4 : (total_words % 2 == 0) ? 
2 : 1; + constexpr int vectors_per_thread = total_words / vector_size; + using vector_t = typename CubVector::Type; - // Vector type - using Vector = typename CubVector::Type; - - // Vector items - Vector vec_items[VECTORS_PER_THREAD]; - - // Aliased input ptr - Vector* vec_ptr = reinterpret_cast(block_ptr) + (linear_tid * VECTORS_PER_THREAD); - -// Load directly in thread-blocked order + // Load into an array of vectors in thread-blocked order + vector_t vec_items[vectors_per_thread]; + const vector_t* vec_ptr = reinterpret_cast(block_src_ptr) + linear_tid * vectors_per_thread; # pragma unroll - for (int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++) + for (int i = 0; i < vectors_per_thread; i++) { - vec_items[ITEM] = ThreadLoad(vec_ptr + ITEM); + vec_items[i] = ThreadLoad(vec_ptr + i); } -// Copy +// Copy to destination # pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = *(reinterpret_cast(vec_items) + ITEM); + dst_items[i] = *(reinterpret_cast(vec_items) + i); } } @@ -258,19 +248,19 @@ InternalLoadDirectBlockedVectorized(int linear_tid, T* block_ptr, T (&items)[ITE //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + +//! linear_tid` for 2D thread blocks) //! -//! @param[in] block_ptr -//! Input pointer for loading from +//! @param[in] block_src_ptr +//! The thread block's base pointer for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! destination to load data into template _CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectBlockedVectorized(int linear_tid, T* block_ptr, T (&items)[ITEMS_PER_THREAD]) +LoadDirectBlockedVectorized(int linear_tid, T* block_src_ptr, T (&dst_items)[ITEMS_PER_THREAD]) { - InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); + InternalLoadDirectBlockedVectorized(linear_tid, block_src_ptr, dst_items); } //! @} end member group @@ -293,43 +283,41 @@ LoadDirectBlockedVectorized(int linear_tid, T* block_ptr, T (&items)[ITEMS_PER_T //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **[inferred]** The random-access iterator type for input iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D +//! thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load -template +//! @param[out] dst_items +//! 
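As a rough illustration of the vectorization above: with ``T = int`` and four items per thread, ``total_words`` is 4 and ``vector_size`` is 4, so each thread issues a single 16-byte vector load. The public entry point can be exercised as sketched below (hypothetical kernel name; the base pointer must be aligned to the vector word, otherwise callers should prefer ``LoadDirectBlocked``):

.. code-block:: c++

    #include <cub/cub.cuh>

    __global__ void VectorizedBlockedLoadKernel(int* d_in)
    {
      constexpr int ITEMS_PER_THREAD = 4;
      int thread_data[ITEMS_PER_THREAD];

      // This overload only accepts raw pointers; within BlockLoad's BLOCK_LOAD_VECTORIZE
      // policy, non-pointer iterators fall back to the element-wise path.
      cub::LoadDirectBlockedVectorized(threadIdx.x, d_in, thread_data);
    }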
Destination to load data into +template _CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) +LoadDirectStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = block_itr[linear_tid + ITEM * BLOCK_THREADS]; + dst_items[i] = block_src_it[linear_tid + i * BLOCK_THREADS]; } } namespace detail { - -template +template _CCCL_DEVICE _CCCL_FORCEINLINE void load_transform_direct_striped( - int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], TransformOpT transform_op) + int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], TransformOpT transform_op) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = transform_op(block_itr[linear_tid + ITEM * BLOCK_THREADS]); + dst_items[i] = transform_op(block_src_it[linear_tid + i * BLOCK_THREADS]); } } - } // namespace detail //! @rst @@ -348,32 +336,32 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void load_transform_direct_striped( //! @tparam ITEMS_PER_THREAD //! **inferred** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT -//! **inferred** The random-access iterator type for input \iterator. +//! @tparam RandomAccessIterator +//! **inferred** The random-access iterator type for input iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + +//! linear_tid for 2D thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into //! -//! @param[in] valid_items +//! @param[in] block_items_end //! Number of valid items to load -//! -template -_CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) +template +_CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped( + int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - if (linear_tid + (ITEM * BLOCK_THREADS) < valid_items) + const auto src_pos = linear_tid + i * BLOCK_THREADS; + if (src_pos < block_items_end) { - items[ITEM] = block_itr[linear_tid + ITEM * BLOCK_THREADS]; + dst_items[i] = block_src_it[src_pos]; } } } @@ -395,35 +383,39 @@ LoadDirectStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEM //! @tparam ITEMS_PER_THREAD //! **inferred** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **inferred** The random-access iterator type for input \iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! 
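To make the striped indexing concrete: thread ``t`` of a 128-thread block receives items ``t``, ``t + 128``, ``t + 256``, and ``t + 384`` of its segment, so every iteration of the unrolled loop is a coalesced access. A minimal sketch (illustrative names):

.. code-block:: c++

    #include <cub/cub.cuh>

    __global__ void StripedLoadKernel(const int* d_in)
    {
      constexpr int BLOCK_THREADS    = 128;
      constexpr int ITEMS_PER_THREAD = 4;

      int thread_data[ITEMS_PER_THREAD];
      // BLOCK_THREADS is supplied explicitly; it is the stride between a thread's items.
      cub::LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in, thread_data);
      // For t = threadIdx.x: thread_data = { d_in[t], d_in[t + 128], d_in[t + 256], d_in[t + 384] }
    }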
A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + +//! linear_tid` for 2D thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into //! -//! @param[in] valid_items +//! @param[in] block_items_end //! Number of valid items to load //! //! @param[in] oob_default //! Default value to assign out-of-bound items -template +template _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped( - int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + int linear_tid, + RandomAccessIterator block_src_it, + T (&dst_items)[ITEMS_PER_THREAD], + int block_items_end, + DefaultT oob_default) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = oob_default; + dst_items[i] = oob_default; } - LoadDirectStriped(linear_tid, block_itr, items, valid_items); + LoadDirectStriped(linear_tid, block_src_it, dst_items, block_items_end); } //! @} end member group @@ -448,31 +440,31 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped( //! @tparam ITEMS_PER_THREAD //! **inferred** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **inferred** The random-access iterator type for input iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + +//! linear_tid` for 2D thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load -template +//! @param[out] dst_items +//! Destination to load data into +template _CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectWarpStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) +LoadDirectWarpStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); - int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; - int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + const int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + const int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + const int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; // Load directly in warp-striped order #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - new (&items[ITEM]) InputT(block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)]); + new (&dst_items[i]) T(block_src_it[warp_offset + tid + (i * CUB_PTX_WARP_THREADS)]); } } @@ -494,36 +486,37 @@ LoadDirectWarpStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ //! @tparam ITEMS_PER_THREAD //! **inferred** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **inferred** The random-access iterator type for input \iterator. //! //! @param[in] linear_tid -//! 
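The warp-striped variant partitions the tile per warp rather than per block: lane ``l`` of warp ``w`` reads item ``w * 32 * ITEMS_PER_THREAD + l + i * 32`` on iteration ``i``, so each warp consumes a contiguous slice with coalesced accesses. A brief sketch (illustrative kernel name, 32-thread warps assumed):

.. code-block:: c++

    #include <cub/cub.cuh>

    __global__ void WarpStripedLoadKernel(const int* d_in)
    {
      constexpr int ITEMS_PER_THREAD = 4;
      int thread_data[ITEMS_PER_THREAD];

      // Each warp reads a contiguous 32 * ITEMS_PER_THREAD slice of the block's tile.
      cub::LoadDirectWarpStriped(threadIdx.x, d_in, thread_data);
    }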
A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + +//! linear_tid` for 2D thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into //! -//! @param[in] valid_items +//! @param[in] block_items_end //! Number of valid items to load -template -_CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectWarpStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) +template +_CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectWarpStriped( + int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); - int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; - int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + const int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + const int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + const int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; // Load directly in warp-striped order #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items) + const auto src_pos = warp_offset + tid + (i * CUB_PTX_WARP_THREADS); + if (src_pos < block_items_end) { - new (&items[ITEM]) InputT(block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)]); + new (&dst_items[i]) T(block_src_it[src_pos]); } } } @@ -547,42 +540,46 @@ LoadDirectWarpStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ //! @tparam ITEMS_PER_THREAD //! **inferred** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **inferred** The random-access iterator type for input \iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + +//! linear_tid` for 2D thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into //! -//! @param[in] valid_items +//! @param[in] block_items_end //! Number of valid items to load //! //! @param[in] oob_default //! 
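The overload documented next additionally takes ``oob_default``: it first fills every slot with the default and then performs the guarded load, so downstream code always sees a defined value. A minimal sketch with a sentinel of ``-1`` (illustrative names):

.. code-block:: c++

    #include <cub/cub.cuh>

    __global__ void PaddedWarpStripedLoadKernel(const int* d_in, int num_items)
    {
      constexpr int BLOCK_THREADS    = 128;
      constexpr int ITEMS_PER_THREAD = 4;

      const int tile_base = blockIdx.x * BLOCK_THREADS * ITEMS_PER_THREAD;
      int thread_data[ITEMS_PER_THREAD];

      // In-range slots hold the corresponding input item; out-of-range slots come back as -1.
      cub::LoadDirectWarpStriped(threadIdx.x, d_in + tile_base, thread_data, num_items - tile_base, -1);
    }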
Default value to assign out-of-bound items -template +template _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectWarpStriped( - int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + int linear_tid, + RandomAccessIterator block_src_it, + T (&dst_items)[ITEMS_PER_THREAD], + int block_items_end, + DefaultT oob_default) { // Load directly in warp-striped order #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = oob_default; + dst_items[i] = oob_default; } - LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end); } //! @} end member group -//! @brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a -//! linear segment of data from memory into a blocked arrangement across a CUDA thread block. +//! @brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data +//! from memory into a blocked arrangement across a CUDA thread block. enum BlockLoadAlgorithm { //! @rst @@ -594,8 +591,8 @@ enum BlockLoadAlgorithm //! Performance Considerations //! ++++++++++++++++++++++++++ //! - //! The utilization of memory transactions (coalescing) decreases as the - //! access stride between threads increases (i.e., the number items per thread). + //! The utilization of memory transactions (coalescing) decreases as the access stride between threads increases + //! (i.e., the number items per thread). //! @endrst BLOCK_LOAD_DIRECT, @@ -608,8 +605,7 @@ enum BlockLoadAlgorithm //! Performance Considerations //! ++++++++++++++++++++++++++ //! - //! The utilization of memory transactions (coalescing) doesn't depend on - //! the number of items per thread. + //! The utilization of memory transactions (coalescing) doesn't depend on the number of items per thread. //! //! @endrst BLOCK_LOAD_STRIPED, @@ -618,22 +614,20 @@ enum BlockLoadAlgorithm //! Overview //! ++++++++++++++++++++++++++ //! - //! A :ref:`blocked arrangement ` of data is read - //! from memory using CUDA's built-in vectorized loads as a coalescing optimization. - //! For example, ``ld.global.v4.s32`` instructions will be generated + //! A :ref:`blocked arrangement ` of data is read from memory using CUDA's built-in + //! vectorized loads as a coalescing optimization. For example, ``ld.global.v4.s32`` instructions will be generated //! when ``T = int`` and ``ITEMS_PER_THREAD % 4 == 0``. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! - //! - The utilization of memory transactions (coalescing) remains high until the the - //! access stride between threads (i.e., the number items per thread) exceeds the - //! maximum vector load width (typically 4 items or 64B, whichever is lower). - //! - The following conditions will prevent vectorization and loading will fall - //! back to cub::BLOCK_LOAD_DIRECT: + //! - The utilization of memory transactions (coalescing) remains high until the the access stride between threads + //! (i.e., the number items per thread) exceeds the maximum vector load width (typically 4 items or 64B, whichever + //! is lower). + //! - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT: //! //! - ``ITEMS_PER_THREAD`` is odd - //! - The ``InputIteratorT`` is not a simple pointer type + //! - The ``RandomAccessIterator`` is not a simple pointer type //! 
- The block input offset is not quadword-aligned //! - The data type ``T`` is not a built-in primitive or CUDA vector type //! (e.g., ``short``, ``int2``, ``double``, ``float2``, etc.) @@ -645,16 +639,15 @@ enum BlockLoadAlgorithm //! Overview //! ++++++++++++++++++++++++++ //! - //! A :ref:`striped arrangement ` of data is read efficiently from memory and then - //! locally transposed into a :ref:`blocked arrangement `. + //! A :ref:`striped arrangement ` of data is read efficiently from memory and then locally + //! transposed into a :ref:`blocked arrangement `. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! - //! - The utilization of memory transactions (coalescing) remains high regardless - //! of items loaded per thread. - //! - The local reordering incurs slightly longer latencies and throughput than the - //! direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. + //! - The utilization of memory transactions (coalescing) remains high regardless of items loaded per thread. + //! - The local reordering incurs slightly longer latencies and throughput than the direct cub::BLOCK_LOAD_DIRECT and + //! cub::BLOCK_LOAD_VECTORIZE alternatives. //! //! @endrst BLOCK_LOAD_TRANSPOSE, @@ -675,8 +668,8 @@ enum BlockLoadAlgorithm //! ++++++++++++++++++++++++++ //! //! - The utilization of memory transactions (coalescing) remains high regardless of items loaded per thread. - //! - The local reordering incurs slightly larger latencies than the - //! direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. + //! - The local reordering incurs slightly larger latencies than the direct cub::BLOCK_LOAD_DIRECT and + //! cub::BLOCK_LOAD_VECTORIZE alternatives. //! - Provisions more shared storage, but incurs smaller latencies than the //! BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED alternative. //! @@ -687,10 +680,10 @@ enum BlockLoadAlgorithm //! Overview //! ++++++++++++++++++++++++++ //! - //! Like ``BLOCK_LOAD_WARP_TRANSPOSE``, a :ref:`warp-striped arrangement ` - //! of data is read directly from memory and then is locally transposed into a - //! :ref:`blocked arrangement `. To reduce the shared memory requirement, only one - //! warp's worth of shared memory is provisioned and is subsequently time-sliced among warps. + //! Like ``BLOCK_LOAD_WARP_TRANSPOSE``, a :ref:`warp-striped arrangement ` of data is read + //! directly from memory and then is locally transposed into a :ref:`blocked arrangement `. + //! To reduce the shared memory requirement, only one warp's worth of shared memory is provisioned and is subsequently + //! time-sliced among warps. //! //! Usage Considerations //! ++++++++++++++++++++++++++ @@ -700,10 +693,9 @@ enum BlockLoadAlgorithm //! Performance Considerations //! ++++++++++++++++++++++++++ //! - //! - The utilization of memory transactions (coalescing) remains high regardless - //! of items loaded per thread. - //! - Provisions less shared memory temporary storage, but incurs larger - //! latencies than the BLOCK_LOAD_WARP_TRANSPOSE alternative. + //! - The utilization of memory transactions (coalescing) remains high regardless of items loaded per thread. + //! - Provisions less shared memory temporary storage, but incurs larger latencies than the BLOCK_LOAD_WARP_TRANSPOSE + //! alternative. //! //! @endrst BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, @@ -711,15 +703,15 @@ enum BlockLoadAlgorithm //! @rst //! The BlockLoad class provides :ref:`collective ` data movement methods for loading a linear -//! 
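Conceptually, the transpose-based policies are a coalesced striped (or warp-striped) read followed by a shared-memory exchange. A hand-rolled equivalent of ``BLOCK_LOAD_TRANSPOSE`` looks roughly like the sketch below (illustrative kernel name); this mirrors what the corresponding internal load specialization does:

.. code-block:: c++

    #include <cub/cub.cuh>

    __global__ void ManualTransposeLoadKernel(const int* d_in)
    {
      constexpr int BLOCK_THREADS    = 128;
      constexpr int ITEMS_PER_THREAD = 4;
      using BlockExchange = cub::BlockExchange<int, BLOCK_THREADS, ITEMS_PER_THREAD>;
      __shared__ typename BlockExchange::TempStorage temp_storage;

      int thread_data[ITEMS_PER_THREAD];
      cub::LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in, thread_data);  // coalesced read
      BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data); // local reorder
      // thread_data is now blocked: thread t holds items 4*t .. 4*t+3 of the tile.
    }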
segment of items from memory into a :ref:`blocked arrangement ` across a -//! CUDA thread block. +//! segment of items from memory into a :ref:`blocked arrangement ` across a CUDA thread +//! block. //! //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! -//! - The BlockLoad class provides a single data movement abstraction that can be specialized -//! to implement different cub::BlockLoadAlgorithm strategies. This facilitates different -//! performance policies for different architectures, data types, granularity sizes, etc. +//! - The BlockLoad class provides a single data movement abstraction that can be specialized to implement different +//! cub::BlockLoadAlgorithm strategies. This facilitates different performance policies for different architectures, +//! data types, granularity sizes, etc. //! - BlockLoad can be optionally specialized by different data movement strategies: //! //! #. :cpp:enumerator:`cub::BLOCK_LOAD_DIRECT`: @@ -746,10 +738,9 @@ enum BlockLoadAlgorithm //! //! @blockcollective{BlockLoad} //! -//! The code snippet below illustrates the loading of a linear -//! segment of 512 integers into a "blocked" arrangement across 128 threads where each -//! thread owns 4 consecutive items. The load is specialized for ``BLOCK_LOAD_WARP_TRANSPOSE``, -//! meaning memory references are efficiently coalesced using a warp-striped access +//! The code snippet below illustrates the loading of a linear segment of 512 integers into a "blocked" arrangement +//! across 128 threads where each thread owns 4 consecutive items. The load is specialized for +//! ``BLOCK_LOAD_WARP_TRANSPOSE``, meaning memory references are efficiently coalesced using a warp-striped access //! pattern (after which items are locally reordered among threads). //! //! .. code-block:: c++ @@ -768,21 +759,20 @@ enum BlockLoadAlgorithm //! int thread_data[4]; //! BlockLoad(temp_storage).Load(d_data, thread_data); //! -//! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. -//! The set of ``thread_data`` across the block of threads in those threads will be -//! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``. +//! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. The set of ``thread_data`` across the block of threads in +//! those threads will be ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``. //! //! Re-using dynamically allocating shared memory //! +++++++++++++++++++++++++++++++++++++++++++++ //! -//! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of -//! dynamically shared memory with BlockReduce and how to re-purpose the same memory region. -//! This example can be easily adapted to the storage required by BlockLoad. +//! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically shared memory with +//! BlockReduce and how to re-purpose the same memory region. This example can be easily adapted to the storage required +//! by BlockLoad. //! //! @endrst //! -//! @tparam InputT -//! The data type to read into (which must be convertible from the input iterator's value type). +//! @tparam T +// The data type to read into (which must be convertible from the input iterator's value type). //! //! @tparam BLOCK_DIM_X //! The thread block length in threads along the X dimension @@ -793,20 +783,15 @@ enum BlockLoadAlgorithm //! @tparam ALGORITHM //! **[optional]** cub::BlockLoadAlgorithm tuning policy. default: ``cub::BLOCK_LOAD_DIRECT``. //! -//! @tparam WARP_TIME_SLICING -//! 
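Putting the pieces together, the following self-contained sketch shows a guarded, padded BlockLoad over an arbitrarily sized input, including the host-side grid sizing that the snippets in this header leave implicit (all names here are illustrative):

.. code-block:: c++

    #include <cub/cub.cuh>

    constexpr int BLOCK_THREADS    = 128;
    constexpr int ITEMS_PER_THREAD = 4;
    constexpr int TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD;

    __global__ void TileLoadKernel(const int* d_in, int num_items)
    {
      using BlockLoad = cub::BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
      __shared__ typename BlockLoad::TempStorage temp_storage;

      const int tile_base = blockIdx.x * TILE_ITEMS;
      int thread_data[ITEMS_PER_THREAD];

      // Out-of-range slots are filled with 0, so downstream code needs no bounds checks.
      BlockLoad(temp_storage).Load(d_in + tile_base, thread_data, num_items - tile_base, 0);
      // ... consume thread_data ...
    }

    void LaunchTileLoad(const int* d_in, int num_items)
    {
      const int grid_size = (num_items + TILE_ITEMS - 1) / TILE_ITEMS; // one block per tile
      TileLoadKernel<<<grid_size, BLOCK_THREADS>>>(d_in, num_items);
    }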
**[optional]** Whether or not only one warp's worth of shared memory should be -//! allocated and time-sliced among block-warps during any load-related data transpositions -//! (versus each warp having its own storage). (default: false) -//! //! @tparam BLOCK_DIM_Y //! **[optional]** The thread block length in threads along the Y dimension (default: 1) //! //! @tparam BLOCK_DIM_Z -//! **[optional]** The thread block length in threads along the Z dimension (default: 1) +//! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! //! @tparam LEGACY_PTX_ARCH -//! **[optional]** Unused. -template class BlockLoad { -private: - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - }; + static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; // total threads in the block - /// Load helper template - struct LoadInternal; + struct LoadInternal; // helper to dispatch the load algorithm - /** - * BLOCK_LOAD_DIRECT specialization of load helper - */ template struct LoadInternal { - /// Shared memory storage layout type using TempStorage = NullType; - - /// Linear thread-id int linear_tid; - /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} - /** - * @brief Load a linear segment of items from memory - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - LoadDirectBlocked(linear_tid, block_itr, items); + LoadDirectBlocked(linear_tid, block_src_it, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - LoadDirectBlocked(linear_tid, block_itr, items, valid_items); + LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end); } - /** - * @brief Load a linear segment of items from memory, guarded by range, with a fall-back - * assignment of out-of-bound elements - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - * - * @param[in] oob_default - * Default value to assign out-of-bound items - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); + LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end, oob_default); } }; - /** - * BLOCK_LOAD_STRIPED specialization of load helper - */ template struct LoadInternal { - /// Shared memory 
storage layout type using TempStorage = NullType; - - /// Linear thread-id int linear_tid; - /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} - /** - * @brief Load a linear segment of items from memory - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - LoadDirectStriped(linear_tid, block_itr, items); + LoadDirectStriped(linear_tid, block_src_it, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - LoadDirectStriped(linear_tid, block_itr, items, valid_items); + LoadDirectStriped(linear_tid, block_src_it, dst_items, block_items_end); } - /** - * @brief Load a linear segment of items from memory, guarded by range, with a fall-back - * assignment of out-of-bound elements - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - * - * @param[in] oob_default - * Default value to assign out-of-bound items - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); + LoadDirectStriped(linear_tid, block_src_it, dst_items, block_items_end, oob_default); } }; - /** - * BLOCK_LOAD_VECTORIZE specialization of load helper - */ template struct LoadInternal { - /// Shared memory storage layout type using TempStorage = NullType; - - /// Linear thread-id int linear_tid; - /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} - /** - * @brief Load a linear segment of items from memory, specialized for native pointer types - * (attempts vectorization) - * - * @param[in] block_ptr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputT* block_ptr, InputT (&items)[ITEMS_PER_THREAD]) + // attempts vectorization (pointer) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(const T* block_ptr, T (&dst_items)[ITEMS_PER_THREAD]) { - InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); + InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, dst_items); } - /** - * @brief Load a linear segment of items from memory, specialized for native pointer types - * (attempts vectorization) - * - * @param[in] block_ptr - * The thread block's base input iterator for loading from - * - * 
@param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(const InputT* block_ptr, InputT (&items)[ITEMS_PER_THREAD]) + // any other iterator, no vectorization + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); + LoadDirectBlocked(linear_tid, block_src_it, dst_items); } - /** - * @brief Load a linear segment of items from memory, specialized for native pointer types - * (attempts vectorization) - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ + // attempts vectorization (cache modified iterator) template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(CacheModifiedInputIterator block_itr, InputT (&items)[ITEMS_PER_THREAD]) + Load(CacheModifiedInputIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - InternalLoadDirectBlockedVectorized(linear_tid, block_itr.ptr, items); + InternalLoadDirectBlockedVectorized(linear_tid, block_src_it.ptr, dst_items); } - /** - * @brief Load a linear segment of items from memory, specialized for opaque input iterators - * (skips vectorization) - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(_InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) - { - LoadDirectBlocked(linear_tid, block_itr, items); - } - - /** - * @brief Load a linear segment of items from memory, guarded by range (skips vectorization) - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - */ - template + // skips vectorization + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - LoadDirectBlocked(linear_tid, block_itr, items, valid_items); + LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end); } - /** - * @brief Load a linear segment of items from memory, guarded by range, with a fall-back - * assignment of out-of-bound elements (skips vectorization) - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - * - * @param[in] oob_default - * Default value to assign out-of-bound items - */ - template + // skips vectorization + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); + LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end, oob_default); } }; - /** - * BLOCK_LOAD_TRANSPOSE specialization of load helper - */ template struct LoadInternal { - // BlockExchange utility type for keys - using BlockExchange = BlockExchange; - - /// Shared memory storage layout type - struct _TempStorage : BlockExchange::TempStorage - {}; + using BlockExchange = BlockExchange; + using _TempStorage = typename 
BlockExchange::TempStorage; + using TempStorage = Uninitialized<_TempStorage>; - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> - {}; - - /// Thread reference to shared storage _TempStorage& temp_storage; - - /// Linear thread-id int linear_tid; - /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()) , linear_tid(linear_tid) {} - /** - * @brief Load a linear segment of items from memory - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - LoadDirectStriped(linear_tid, block_itr, items); - BlockExchange(temp_storage).StripedToBlocked(items, items); + LoadDirectStriped(linear_tid, block_src_it, dst_items); + BlockExchange(temp_storage).StripedToBlocked(dst_items, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - LoadDirectStriped(linear_tid, block_itr, items, valid_items); - BlockExchange(temp_storage).StripedToBlocked(items, items); + LoadDirectStriped(linear_tid, block_src_it, dst_items, block_items_end); + BlockExchange(temp_storage).StripedToBlocked(dst_items, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range, with a fall-back - * assignment of out-of-bound elements - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - * - * @param[in] oob_default - * Default value to assign out-of-bound items - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); - BlockExchange(temp_storage).StripedToBlocked(items, items); + LoadDirectStriped(linear_tid, block_src_it, dst_items, block_items_end, oob_default); + BlockExchange(temp_storage).StripedToBlocked(dst_items, dst_items); } }; - /** - * BLOCK_LOAD_WARP_TRANSPOSE specialization of load helper - */ template struct LoadInternal { - enum - { - WARP_THREADS = CUB_WARP_THREADS(0) - }; - - // Assert BLOCK_THREADS must be a multiple of WARP_THREADS - static_assert(int(BLOCK_THREADS) % int(WARP_THREADS) == 0, "BLOCK_THREADS must be a multiple of WARP_THREADS"); + static constexpr int WARP_THREADS = CUB_WARP_THREADS(0); + static_assert(BLOCK_THREADS % WARP_THREADS == 0, "BLOCK_THREADS must be a multiple of WARP_THREADS"); - // BlockExchange utility type for keys - using BlockExchange = BlockExchange; + using BlockExchange = 
BlockExchange; + using _TempStorage = typename BlockExchange::TempStorage; + using TempStorage = Uninitialized<_TempStorage>; - /// Shared memory storage layout type - struct _TempStorage : BlockExchange::TempStorage - {}; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> - {}; - - /// Thread reference to shared storage _TempStorage& temp_storage; - - /// Linear thread-id int linear_tid; - /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()) , linear_tid(linear_tid) {} - /** - * @brief Load a linear segment of items from memory - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - LoadDirectWarpStriped(linear_tid, block_itr, items); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items); + BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end); + BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range, with a fall-back - * assignment of out-of-bound elements - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - * - * @param[in] oob_default - * Default value to assign out-of-bound items - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end, oob_default); + BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items); } }; - /** - * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED specialization of load helper - */ template struct LoadInternal { - enum - { - WARP_THREADS = CUB_WARP_THREADS(0) - }; + static constexpr int WARP_THREADS = CUB_WARP_THREADS(0); + static_assert(BLOCK_THREADS % WARP_THREADS == 0, "BLOCK_THREADS must be a multiple of WARP_THREADS"); - // Assert BLOCK_THREADS must be a multiple of WARP_THREADS - 
static_assert(int(BLOCK_THREADS) % int(WARP_THREADS) == 0, "BLOCK_THREADS must be a multiple of WARP_THREADS"); + using BlockExchange = BlockExchange; + using _TempStorage = typename BlockExchange::TempStorage; + using TempStorage = Uninitialized<_TempStorage>; - // BlockExchange utility type for keys - using BlockExchange = BlockExchange; - - /// Shared memory storage layout type - struct _TempStorage : BlockExchange::TempStorage - {}; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> - {}; - - /// Thread reference to shared storage _TempStorage& temp_storage; - - /// Linear thread-id int linear_tid; - /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()) , linear_tid(linear_tid) {} - /** - * @brief Load a linear segment of items from memory - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - LoadDirectWarpStriped(linear_tid, block_itr, items); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items); + BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end); + BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range, with a fall-back - * assignment of out-of-bound elements - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - * - * @param[in] oob_default - * Default value to assign out-of-bound items - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end, oob_default); + BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items); } }; - /// Internal load implementation to use - using InternalLoad = LoadInternal; - - /// Shared memory storage layout type + using InternalLoad = LoadInternal; // load implementation to use using 
_TempStorage = typename InternalLoad::TempStorage; - /// Internal storage allocator + // Internal storage allocator _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } - /// Thread reference to shared storage _TempStorage& temp_storage; - - /// Linear thread-id int linear_tid; public: /// @smemstorage{BlockLoad} - struct TempStorage : Uninitialized<_TempStorage> - {}; + using TempStorage = Uninitialized<_TempStorage>; //! @name Collective constructors //! @{ - /** - * @brief Collective constructor using a private static allocation of shared memory as temporary - * storage. - */ + /// @brief Collective constructor using a private static allocation of shared memory as temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE BlockLoad() : temp_storage(PrivateStorage()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - /** - * @brief Collective constructor using the specified memory allocation as temporary storage. - * - * @param[in] temp_storage - * Reference to memory allocation having layout type TempStorage - */ + /// @brief Collective constructor using the specified memory allocation as temporary storage. + /// @param[in] temp_storage Reference to memory allocation having layout type TempStorage _CCCL_DEVICE _CCCL_FORCEINLINE BlockLoad(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) @@ -1448,10 +1085,9 @@ public: //! Snippet //! +++++++ //! - //! The code snippet below illustrates the loading of a linear - //! segment of 512 integers into a "blocked" arrangement across 128 threads where each - //! thread owns 4 consecutive items. The load is specialized for ``BLOCK_LOAD_WARP_TRANSPOSE``, - //! meaning memory references are efficiently coalesced using a warp-striped access + //! The code snippet below illustrates the loading of a linear segment of 512 integers into a "blocked" arrangement + //! across 128 threads where each thread owns 4 consecutive items. The load is specialized for + //! ``BLOCK_LOAD_WARP_TRANSPOSE``, meaning memory references are efficiently coalesced using a warp-striped access //! pattern (after which items are locally reordered among threads). //! //! .. code-block:: c++ @@ -1470,21 +1106,20 @@ public: //! int thread_data[4]; //! BlockLoad(temp_storage).Load(d_data, thread_data); //! - //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. - //! The set of ``thread_data`` across the block of threads in those threads will be - //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``. + //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. The set of ``thread_data`` across the block of threads + //! in those threads will be ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``. //! //! @endrst //! - //! @param[in] block_itr - //! The thread block's base input iterator for loading from + //! @param[in] block_src_it + //! The thread block's base iterator for loading from //! - //! @param[out] items - //! Data to load - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) + //! @param[out] dst_items + //! Destination to load data into + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - InternalLoad(temp_storage, linear_tid).Load(block_itr, items); + InternalLoad(temp_storage, linear_tid).Load(block_src_it, dst_items); } //! 
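The two collective constructors differ only in where ``TempStorage`` lives. When the storage is carved out of dynamically allocated shared memory, as in the ``block/example_block_reduce_dyn_smem.cu`` example referenced earlier, it can be handed to the second constructor roughly as follows (sketch with illustrative names; the kernel must be launched with at least ``sizeof(TempStorage)`` bytes of dynamic shared memory):

.. code-block:: c++

    #include <cub/cub.cuh>

    __global__ void DynSmemLoadKernel(const int* d_in)
    {
      using BlockLoad = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE>;

      // Launched as DynSmemLoadKernel<<<grid, 128, sizeof(typename BlockLoad::TempStorage)>>>(...).
      extern __shared__ char smem[];
      auto& temp_storage = reinterpret_cast<typename BlockLoad::TempStorage&>(*smem);

      int thread_data[4];
      BlockLoad(temp_storage).Load(d_in, thread_data);

      // After a __syncthreads(), the same smem region can be re-purposed by another collective.
    }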
@rst @@ -1497,17 +1132,16 @@ public: //! Snippet //! +++++++ //! - //! The code snippet below illustrates the guarded loading of a linear - //! segment of 512 integers into a "blocked" arrangement across 128 threads where each - //! thread owns 4 consecutive items. The load is specialized for ``BLOCK_LOAD_WARP_TRANSPOSE``, - //! meaning memory references are efficiently coalesced using a warp-striped access + //! The code snippet below illustrates the guarded loading of a linear segment of 512 integers into a "blocked" + //! arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for + //! ``BLOCK_LOAD_WARP_TRANSPOSE``, meaning memory references are efficiently coalesced using a warp-striped access //! pattern (after which items are locally reordered among threads). //! //! .. code-block:: c++ //! //! #include // or equivalently //! - //! __global__ void ExampleKernel(int *d_data, int valid_items, ...) + //! __global__ void ExampleKernel(int *d_data, int block_items_end, ...) //! { //! // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each //! using BlockLoad = cub::BlockLoad; @@ -1517,32 +1151,32 @@ public: //! //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; - //! BlockLoad(temp_storage).Load(d_data, thread_data, valid_items); + //! BlockLoad(temp_storage).Load(d_data, thread_data, block_items_end); //! - //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...`` and ``valid_items`` is ``5``. - //! The set of ``thread_data`` across the block of threads in those threads will be - //! ``{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }``, with only the first two threads - //! being unmasked to load portions of valid data (and other items remaining unassigned). + //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...`` and ``block_items_end`` is ``5``. The set of + //! ``thread_data`` across the block of threads in those threads will be ``{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }``, + //! with only the first two threads being unmasked to load portions of valid data (and other items remaining + //! unassigned). //! //! @endrst //! - //! @param[in] block_itr - //! The thread block's base input iterator for loading from + //! @param[in] block_src_it + //! The thread block's base iterator for loading from //! - //! @param[out] items - //! Data to load + //! @param[out] dst_items + //! Destination to load data into //! - //! @param[in] valid_items + //! @param[in] block_items_end //! Number of valid items to load - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items); + InternalLoad(temp_storage, linear_tid).Load(block_src_it, dst_items, block_items_end); } //! @rst - //! Load a linear segment of items from memory, guarded by range, with a fall-back - //! assignment of out-of-bound elements + //! Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements //! //! - @blocked //! - @smemreuse @@ -1550,17 +1184,16 @@ public: //! Snippet //! +++++++ //! - //! The code snippet below illustrates the guarded loading of a linear - //! segment of 512 integers into a "blocked" arrangement across 128 threads where each - //! 
thread owns 4 consecutive items. The load is specialized for ``BLOCK_LOAD_WARP_TRANSPOSE``, - //! meaning memory references are efficiently coalesced using a warp-striped access + //! The code snippet below illustrates the guarded loading of a linear segment of 512 integers into a "blocked" + //! arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for + //! ``BLOCK_LOAD_WARP_TRANSPOSE``, meaning memory references are efficiently coalesced using a warp-striped access //! pattern (after which items are locally reordered among threads). //! //! .. code-block:: c++ //! //! #include // or equivalently //! - //! __global__ void ExampleKernel(int *d_data, int valid_items, ...) + //! __global__ void ExampleKernel(int *d_data, int block_items_end, ...) //! { //! // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each //! using BlockLoad = cub::BlockLoad; @@ -1570,35 +1203,34 @@ public: //! //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; - //! BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1); + //! BlockLoad(temp_storage).Load(d_data, thread_data, block_items_end, -1); //! - //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...`` - //! ``valid_items`` is ``5``, and the out-of-bounds default is ``-1``. - //! The set of ``thread_data`` across the block of threads in those threads will be - //! ``{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }``, with only the first two threads - //! being unmasked to load portions of valid data (and other items are assigned ``-1``) + //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...``, ``block_items_end`` is ``5``, and the out-of-bounds + //! default is ``-1``. The set of ``thread_data`` across the block of threads in those threads will be + //! ``{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }``, with only the first two threads being unmasked to load + //! portions of valid data (and other items are assigned ``-1``) //! //! @endrst //! - //! @param[in] block_itr - //! The thread block's base input iterator for loading from + //! @param[in] block_src_it + //! The thread block's base iterator for loading from //! - //! @param[out] items - //! Data to load + //! @param[out] dst_items + //! Destination to load data into //! - //! @param[in] valid_items + //! @param[in] block_items_end //! Number of valid items to load //! //! @param[in] oob_default //! Default value to assign out-of-bound items - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default); + InternalLoad(temp_storage, linear_tid).Load(block_src_it, dst_items, block_items_end, oob_default); } - //@} end member group + //! 
@} end member group }; template > diff --git a/cub/cub/detail/uninitialized_copy.cuh b/cub/cub/detail/uninitialized_copy.cuh index 9a3f01e2c0..326826c0d1 100644 --- a/cub/cub/detail/uninitialized_copy.cuh +++ b/cub/cub/detail/uninitialized_copy.cuh @@ -58,6 +58,7 @@ template ::value, int>::type = 0> _CCCL_HOST_DEVICE void uninitialized_copy_single(T* ptr, U&& val) { + // gevtushenko: placement new should work here as well, but the code generated for copy assignment is sometimes better *ptr = ::cuda::std::forward(val); } diff --git a/cub/cub/thread/thread_load.cuh b/cub/cub/thread/thread_load.cuh index a572fa5056..3db8d73031 100644 --- a/cub/cub/thread/thread_load.cuh +++ b/cub/cub/thread/thread_load.cuh @@ -102,11 +102,11 @@ enum CacheLoadModifier * @tparam MODIFIER * [inferred] CacheLoadModifier enumeration * - * @tparam InputIteratorT - * [inferred] Input iterator type \iterator + * @tparam RandomAccessIterator + * [inferred] The input's iterator type \iterator */ -template -_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t ThreadLoad(InputIteratorT itr); +template +_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t ThreadLoad(RandomAccessIterator itr); //@} end member group @@ -125,9 +125,9 @@ struct IterateThreadLoad IterateThreadLoad::template Load(ptr, vals); } - template + template CUB_DEPRECATED_BECAUSE("Use UnrolledCopy() instead") - static _CCCL_DEVICE _CCCL_FORCEINLINE void Dereference(InputIteratorT itr, T* vals) + static _CCCL_DEVICE _CCCL_FORCEINLINE void Dereference(RandomAccessIterator itr, T* vals) { vals[COUNT] = itr[COUNT]; IterateThreadLoad::Dereference(itr, vals); @@ -142,8 +142,8 @@ struct IterateThreadLoad static _CCCL_DEVICE _CCCL_FORCEINLINE void Load(T const* /*ptr*/, T* /*vals*/) {} - template - static _CCCL_DEVICE _CCCL_FORCEINLINE void Dereference(InputIteratorT /*itr*/, T* /*vals*/) + template + static _CCCL_DEVICE _CCCL_FORCEINLINE void Dereference(RandomAccessIterator /*itr*/, T* /*vals*/) {} }; @@ -309,9 +309,9 @@ _CUB_LOAD_ALL(LOAD_LDG, global.nc) /** * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types */ -template -_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t -ThreadLoad(InputIteratorT itr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) +template +_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t +ThreadLoad(RandomAccessIterator itr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { return *itr; } @@ -320,7 +320,8 @@ ThreadLoad(InputIteratorT itr, Int2Type /*modifier*/, Int2Type -_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoad(T* ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) +_CCCL_DEVICE _CCCL_FORCEINLINE T +ThreadLoad(const T* ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { return *ptr; } @@ -329,9 +330,9 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoad(T* ptr, Int2Type /*mod * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types */ template -_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(T* ptr, Int2Type /*is_primitive*/) +_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(const T* ptr, Int2Type /*is_primitive*/) { - T retval = *reinterpret_cast(ptr); + T retval = *reinterpret_cast(ptr); return retval; } @@ -339,16 +340,15 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(T* ptr, Int2Type -_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(T* ptr, Int2Type /*is_primitive*/) +_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(const T* ptr, Int2Type /*is_primitive*/) { - // Word type for memcopying - using VolatileWord = typename 
UnitWord::VolatileWord; - + // Word type for memcpying + using VolatileWord = typename UnitWord::VolatileWord; constexpr int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); T retval; VolatileWord* words = reinterpret_cast(&retval); - UnrolledCopy(reinterpret_cast(ptr), words); + UnrolledCopy(reinterpret_cast(ptr), words); return retval; } @@ -356,9 +356,9 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(T* ptr, Int2Type -_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoad(T* ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) +_CCCL_DEVICE _CCCL_FORCEINLINE T +ThreadLoad(const T* ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { - // Apply tags for partial-specialization return ThreadLoadVolatilePointer(ptr, Int2Type::PRIMITIVE>()); } @@ -368,25 +368,18 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoad(T* ptr, Int2Type /*mo template _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoad(T const* ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { - using DeviceWord = typename UnitWord::DeviceWord; - + using DeviceWord = typename UnitWord::DeviceWord; constexpr int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); DeviceWord words[DEVICE_MULTIPLE]; - UnrolledThreadLoad( - reinterpret_cast(const_cast(ptr)), words); - + UnrolledThreadLoad(reinterpret_cast(ptr), words); return *reinterpret_cast(words); } -/** - * ThreadLoad definition for generic modifiers - */ -template -_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t ThreadLoad(InputIteratorT itr) +template +_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t ThreadLoad(RandomAccessIterator itr) { - // Apply tags for partial-specialization - return ThreadLoad(itr, Int2Type(), Int2Type<::cuda::std::is_pointer::value>()); + return ThreadLoad(itr, Int2Type(), Int2Type<::cuda::std::is_pointer::value>()); } #endif // DOXYGEN_SHOULD_SKIP_THIS
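For reference, this dispatching entry point is what user code reaches either through a direct ``ThreadLoad`` call or through a ``CacheModifiedInputIterator`` wrapping a raw pointer; a minimal sketch of both (illustrative kernel name):

.. code-block:: c++

    #include <cub/cub.cuh>

    __global__ void ThreadLoadExampleKernel(const int* d_in, int* d_out)
    {
      // Direct call: load through the read-only data cache.
      const int a = cub::ThreadLoad<cub::LOAD_LDG>(d_in + threadIdx.x);

      // The same modifier applied through an iterator, which also composes with BlockLoad.
      cub::CacheModifiedInputIterator<cub::LOAD_LDG, int> ldg_it(d_in);
      const int b = ldg_it[threadIdx.x];

      d_out[threadIdx.x] = a + b;
    }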