diff --git a/cub/cub/agent/agent_unique_by_key.cuh b/cub/cub/agent/agent_unique_by_key.cuh index a9dbc9efc7..3194fe11e2 100644 --- a/cub/cub/agent/agent_unique_by_key.cuh +++ b/cub/cub/agent/agent_unique_by_key.cuh @@ -302,6 +302,9 @@ struct AgentUniqueByKey CTA_SYNC(); + // Preventing loop unrolling helps avoid perf degradation when switching from signed to unsigned 32-bit offset + // types + #pragma unroll(1) for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS) @@ -626,7 +629,9 @@ struct AgentUniqueByKey { // Blocks are launched in increasing order, so just assign one tile per block int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index - OffsetT tile_offset = tile_idx * ITEMS_PER_TILE; // Global offset for the current tile + + // Global offset for the current tile + OffsetT tile_offset = static_cast(tile_idx) * static_cast(ITEMS_PER_TILE); if (tile_idx < num_tiles - 1) { diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh index 36c0576707..1292f04ad1 100644 --- a/cub/cub/device/device_select.cuh +++ b/cub/cub/device/device_select.cuh @@ -42,13 +42,14 @@ # pragma system_header #endif // no system header -#include -#include - +#include #include #include #include +#include +#include + CUB_NAMESPACE_BEGIN @@ -837,6 +838,165 @@ struct DeviceSelect stream); } + //! @rst + //! Given an input sequence ``d_keys_in`` and ``d_values_in`` with runs of key-value pairs with consecutive + //! equal-valued keys, only the first key and its value from each run is selectively copied + //! to ``d_keys_out`` and ``d_values_out``. + //! The total number of items selected is written to ``d_num_selected_out``. + //! + //! - The user-provided equality operator, `equality_op`, is used to determine whether keys are equivalent + //! - Copies of the selected items are compacted into ``d_out`` and maintain + //! their original relative ordering. + //! - In-place operations are not supported. There must be no overlap between + //! any of the provided ranges: + //! + //! - ``[d_keys_in, d_keys_in + num_items)`` + //! - ``[d_keys_out, d_keys_out + *d_num_selected_out)`` + //! - ``[d_values_in, d_values_in + num_items)`` + //! - ``[d_values_out, d_values_out + *d_num_selected_out)`` + //! - ``[d_num_selected_out, d_num_selected_out + 1)`` + //! + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! int num_items; // e.g., 8 + //! int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + //! int *d_values_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + //! int *d_keys_out; // e.g., [ , , , , , , , ] + //! int *d_values_out; // e.g., [ , , , , , , , ] + //! int *d_num_selected_out; // e.g., [ ] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSelect::UniqueByKey( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_values_in, + //! d_keys_out, d_values_out, d_num_selected_out, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run selection + //! cub::DeviceSelect::UniqueByKey( + //! d_temp_storage, temp_storage_bytes, + //! 
d_keys_in, d_values_in, + //! d_keys_out, d_values_out, d_num_selected_out, num_items); + //! + //! // d_keys_out <-- [0, 2, 9, 5, 8] + //! // d_values_out <-- [1, 2, 4, 5, 8] + //! // d_num_selected_out <-- [5] + //! + //! @endrst + //! + //! @tparam KeyInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input keys @iterator + //! + //! @tparam ValueInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input values @iterator + //! + //! @tparam KeyOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing selected keys @iterator + //! + //! @tparam ValueOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing selected values @iterator + //! + //! @tparam NumSelectedIteratorT + //! **[inferred]** Output iterator type for recording the number of items selected @iterator + //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! + //! @tparam EqualityOpT + //! **[inferred]** Type of equality_op + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Pointer to the input sequence of keys + //! + //! @param[in] d_values_in + //! Pointer to the input sequence of values + //! + //! @param[out] d_keys_out + //! Pointer to the output sequence of selected keys + //! + //! @param[out] d_values_out + //! Pointer to the output sequence of selected values + //! + //! @param[out] d_num_selected_out + //! Pointer to the total number of items selected (i.e., length of `d_keys_out` or `d_values_out`) + //! + //! @param[in] num_items + //! Total number of input items (i.e., length of `d_keys_in` or `d_values_in`) + //! + //! @param[in] equality_op + //! Binary predicate to determine equality + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst + template + CUB_RUNTIME_FUNCTION __forceinline__ static // + typename ::cuda::std::enable_if< // + !::cuda::std::is_convertible::value, // + cudaError_t>::type + UniqueByKey( + void* d_temp_storage, + size_t& temp_storage_bytes, + KeyInputIteratorT d_keys_in, + ValueInputIteratorT d_values_in, + KeyOutputIteratorT d_keys_out, + ValueOutputIteratorT d_values_out, + NumSelectedIteratorT d_num_selected_out, + NumItemsT num_items, + EqualityOpT equality_op, + cudaStream_t stream = 0) + { + using OffsetT = typename detail::ChooseOffsetT::Type; + + return DispatchUniqueByKey< + KeyInputIteratorT, + ValueInputIteratorT, + KeyOutputIteratorT, + ValueOutputIteratorT, + NumSelectedIteratorT, + EqualityOpT, + OffsetT>::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_keys_out, + d_values_out, + d_num_selected_out, + equality_op, + static_cast(num_items), + stream); + } + //! @rst //! Given an input sequence ``d_keys_in`` and ``d_values_in`` with runs of key-value pairs with consecutive //! equal-valued keys, only the first key and its value from each run is selectively copied @@ -914,6 +1074,9 @@ struct DeviceSelect //! @tparam NumSelectedIteratorT //! **[inferred]** Output iterator type for recording the number of items selected @iterator //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! //! @param[in] d_temp_storage //! 
Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -947,56 +1110,49 @@ struct DeviceSelect typename ValueInputIteratorT, typename KeyOutputIteratorT, typename ValueOutputIteratorT, - typename NumSelectedIteratorT> - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t - UniqueByKey(void *d_temp_storage, - size_t &temp_storage_bytes, - KeyInputIteratorT d_keys_in, - ValueInputIteratorT d_values_in, - KeyOutputIteratorT d_keys_out, - ValueOutputIteratorT d_values_out, - NumSelectedIteratorT d_num_selected_out, - int num_items, - cudaStream_t stream = 0) + typename NumSelectedIteratorT, + typename NumItemsT> + CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t UniqueByKey( + void* d_temp_storage, + size_t& temp_storage_bytes, + KeyInputIteratorT d_keys_in, + ValueInputIteratorT d_values_in, + KeyOutputIteratorT d_keys_out, + ValueOutputIteratorT d_values_out, + NumSelectedIteratorT d_num_selected_out, + NumItemsT num_items, + cudaStream_t stream = 0) { - using OffsetT = int; - using EqualityOp = Equality; - - return DispatchUniqueByKey::Dispatch(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_keys_out, - d_values_out, - d_num_selected_out, - EqualityOp(), - num_items, - stream); + return UniqueByKey( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_keys_out, + d_values_out, + d_num_selected_out, + num_items, + Equality{}, + stream); } template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t - UniqueByKey(void *d_temp_storage, - size_t &temp_storage_bytes, - KeyInputIteratorT d_keys_in, - ValueInputIteratorT d_values_in, - KeyOutputIteratorT d_keys_out, - ValueOutputIteratorT d_values_out, - NumSelectedIteratorT d_num_selected_out, - int num_items, - cudaStream_t stream, - bool debug_synchronous) + typename NumSelectedIteratorT, + typename NumItemsT> + CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t UniqueByKey( + void* d_temp_storage, + size_t& temp_storage_bytes, + KeyInputIteratorT d_keys_in, + ValueInputIteratorT d_values_in, + KeyOutputIteratorT d_keys_out, + ValueOutputIteratorT d_values_out, + NumSelectedIteratorT d_num_selected_out, + NumItemsT num_items, + cudaStream_t stream, + bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG @@ -1004,15 +1160,17 @@ struct DeviceSelect ValueInputIteratorT, KeyOutputIteratorT, ValueOutputIteratorT, - NumSelectedIteratorT>(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_keys_out, - d_values_out, - d_num_selected_out, - num_items, - stream); + NumSelectedIteratorT, + NumItemsT>( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_keys_out, + d_values_out, + d_num_selected_out, + num_items, + stream); } }; diff --git a/cub/test/catch2_test_device_select_unique_by_key.cu b/cub/test/catch2_test_device_select_unique_by_key.cu index d829b9dc12..36dc0ca46f 100644 --- a/cub/test/catch2_test_device_select_unique_by_key.cu +++ b/cub/test/catch2_test_device_select_unique_by_key.cu @@ -81,6 +81,23 @@ struct index_to_huge_type_op_t } }; +template +struct index_to_value_t +{ + template + __host__ __device__ __forceinline__ ValueT operator()(IndexT index) + { + if (static_cast(index) == 4300000000ULL) + { + return static_cast(1); + } + else + { + return static_cast(0); + } + } +}; + 
DECLARE_LAUNCH_WRAPPER(cub::DeviceSelect::UniqueByKey, select_unique_by_key); // %PARAM% TEST_LAUNCH lid 0:1:2 @@ -215,15 +232,27 @@ CUB_TEST("DeviceSelect::UniqueByKey does not change input", "[device][select_uni REQUIRE(reference_vals == vals_in); } +template struct project_first { + EqualityOpT equality_op; template __host__ __device__ bool operator()(const Tuple& lhs, const Tuple& rhs) const { - return thrust::get<0>(lhs) == thrust::get<0>(rhs); + return equality_op(thrust::get<0>(lhs), thrust::get<0>(rhs)); } }; +template +struct custom_equality_op +{ + T div_val; + __host__ __device__ __forceinline__ bool operator()(const T& lhs, const T& rhs) const + { + return (lhs / div_val) == (rhs / div_val); + } +}; + CUB_TEST("DeviceSelect::UniqueByKey works with iterators", "[device][select_unique_by_key]", all_types) { using type = typename c2h::get<0, TestType>; @@ -253,7 +282,7 @@ CUB_TEST("DeviceSelect::UniqueByKey works with iterators", "[device][select_uniq thrust::host_vector reference_vals = vals_in; const auto zip_begin = thrust::make_zip_iterator(reference_keys.begin(), reference_vals.begin()); const auto zip_end = thrust::make_zip_iterator(reference_keys.end(), reference_vals.end()); - const auto boundary = std::unique(zip_begin, zip_end, project_first{}); + const auto boundary = std::unique(zip_begin, zip_end, project_first{cub::Equality{}}); REQUIRE((boundary - zip_begin) == num_selected_out[0]); keys_out.resize(num_selected_out[0]); @@ -293,7 +322,7 @@ CUB_TEST("DeviceSelect::UniqueByKey works with pointers", "[device][select_uniqu thrust::host_vector reference_vals = vals_in; const auto zip_begin = thrust::make_zip_iterator(reference_keys.begin(), reference_vals.begin()); const auto zip_end = thrust::make_zip_iterator(reference_keys.end(), reference_vals.end()); - const auto boundary = std::unique(zip_begin, zip_end, project_first{}); + const auto boundary = std::unique(zip_begin, zip_end, project_first{cub::Equality{}}); REQUIRE((boundary - zip_begin) == num_selected_out[0]); keys_out.resize(num_selected_out[0]); @@ -346,7 +375,7 @@ CUB_TEST("DeviceSelect::UniqueByKey works with a different output type", "[devic thrust::host_vector reference_vals = vals_in; const auto zip_begin = thrust::make_zip_iterator(reference_keys.begin(), reference_vals.begin()); const auto zip_end = thrust::make_zip_iterator(reference_keys.end(), reference_vals.end()); - const auto boundary = std::unique(zip_begin, zip_end, project_first{}); + const auto boundary = std::unique(zip_begin, zip_end, project_first{cub::Equality{}}); REQUIRE((boundary - zip_begin) == num_selected_out[0]); keys_out.resize(num_selected_out[0]); @@ -392,7 +421,7 @@ CUB_TEST("DeviceSelect::UniqueByKey works and uses vsmem for large types", const auto zip_begin = thrust::make_zip_iterator(reference_keys.begin(), reference_vals.begin()); const auto zip_end = thrust::make_zip_iterator(reference_keys.end(), reference_vals.end()); - const auto boundary = std::unique(zip_begin, zip_end, project_first{}); + const auto boundary = std::unique(zip_begin, zip_end, project_first{cub::Equality{}}); REQUIRE((boundary - zip_begin) == num_selected_out[0]); keys_out.resize(num_selected_out[0]); @@ -402,3 +431,105 @@ CUB_TEST("DeviceSelect::UniqueByKey works and uses vsmem for large types", REQUIRE(reference_keys == keys_out); REQUIRE(reference_vals == vals_out); } + +CUB_TEST("DeviceSelect::UniqueByKey works for very large input that need 64-bit offset types", + "[device][select_unique_by_key]") +{ + using type = std::int32_t; + using 
index_type = std::int64_t; + + const std::size_t num_items = 4400000000ULL; + thrust::host_vector reference_keys{static_cast(0), static_cast(1), static_cast(0)}; + thrust::host_vector reference_values{0, 4300000000ULL, 4300000001ULL}; + + auto keys_in = thrust::make_transform_iterator(thrust::make_counting_iterator(0ULL), index_to_value_t{}); + auto values_in = thrust::make_counting_iterator(0ULL); + thrust::device_vector keys_out(reference_keys.size()); + thrust::device_vector values_out(reference_values.size()); + + // Needs to be device accessible + thrust::device_vector num_selected_out(1, 0); + int* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); + + // Run test + select_unique_by_key(keys_in, values_in, keys_out.begin(), values_out.begin(), d_first_num_selected_out, num_items); + + // Ensure that we created the correct output + REQUIRE(reference_keys.size() == static_cast(num_selected_out[0])); + REQUIRE(reference_keys == keys_out); + REQUIRE(reference_values == values_out); +} + +CUB_TEST("DeviceSelect::UniqueByKey works for very large outputs that needs 64-bit offset types", + "[device][select_unique_by_key]") +{ + using type = std::int32_t; + using index_type = std::int64_t; + + constexpr std::size_t num_items = 4400000000ULL; + + auto keys_in = thrust::make_counting_iterator(0ULL); + auto values_in = thrust::make_counting_iterator(0ULL); + + // Needs to be device accessible + thrust::device_vector num_selected_out(1, 0); + index_type* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); + + // Run test + select_unique_by_key( + keys_in, + values_in, + thrust::make_discard_iterator(), + thrust::make_discard_iterator(), + d_first_num_selected_out, + num_items); + + // Ensure that we created the correct output + REQUIRE(num_items == static_cast(num_selected_out[0])); +} + +CUB_TEST("DeviceSelect::UniqueByKey works with a custom equality operator", + "[device][select_unique_by_key]") +{ + using type = std::int32_t; + using custom_op_t = custom_equality_op; + using val_type = std::uint64_t; + using index_type = std::int64_t; + + const int num_items = GENERATE_COPY(take(2, random(1, 1000000))); + auto keys_in = thrust::make_counting_iterator(static_cast(0)); + auto values_in = thrust::make_counting_iterator(0ULL); + thrust::device_vector keys_out(num_items); + thrust::device_vector vals_out(num_items); + + // Needs to be device accessible + thrust::device_vector num_selected_out(1, 0); + index_type* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); + + // Run test + select_unique_by_key( + keys_in, + values_in, + keys_out.begin(), + vals_out.begin(), + d_first_num_selected_out, + num_items, + custom_op_t{static_cast(8)}); + + // Ensure that we create the same output as std + thrust::host_vector reference_keys(num_items); + thrust::host_vector reference_vals(num_items); + thrust::copy(keys_in, keys_in + num_items, reference_keys.begin()); + thrust::copy(values_in, values_in + num_items, reference_vals.begin()); + const auto zip_begin = thrust::make_zip_iterator(reference_keys.begin(), reference_vals.begin()); + const auto zip_end = thrust::make_zip_iterator(reference_keys.end(), reference_vals.end()); + const auto boundary = std::unique(zip_begin, zip_end, project_first{custom_op_t{static_cast(8)}}); + REQUIRE((boundary - zip_begin) == static_cast(num_selected_out[0])); + + keys_out.resize(num_selected_out[0]); + vals_out.resize(num_selected_out[0]); + reference_keys.resize(num_selected_out[0]); + 
reference_vals.resize(num_selected_out[0]); + REQUIRE(reference_keys == keys_out); + REQUIRE(reference_vals == vals_out); +} \ No newline at end of file diff --git a/cub/test/test_util.h b/cub/test/test_util.h index 10fb460117..00a4412c86 100644 --- a/cub/test/test_util.h +++ b/cub/test/test_util.h @@ -1509,6 +1509,18 @@ struct HugeDataType } } + __device__ __host__ HugeDataType& operator=(const HugeDataType& rhs) + { + if (this != &rhs) + { + for (int i = 0; i < ELEMENTS_PER_OBJECT; i++) + { + data[i] = rhs.data[i]; + } + } + return *this; + } + int data[ELEMENTS_PER_OBJECT]; }; @@ -1555,3 +1567,22 @@ __device__ __host__ bool operator!=(const HugeDataType& lhs return false; } + + +template +std::ostream& +operator<<(std::ostream& os, +const HugeDataType& val) +{ + os << '('; + for (int i = 0; i < ELEMENTS_PER_OBJECT; i++) + { + os << CoutCast(val.data[i]); + if (i < ELEMENTS_PER_OBJECT - 1) + { + os << ','; + } + } + os << ')'; + return os; +} diff --git a/thrust/testing/unique_by_key.cu b/thrust/testing/unique_by_key.cu index 76073e0ca3..e6e0dd0af6 100644 --- a/thrust/testing/unique_by_key.cu +++ b/thrust/testing/unique_by_key.cu @@ -4,6 +4,22 @@ #include #include +template +struct index_to_value_t +{ + template + __host__ __device__ __forceinline__ ValueT operator()(IndexT index) + { + if (static_cast(index) == 4300000000ULL) + { + return static_cast(1); + } + else + { + return static_cast(0); + } + } +}; template @@ -337,6 +353,8 @@ struct TestUniqueCopyByKey }; VariableUnitTest TestUniqueCopyByKeyInstance; + + template struct TestUniqueCopyByKeyToDiscardIterator { @@ -436,3 +454,62 @@ struct TestUniqueCopyByKeyToDiscardIterator }; VariableUnitTest TestUniqueCopyByKeyToDiscardIteratorInstance; +template +struct TestUniqueCopyByKeyLargeInput +{ + void operator()() + { + using type = K; + using index_type = std::int64_t; + + const std::size_t num_items = 4400000000ULL; + thrust::host_vector reference_keys{static_cast(0), static_cast(1), static_cast(0)}; + thrust::host_vector reference_values{0, 4300000000ULL, 4300000001ULL}; + + auto keys_in = thrust::make_transform_iterator(thrust::make_counting_iterator(0ULL), index_to_value_t{}); + auto values_in = thrust::make_counting_iterator(0ULL); + thrust::device_vector keys_out(reference_keys.size()); + thrust::device_vector values_out(reference_values.size()); + + // Run test + const auto selected_aut_end = thrust::unique_by_key_copy( + keys_in, keys_in + num_items, values_in, keys_out.begin(), values_out.begin()); + + // Ensure that we created the correct output + auto const num_selected_out = thrust::distance(keys_out.begin(), selected_aut_end.first); + ASSERT_EQUAL(reference_keys.size(), static_cast(num_selected_out)); + ASSERT_EQUAL(num_selected_out, thrust::distance(values_out.begin(), selected_aut_end.second)); + keys_out.resize(num_selected_out); + values_out.resize(num_selected_out); + ASSERT_EQUAL(reference_keys, keys_out); + ASSERT_EQUAL(reference_values, values_out); + } +}; +SimpleUnitTest TestUniqueCopyByKeyLargeInputInstance; + +template +struct TestUniqueCopyByKeyLargeOutCount +{ + void operator()() + { + using type = std::int32_t; + using index_type = std::int64_t; + + constexpr std::size_t num_items = 4400000000ULL; + + auto keys_in = thrust::make_counting_iterator(0ULL); + auto values_in = thrust::make_counting_iterator(0ULL); + + // Run test + auto keys_out = thrust::make_discard_iterator(); + auto values_out = thrust::make_discard_iterator(); + const auto selected_aut_end = thrust::unique_by_key_copy(thrust::device, + 
keys_in, keys_in + num_items, values_in, keys_out, values_out); + + // Ensure that we created the correct output + auto const num_selected_out = thrust::distance(keys_out, selected_aut_end.first); + ASSERT_EQUAL(num_items, static_cast(num_selected_out)); + ASSERT_EQUAL(num_selected_out, thrust::distance(values_out, selected_aut_end.second)); + } +}; +SimpleUnitTest TestUniqueCopyByKeyLargeOutCountInstance; diff --git a/thrust/thrust/system/cuda/detail/unique_by_key.h b/thrust/thrust/system/cuda/detail/unique_by_key.h index 570d0d7139..30a1855c11 100644 --- a/thrust/thrust/system/cuda/detail/unique_by_key.h +++ b/thrust/thrust/system/cuda/detail/unique_by_key.h @@ -38,890 +38,272 @@ #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include +# include +# include + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include THRUST_NAMESPACE_BEGIN -template -_CCCL_HOST_DEVICE thrust::pair -unique_by_key( - const thrust::detail::execution_policy_base &exec, - ForwardIterator1 keys_first, - ForwardIterator1 keys_last, - ForwardIterator2 values_first); +template +_CCCL_HOST_DEVICE thrust::pair unique_by_key( + const thrust::detail::execution_policy_base& exec, + ForwardIterator1 keys_first, + ForwardIterator1 keys_last, + ForwardIterator2 values_first); template -_CCCL_HOST_DEVICE thrust::pair -unique_by_key_copy( - const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_result, - OutputIterator2 values_result); - - -namespace cuda_cub { - -// XXX it should be possible to unify unique & unique_by_key into a single -// agent with various specializations, similar to what is done -// with partition -namespace __unique_by_key { - - template - struct PtxPolicy - { - enum - { - BLOCK_THREADS = _BLOCK_THREADS, - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, - ITEMS_PER_TILE = _BLOCK_THREADS * _ITEMS_PER_THREAD, - }; - static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; - static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static const cub::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; - }; // struct PtxPolicy - - template - struct Tuning; - - namespace mpl = thrust::detail::mpl::math; - - template - struct items_per_thread - { - enum - { - value = mpl::min< - int, - static_cast(NOMINAL_4B_ITEMS_PER_THREAD), - mpl::max(NOMINAL_4B_ITEMS_PER_THREAD * 4 / - sizeof(T))>::value>::value - }; - }; - - - template - struct Tuning - { - const static int INPUT_SIZE = sizeof(T); - enum - { - NOMINAL_4B_ITEMS_PER_THREAD = 11, - // - ITEMS_PER_THREAD = items_per_thread::value - }; - - typedef PtxPolicy<64, - ITEMS_PER_THREAD, - cub::BLOCK_LOAD_WARP_TRANSPOSE, - cub::LOAD_LDG, - cub::BLOCK_SCAN_WARP_SCANS> - type; - }; // Tuning for sm52 - - template - struct Tuning - { - const static int INPUT_SIZE = sizeof(T); - enum - { - NOMINAL_4B_ITEMS_PER_THREAD = 9, - // - ITEMS_PER_THREAD = items_per_thread::value - }; - - typedef PtxPolicy<128, - ITEMS_PER_THREAD, - cub::BLOCK_LOAD_WARP_TRANSPOSE, - cub::LOAD_LDG, - cub::BLOCK_SCAN_WARP_SCANS> - type; - }; // Tuning for sm35 - - template - struct Tuning - { - const static int INPUT_SIZE = sizeof(T); - enum - { - NOMINAL_4B_ITEMS_PER_THREAD = 7, - // - ITEMS_PER_THREAD = 
items_per_thread::value - }; - - typedef PtxPolicy<128, - ITEMS_PER_THREAD, - cub::BLOCK_LOAD_WARP_TRANSPOSE, - cub::LOAD_DEFAULT, - cub::BLOCK_SCAN_WARP_SCANS> - type; - }; // Tuning for sm30 - - template - struct UniqueByKeyAgent - { - typedef typename iterator_traits::value_type key_type; - typedef typename iterator_traits::value_type value_type; - - typedef cub::ScanTileState ScanTileState; - - template - struct PtxPlan : Tuning::type - { - typedef Tuning tuning; - - typedef typename core::LoadIterator::type KeyLoadIt; - typedef typename core::LoadIterator::type ValLoadIt; - - typedef typename core::BlockLoad::type BlockLoadKeys; - typedef typename core::BlockLoad::type BlockLoadValues; - - typedef cub::BlockDiscontinuity - BlockDiscontinuityKeys; - - typedef cub::TilePrefixCallbackOp - TilePrefixCallback; - typedef cub::BlockScan - BlockScan; - - typedef core::uninitialized_array - shared_keys_t; - typedef core::uninitialized_array - shared_values_t; - - union TempStorage - { - struct ScanStorage - { - typename BlockScan::TempStorage scan; - typename TilePrefixCallback::TempStorage prefix; - typename BlockDiscontinuityKeys::TempStorage discontinuity; - } scan_storage; - - typename BlockLoadKeys::TempStorage load_keys; - typename BlockLoadValues::TempStorage load_values; - - shared_keys_t shared_keys; - shared_values_t shared_values; - }; // union TempStorage - }; // struct PtxPlan - - typedef typename core::specialize_plan_msvc10_war::type::type ptx_plan; - - typedef typename ptx_plan::KeyLoadIt KeyLoadIt; - typedef typename ptx_plan::ValLoadIt ValLoadIt; - typedef typename ptx_plan::BlockLoadKeys BlockLoadKeys; - typedef typename ptx_plan::BlockLoadValues BlockLoadValues; - typedef typename ptx_plan::BlockDiscontinuityKeys BlockDiscontinuityKeys; - typedef typename ptx_plan::TilePrefixCallback TilePrefixCallback; - typedef typename ptx_plan::BlockScan BlockScan; - typedef typename ptx_plan::TempStorage TempStorage; - typedef typename ptx_plan::shared_keys_t shared_keys_t; - typedef typename ptx_plan::shared_values_t shared_values_t; - - enum - { - BLOCK_THREADS = ptx_plan::BLOCK_THREADS, - ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD, - ITEMS_PER_TILE = ptx_plan::ITEMS_PER_TILE - }; - - struct impl - { - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - TempStorage & temp_storage; - ScanTileState & tile_state; - KeyLoadIt keys_in; - ValLoadIt values_in; - KeyOutputIt keys_out; - ValOutputIt values_out; - cub::InequalityWrapper predicate; - Size num_items; - - //--------------------------------------------------------------------- - // Utility functions - //--------------------------------------------------------------------- - - struct key_tag {}; - struct value_tag {}; - - THRUST_DEVICE_FUNCTION - shared_keys_t &get_shared(key_tag) - { - return temp_storage.shared_keys; - } - THRUST_DEVICE_FUNCTION - shared_values_t &get_shared(value_tag) - { - return temp_storage.shared_values; - } - - - template - void THRUST_DEVICE_FUNCTION - scatter(Tag tag, - OutputIt items_out, - T (&items)[ITEMS_PER_THREAD], - Size (&selection_flags)[ITEMS_PER_THREAD], - Size (&selection_indices)[ITEMS_PER_THREAD], - int /*num_tile_items*/, - int num_tile_selections, - Size num_selections_prefix, - Size /*num_selections*/) - { - using core::sync_threadblock; - -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int local_scatter_offset = selection_indices[ITEM] - 
- num_selections_prefix; - if (selection_flags[ITEM]) - { - get_shared(tag)[local_scatter_offset] = items[ITEM]; - } - } - - sync_threadblock(); - - for (int item = threadIdx.x; - item < num_tile_selections; - item += BLOCK_THREADS) - { - items_out[num_selections_prefix + item] = get_shared(tag)[item]; - } - - sync_threadblock(); - } - - //--------------------------------------------------------------------- - // Tile processing - //--------------------------------------------------------------------- - - template - Size THRUST_DEVICE_FUNCTION - consume_tile_impl(int num_tile_items, - int tile_idx, - Size tile_base) - { - using core::sync_threadblock; - - key_type keys[ITEMS_PER_THREAD]; - Size selection_flags[ITEMS_PER_THREAD]; - Size selection_idx[ITEMS_PER_THREAD]; - - if (IS_LAST_TILE) - { - // Fill last elements with the first element - // because collectives are not suffix guarded - BlockLoadKeys(temp_storage.load_keys) - .Load(keys_in + tile_base, - keys, - num_tile_items, - *(keys_in + tile_base)); - } - else - { - BlockLoadKeys(temp_storage.load_keys).Load(keys_in + tile_base, keys); - } - - - sync_threadblock(); - - value_type values[ITEMS_PER_THREAD]; - if (IS_LAST_TILE) - { - // Fill last elements with the first element - // because collectives are not suffix guarded - BlockLoadValues(temp_storage.load_values) - .Load(values_in + tile_base, - values, - num_tile_items, - *(values_in + tile_base)); - } - else - { - BlockLoadValues(temp_storage.load_values) - .Load(values_in + tile_base, values); - } - - sync_threadblock(); - - if (IS_FIRST_TILE) - { - BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity) - .FlagHeads(selection_flags, keys, predicate); - } - else - { - key_type tile_predecessor = keys_in[tile_base - 1]; - BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity) - .FlagHeads(selection_flags, keys, predicate, tile_predecessor); - } -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - // Set selection_flags for out-of-bounds items - if ((IS_LAST_TILE) && (Size(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items)) - selection_flags[ITEM] = 1; - } - - sync_threadblock(); - - - Size num_tile_selections = 0; - Size num_selections = 0; - Size num_selections_prefix = 0; - if (IS_FIRST_TILE) - { - BlockScan(temp_storage.scan_storage.scan) - .ExclusiveSum(selection_flags, - selection_idx, - num_tile_selections); - - if (threadIdx.x == 0) - { - // Update tile status if this is not the last tile - if (!IS_LAST_TILE) - tile_state.SetInclusive(0, num_tile_selections); - } - - // Do not count any out-of-bounds selections - if (IS_LAST_TILE) - { - int num_discount = ITEMS_PER_TILE - num_tile_items; - num_tile_selections -= num_discount; - } - num_selections = num_tile_selections; - } - else - { - TilePrefixCallback prefix_cb(tile_state, - temp_storage.scan_storage.prefix, - cub::Sum(), - tile_idx); - BlockScan(temp_storage.scan_storage.scan) - .ExclusiveSum(selection_flags, - selection_idx, - prefix_cb); - - num_selections = prefix_cb.GetInclusivePrefix(); - num_tile_selections = prefix_cb.GetBlockAggregate(); - num_selections_prefix = prefix_cb.GetExclusivePrefix(); - - if (IS_LAST_TILE) - { - int num_discount = ITEMS_PER_TILE - num_tile_items; - num_tile_selections -= num_discount; - num_selections -= num_discount; - } - } - - sync_threadblock(); - - scatter(key_tag(), - keys_out, - keys, - selection_flags, - selection_idx, - num_tile_items, - num_tile_selections, - num_selections_prefix, - num_selections); - - sync_threadblock(); - - 
scatter(value_tag(), - values_out, - values, - selection_flags, - selection_idx, - num_tile_items, - num_tile_selections, - num_selections_prefix, - num_selections); - - return num_selections; - } - - - template - Size THRUST_DEVICE_FUNCTION - consume_tile(int num_tile_items, - int tile_idx, - Size tile_base) - { - if (tile_idx == 0) - { - return consume_tile_impl(num_tile_items, - tile_idx, - tile_base); - } - else - { - return consume_tile_impl(num_tile_items, - tile_idx, - tile_base); - } - } - - //--------------------------------------------------------------------- - // Constructor - //--------------------------------------------------------------------- - - THRUST_DEVICE_FUNCTION - impl(TempStorage & temp_storage_, - ScanTileState & tile_state_, - KeyLoadIt keys_in_, - ValLoadIt values_in_, - KeyOutputIt keys_out_, - ValOutputIt values_out_, - BinaryPred binary_pred_, - Size num_items_, - int num_tiles, - NumSelectedOutIt num_selected_out) - // filed ctors - : temp_storage(temp_storage_), - tile_state(tile_state_), - keys_in(keys_in_), - values_in(values_in_), - keys_out(keys_out_), - values_out(values_out_), - predicate(binary_pred_), - num_items(num_items_) - { - int tile_idx = blockIdx.x; - Size tile_base = tile_idx * ITEMS_PER_TILE; - - if (tile_idx < num_tiles - 1) - { - consume_tile(ITEMS_PER_TILE, - tile_idx, - tile_base); - } - else - { - int num_remaining = static_cast(num_items - tile_base); - Size num_selections = consume_tile(num_remaining, - tile_idx, - tile_base); - if (threadIdx.x == 0) - { - *num_selected_out = num_selections; - } - } - } - }; // struct impl - - //--------------------------------------------------------------------- - // Agent entry point - //--------------------------------------------------------------------- - - THRUST_AGENT_ENTRY(KeyInputIt keys_in, - ValInputIt values_in, - KeyOutputIt keys_out, - ValOutputIt values_out, - BinaryPred binary_pred, - NumSelectedOutIt num_selected_out, - Size num_items, - ScanTileState tile_state, - int num_tiles, - char * shmem) - { - TempStorage &storage = *reinterpret_cast(shmem); - - impl(storage, - tile_state, - core::make_load_iterator(ptx_plan(), keys_in), - core::make_load_iterator(ptx_plan(), values_in), - keys_out, - values_out, - binary_pred, - num_items, - num_tiles, - num_selected_out); - } - }; // struct UniqueByKeyAgent +_CCCL_HOST_DEVICE thrust::pair unique_by_key_copy( + const thrust::detail::execution_policy_base& exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_result, + OutputIterator2 values_result); + +namespace cuda_cub +{ +namespace detail +{ - template - struct InitAgent +template +struct DispatchUniqueByKey +{ + static cudaError_t THRUST_RUNTIME_FUNCTION dispatch( + execution_policy& policy, + void* d_temp_storage, + size_t& temp_storage_bytes, + KeyInputIt keys_in, + ValInputIt values_in, + KeyOutputIt keys_out, + ValOutputIt values_out, + OffsetT num_items, + BinaryPred binary_pred, + pair& result_end) { - template - struct PtxPlan : PtxPolicy<128> {}; - - typedef core::specialize_plan ptx_plan; + cudaError_t status = cudaSuccess; + cudaStream_t stream = cuda_cub::stream(policy); + size_t allocation_sizes[2] = {0, sizeof(OffsetT)}; + void* allocations[2] = {nullptr, nullptr}; + + // Query algorithm memory requirements + status = cub::DeviceSelect::UniqueByKey( + nullptr, + allocation_sizes[0], + keys_in, + values_in, + keys_out, + values_out, + static_cast(nullptr), + num_items, + stream); + CUDA_CUB_RET_IF_FAIL(status); 
- //--------------------------------------------------------------------- - // Agent entry point - //--------------------------------------------------------------------- + status = cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); + CUDA_CUB_RET_IF_FAIL(status); - THRUST_AGENT_ENTRY(ScanTileState tile_state, - Size num_tiles, - NumSelectedIt num_selected_out, - char * /*shmem*/) + // Return if we're only querying temporary storage requirements + if (d_temp_storage == nullptr) { - tile_state.InitializeStatus(num_tiles); - if (blockIdx.x == 0 && threadIdx.x == 0) - *num_selected_out = 0; + return status; } - }; // struct InitAgent - - - template - static cudaError_t THRUST_RUNTIME_FUNCTION - doit_step(void * d_temp_storage, - size_t & temp_storage_bytes, - KeyInputIt keys_in, - ValInputIt values_in, - KeyOutputIt keys_out, - ValOutputIt values_out, - BinaryPred binary_pred, - NumSelectedOutIt num_selected_out, - Size num_items, - cudaStream_t stream) - { - using core::AgentLauncher; - using core::AgentPlan; - using core::get_agent_plan; - - typedef AgentLauncher< - UniqueByKeyAgent > - unique_agent; - - typedef typename unique_agent::ScanTileState ScanTileState; - - typedef AgentLauncher< - InitAgent > - init_agent; - - using core::get_plan; - typename get_plan::type init_plan = init_agent::get_plan(); - typename get_plan::type unique_plan = unique_agent::get_plan(stream); - - - int tile_size = unique_plan.items_per_tile; - size_t num_tiles = cub::DivideAndRoundUp(num_items, tile_size); - - size_t vshmem_size = core::vshmem_size(unique_plan.shared_memory_size, - num_tiles); - - cudaError_t status = cudaSuccess; - size_t allocation_sizes[2] = {0, vshmem_size}; - status = ScanTileState::AllocationSize(static_cast(num_tiles), allocation_sizes[0]); - CUDA_CUB_RET_IF_FAIL(status); - - void *allocations[2] = {NULL, NULL}; - // - status = cub::AliasTemporaries(d_temp_storage, - temp_storage_bytes, - allocations, - allocation_sizes); - CUDA_CUB_RET_IF_FAIL(status); - - if (d_temp_storage == NULL) + // Return for empty problems + if (num_items == 0) { + result_end = thrust::make_pair(keys_out, values_out); return status; } - ScanTileState tile_status; - status = tile_status.Init(static_cast(num_tiles), allocations[0], allocation_sizes[0]); + // Memory allocation for the number of selected output items + OffsetT* d_num_selected_out = thrust::detail::aligned_reinterpret_cast(allocations[1]); + + // Run algorithm + status = cub::DeviceSelect::UniqueByKey( + allocations[0], + allocation_sizes[0], + keys_in, + values_in, + keys_out, + values_out, + d_num_selected_out, + num_items, + binary_pred, + stream); CUDA_CUB_RET_IF_FAIL(status); - num_tiles = max(1,num_tiles); - init_agent ia(init_plan, num_tiles, stream, "unique_by_key::init_agent"); - ia.launch(tile_status, num_tiles, num_selected_out); - CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError()); - - if (num_items == 0) { return status; } - - char *vshmem_ptr = vshmem_size > 0 ? 
(char *)allocations[1] : NULL; - - unique_agent ua(unique_plan, num_items, stream, vshmem_ptr, "unique_by_key::unique_agent"); - ua.launch(keys_in, - values_in, - keys_out, - values_out, - binary_pred, - num_selected_out, - num_items, - tile_status, - num_tiles); - CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError()); - return status; - } - - template - THRUST_RUNTIME_FUNCTION - pair - unique_by_key(execution_policy& policy, - KeyInputIt keys_first, - KeyInputIt keys_last, - ValInputIt values_first, - KeyOutputIt keys_result, - ValOutputIt values_result, - BinaryPred binary_pred) - { - - typedef int size_type; - - size_type num_items - = static_cast(thrust::distance(keys_first, keys_last)); - - size_t temp_storage_bytes = 0; - cudaStream_t stream = cuda_cub::stream(policy); - - cudaError_t status; - status = __unique_by_key::doit_step(NULL, - temp_storage_bytes, - keys_first, - values_first, - keys_result, - values_result, - binary_pred, - reinterpret_cast(NULL), - num_items, - stream); - cuda_cub::throw_on_error(status, "unique_by_key: failed on 1st step"); - - size_t allocation_sizes[2] = {sizeof(size_type), temp_storage_bytes}; - void * allocations[2] = {NULL, NULL}; - - size_t storage_size = 0; - status = core::alias_storage(NULL, - storage_size, - allocations, - allocation_sizes); - cuda_cub::throw_on_error(status, "unique_by_key failed on 1st alias_storage"); - - // Allocate temporary storage. - thrust::detail::temporary_array - tmp(policy, storage_size); - void *ptr = static_cast(tmp.data().get()); - - status = core::alias_storage(ptr, - storage_size, - allocations, - allocation_sizes); - cuda_cub::throw_on_error(status, "unique_by_key failed on 2nd alias_storage"); - - size_type* d_num_selected_out - = thrust::detail::aligned_reinterpret_cast(allocations[0]); - - status = __unique_by_key::doit_step(allocations[1], - temp_storage_bytes, - keys_first, - values_first, - keys_result, - values_result, - binary_pred, - d_num_selected_out, - num_items, - stream); - cuda_cub::throw_on_error(status, "unique_by_key: failed on 2nd step"); - + // Get number of selected items status = cuda_cub::synchronize(policy); - cuda_cub::throw_on_error(status, "unique_by_key: failed to synchronize"); - - size_type num_selected = get_value(policy, d_num_selected_out); + CUDA_CUB_RET_IF_FAIL(status); + OffsetT num_selected = get_value(policy, d_num_selected_out); - return thrust::make_pair( - keys_result + num_selected, - values_result + num_selected - ); + result_end = thrust::make_pair(keys_out + num_selected, values_out + num_selected); + return status; } +}; + +template +THRUST_RUNTIME_FUNCTION pair unique_by_key( + execution_policy& policy, + KeyInputIt keys_first, + KeyInputIt keys_last, + ValInputIt values_first, + KeyOutputIt keys_result, + ValOutputIt values_result, + BinaryPred binary_pred) +{ + using size_type = typename iterator_traits::difference_type; + + size_type num_items = static_cast(thrust::distance(keys_first, keys_last)); + pair result_end{}; + cudaError_t status = cudaSuccess; + size_t temp_storage_bytes = 0; + + // 32-bit offset-type dispatch + using dispatch32_t = + DispatchUniqueByKey; + + // 64-bit offset-type dispatch + using dispatch64_t = + DispatchUniqueByKey; + + // Query temporary storage requirements + THRUST_INDEX_TYPE_DISPATCH2( + status, + dispatch32_t::dispatch, + dispatch64_t::dispatch, + num_items, + (policy, + nullptr, + temp_storage_bytes, + keys_first, + values_first, + keys_result, + values_result, + num_items_fixed, + binary_pred, + result_end)); + 
cuda_cub::throw_on_error(status, "unique_by_key: failed on 1st step"); + + // Allocate temporary storage. + thrust::detail::temporary_array tmp(policy, temp_storage_bytes); + void* temp_storage = static_cast(tmp.data().get()); + + // Run algorithm + THRUST_INDEX_TYPE_DISPATCH2( + status, + dispatch32_t::dispatch, + dispatch64_t::dispatch, + num_items, + (policy, + temp_storage, + temp_storage_bytes, + keys_first, + values_first, + keys_result, + values_result, + num_items_fixed, + binary_pred, + result_end)); + cuda_cub::throw_on_error(status, "unique_by_key: failed on 2nd step"); + + return result_end; +} -} // namespace __unique_by_key - +} // namespace detail //------------------------- // Thrust API entry points //------------------------- - - _CCCL_EXEC_CHECK_DISABLE -template -pair _CCCL_HOST_DEVICE -unique_by_key_copy(execution_policy &policy, - KeyInputIt keys_first, - KeyInputIt keys_last, - ValInputIt values_first, - KeyOutputIt keys_result, - ValOutputIt values_result, - BinaryPred binary_pred) +template +pair _CCCL_HOST_DEVICE unique_by_key_copy( + execution_policy& policy, + KeyInputIt keys_first, + KeyInputIt keys_last, + ValInputIt values_first, + KeyOutputIt keys_result, + ValOutputIt values_result, + BinaryPred binary_pred) { auto ret = thrust::make_pair(keys_result, values_result); THRUST_CDP_DISPATCH( - (ret = __unique_by_key::unique_by_key(policy, - keys_first, - keys_last, - values_first, - keys_result, - values_result, - binary_pred);), - (ret = thrust::unique_by_key_copy(cvt_to_seq(derived_cast(policy)), - keys_first, - keys_last, - values_first, - keys_result, - values_result, - binary_pred);)); + (ret = detail::unique_by_key(policy, keys_first, keys_last, values_first, keys_result, values_result, binary_pred);), + (ret = thrust::unique_by_key_copy( + cvt_to_seq(derived_cast(policy)), keys_first, keys_last, values_first, keys_result, values_result, binary_pred);)); return ret; } -template -pair _CCCL_HOST_DEVICE -unique_by_key_copy(execution_policy &policy, - KeyInputIt keys_first, - KeyInputIt keys_last, - ValInputIt values_first, - KeyOutputIt keys_result, - ValOutputIt values_result) +template +pair _CCCL_HOST_DEVICE unique_by_key_copy( + execution_policy& policy, + KeyInputIt keys_first, + KeyInputIt keys_last, + ValInputIt values_first, + KeyOutputIt keys_result, + ValOutputIt values_result) { typedef typename iterator_traits::value_type key_type; - return cuda_cub::unique_by_key_copy(policy, - keys_first, - keys_last, - values_first, - keys_result, - values_result, - equal_to()); + return cuda_cub::unique_by_key_copy( + policy, keys_first, keys_last, values_first, keys_result, values_result, equal_to()); } -template -pair _CCCL_HOST_DEVICE -unique_by_key(execution_policy &policy, - KeyInputIt keys_first, - KeyInputIt keys_last, - ValInputIt values_first, - BinaryPred binary_pred) +template +pair _CCCL_HOST_DEVICE unique_by_key( + execution_policy& policy, + KeyInputIt keys_first, + KeyInputIt keys_last, + ValInputIt values_first, + BinaryPred binary_pred) { auto ret = thrust::make_pair(keys_first, values_first); THRUST_CDP_DISPATCH( - (ret = cuda_cub::unique_by_key_copy(policy, - keys_first, - keys_last, - values_first, - keys_first, - values_first, - binary_pred);), - (ret = thrust::unique_by_key(cvt_to_seq(derived_cast(policy)), - keys_first, - keys_last, - values_first, - binary_pred);)); + (ret = cuda_cub::unique_by_key_copy( + policy, keys_first, keys_last, values_first, keys_first, values_first, binary_pred);), + (ret = 
thrust::unique_by_key(cvt_to_seq(derived_cast(policy)), keys_first, keys_last, values_first, binary_pred);));
   return ret;
 }
 
-template <class Derived,
-          class KeyInputIt,
-          class ValInputIt>
+template <class Derived, class KeyInputIt, class ValInputIt>
 pair<KeyInputIt, ValInputIt> _CCCL_HOST_DEVICE
-unique_by_key(execution_policy<Derived> &policy,
-              KeyInputIt keys_first,
-              KeyInputIt keys_last,
-              ValInputIt values_first)
+unique_by_key(execution_policy<Derived>& policy, KeyInputIt keys_first, KeyInputIt keys_last, ValInputIt values_first)
 {
   typedef typename iterator_traits<KeyInputIt>::value_type key_type;
-  return cuda_cub::unique_by_key(policy,
-                                 keys_first,
-                                 keys_last,
-                                 values_first,
-                                 equal_to<key_type>());
+  return cuda_cub::unique_by_key(policy, keys_first, keys_last, values_first, equal_to<key_type>());
 }
-
-
-}    // namespace cuda_cub
+} // namespace cuda_cub
 
 THRUST_NAMESPACE_END
 
-#include <thrust/memory.h>
-#include <thrust/unique.h>
+# include <thrust/memory.h>
+# include <thrust/unique.h>
 
 #endif
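
For reference, below is a minimal usage sketch (not part of the patch) of the overload this change introduces: cub::DeviceSelect::UniqueByKey called with a user-provided equality operator and a 64-bit item count, which routes dispatch through the 64-bit offset type. The modular_equality_op functor, the pointer names, and the run_unique_by_key() wrapper are illustrative assumptions only; error handling is reduced to early returns.

#include <cstddef>

#include <cuda_runtime.h>

#include <cub/device/device_select.cuh>

// Keys whose integer division by `mod` matches are treated as equal,
// so only the first key/value pair of each "bucket" run survives.
struct modular_equality_op
{
  int mod;
  __host__ __device__ bool operator()(int lhs, int rhs) const
  {
    return (lhs / mod) == (rhs / mod);
  }
};

cudaError_t run_unique_by_key(
  int* d_keys_in,
  long long* d_values_in,
  int* d_keys_out,
  long long* d_values_out,
  long long* d_num_selected_out,
  std::size_t num_items, // 64-bit count selects the 64-bit offset path
  cudaStream_t stream)
{
  void* d_temp_storage          = nullptr;
  std::size_t temp_storage_bytes = 0;

  // First call: query temporary storage requirements.
  cudaError_t status = cub::DeviceSelect::UniqueByKey(
    d_temp_storage, temp_storage_bytes,
    d_keys_in, d_values_in, d_keys_out, d_values_out,
    d_num_selected_out, num_items, modular_equality_op{8}, stream);
  if (status != cudaSuccess) { return status; }

  status = cudaMalloc(&d_temp_storage, temp_storage_bytes);
  if (status != cudaSuccess) { return status; }

  // Second call: run the selection with the custom equality operator.
  status = cub::DeviceSelect::UniqueByKey(
    d_temp_storage, temp_storage_bytes,
    d_keys_in, d_values_in, d_keys_out, d_values_out,
    d_num_selected_out, num_items, modular_equality_op{8}, stream);

  cudaFree(d_temp_storage);
  return status;
}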