NVIDIA · bernhardmgruber · Jan 6, 2026 · Jan 11, 2026 · Jan 11, 2026 · Jan 12, 2026
@@ -114,7 +114,7 @@ struct {0}_proxy_t {{
 struct {0} {{
   using iterator_category = cuda::std::random_access_iterator_tag;
   using difference_type   = DIFF_T;
-  using value_type        = void;
+  using value_type        = VALUE_T;
   using pointer           = {0}_proxy_t*;
   using reference         = {0}_proxy_t;
   __device__ {0}_proxy_t operator*() const {{ return {{state}}; }}

@@ -1,11 +1,17 @@
 // SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3-Clause
 
-// Because CUB cannot inspect the transformation function, we cannot add any tunings based on the results of this
-// benchmark. Its main use is to detect regressions.
-
+// %RANGE% TUNE_BIF_BIAS bif -16:16:4
+// %RANGE% TUNE_ALGORITHM alg 0:4:1
 // %RANGE% TUNE_THREADS tpb 128:1024:128
-// %RANGE% TUNE_ALGORITHM alg 0:2:1
+
+// those parameters only apply if TUNE_ALGORITHM == 1 (vectorized)
+// %RANGE% TUNE_VEC_SIZE_POW2 vsp2 1:6:1
+// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1
+
+#if !TUNE_BASE && TUNE_ALGORITHM != 1 && (TUNE_VEC_SIZE_POW2 != 1 || TUNE_VECTORS_PER_THREAD != 1)
+#  error "Non-vectorized algorithms require vector size and vectors per thread to be 1 since they ignore the parameters"
+#endif // !TUNE_BASE && TUNE_ALGORITHM != 1 && (TUNE_VEC_SIZE_POW2 != 1 || TUNE_VECTORS_PER_THREAD != 1)
 
 #include "common.h"
 
@@ -25,9 +31,9 @@ using element_types =
 #endif
 
 // BabelStream uses 2^25, H200 can fit 2^31 int128s
-// 2^20 chars / 2^16 int128 saturate V100 (min_bif =12 * SM count =80)
-// 2^21 chars / 2^17 int128 saturate A100 (min_bif =16 * SM count =108)
-// 2^23 chars / 2^19 int128 saturate H100/H200 HBM3 (min_bif =32or48 * SM count =132)
+// 2^20 chars / 2^16 int128 saturate V100 (min_bytes_in_flight =12 * SM count =80)
+// 2^21 chars / 2^17 int128 saturate A100 (min_bytes_in_flight =16 * SM count =108)
+// 2^23 chars / 2^19 int128 saturate H100/H200 HBM3 (min_bytes_in_flight =32or48 * SM count =132)
 // inline auto array_size_powers = std::vector<nvbench::int64_t>{28};
 inline auto array_size_powers = nvbench::range(16, 32, 4);
 

@@ -5,14 +5,16 @@
 
 // keep checks at the top so compilation of discarded variants fails really fast
 #include <cub/device/dispatch/dispatch_transform.cuh>
-#if !TUNE_BASE && TUNE_ALGORITHM == 2
+#if !TUNE_BASE
 #  if _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1
 #    error "When tuning, this benchmark does not support being compiled for multiple architectures"
 #  endif
-#  if (__CUDA_ARCH_LIST__) < 900
-#    error "Cannot compile algorithm 2 (ublkcp) below sm90"
-#  endif
-#endif
+#  if TUNE_ALGORITHM == 3
+#    if (__CUDA_ARCH_LIST__) < 900
+#      error "Cannot compile algorithm 3 (ublkcp) below sm90"
+#    endif
+#  endif // TUNE_ALGORITHM == 3
+#endif // !TUNE_BASE
 
 #include <cub/util_namespace.cuh>
 
@@ -23,34 +25,51 @@
 
 #include <nvbench_helper.cuh>
 
-template <typename RandomAccessIteratorOut, typename... RandomAccessIteratorsIn>
-#if TUNE_BASE
-using policy_hub_t =
-  cub::detail::transform::policy_hub</* stable address */ false,
-                                     /* dense output */ true,
-                                     ::cuda::std::tuple<RandomAccessIteratorsIn...>,
-                                     RandomAccessIteratorOut>;
-#else
-struct policy_hub_t
+#if !TUNE_BASE
+struct policy_selector
 {
-  struct max_policy : cub::ChainedPolicy<500, max_policy, max_policy>
+  _CCCL_API constexpr auto operator()(cuda::arch_id) const -> cub::detail::transform::transform_policy
   {
-    static constexpr int min_bif = cub::detail::transform::arch_to_min_bytes_in_flight(__CUDA_ARCH_LIST__);
+    const int min_bytes_in_flight =
+      cub::detail::transform::arch_to_min_bytes_in_flight(::cuda::arch_id{__CUDA_ARCH_LIST__ / 10}) + TUNE_BIF_BIAS;
 #  if TUNE_ALGORITHM == 0
-    static constexpr auto algorithm = cub::detail::transform::Algorithm::prefetch;
+    constexpr auto algorithm = cub::detail::transform::Algorithm::prefetch;
+    constexpr auto policy    = cub::detail::transform::prefetch_policy{
+      TUNE_THREADS
+#    ifdef TUNE_ITEMS_PER_THREAD_NO_INPUT
+      ,
+      TUNE_ITEMS_PER_THREAD_NO_INPUT
+#    endif // TUNE_ITEMS_PER_THREAD_NO_INPUT
+    };
+    return {min_bytes_in_flight, algorithm, policy, {}, {}};
 #  elif TUNE_ALGORITHM == 1
-    static constexpr auto algorithm = cub::detail::transform::Algorithm::ublkcp;
-#  else
+    constexpr auto algorithm = cub::detail::transform::Algorithm::vectorized;
+    constexpr auto policy    = cub::detail::transform::vectorized_policy{
+      TUNE_THREADS,
+      (1 << TUNE_VEC_SIZE_POW2) * TUNE_VECTORS_PER_THREAD,
+      (1 << TUNE_VEC_SIZE_POW2)
+#    ifdef TUNE_ITEMS_PER_THREAD_NO_INPUT
+        ,
+      TUNE_ITEMS_PER_THREAD_NO_INPUT
+#    endif // TUNE_ITEMS_PER_THREAD_NO_INPUT
+    };
+    return {min_bytes_in_flight, algorithm, {}, policy, {}};
+#  elif TUNE_ALGORITHM == 2
+    constexpr auto algorithm = cub::detail::transform::Algorithm::memcpy_async;
+    constexpr auto policy =
+      cub::detail::transform::async_copy_policy{TUNE_THREADS, cub::detail::transform::ldgsts_size_and_align};
+    return {min_bytes_in_flight, algorithm, {}, {}, policy};
+#  elif TUNE_ALGORITHM == 3
+    constexpr auto algorithm = cub::detail::transform::Algorithm::ublkcp;
+    constexpr auto policy    = cub::detail::transform::async_copy_policy{
+      TUNE_THREADS, cub::detail::transform::bulk_copy_alignment(::cuda::arch_id{__CUDA_ARCH_LIST__ / 10})};
+    return {min_bytes_in_flight, algorithm, {}, {}, policy};
+#  else // TUNE_ALGORITHM
 #    error Policy hub does not yet implement the specified value for algorithm
-#  endif
-
-    using algo_policy =
-      ::cuda::std::_If<algorithm == cub::detail::transform::Algorithm::prefetch,
-                       cub::detail::transform::prefetch_policy_t<TUNE_THREADS>,
-                       cub::detail::transform::async_copy_policy_t<TUNE_THREADS, __CUDA_ARCH_LIST__ == 900 ? 128 : 16>>;
-  };
+#  endif // TUNE_ALGORITHM
+  }
 };
-#endif
+#endif // !TUNE_BASE
 
 template <typename OffsetT, typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename TransformOp>
 void bench_transform(nvbench::state& state,
@@ -60,15 +79,17 @@ void bench_transform(nvbench::state& state,
                      TransformOp transform_op)
 {
   state.exec(nvbench::exec_tag::gpu, [&](const nvbench::launch& launch) {
-    cub::detail::transform::dispatch_t<
-      cub::detail::transform::requires_stable_address::no,
-      OffsetT,
-      ::cuda::std::tuple<RandomAccessIteratorsIn...>,
-      RandomAccessIteratorOut,
-      cub::detail::transform::always_true_predicate,
-      TransformOp,
-      policy_hub_t<RandomAccessIteratorOut, RandomAccessIteratorsIn...>>::
-      dispatch(
-        inputs, output, num_items, cub::detail::transform::always_true_predicate{}, transform_op, launch.get_stream());
+    cub::detail::transform::dispatch<cub::detail::transform::requires_stable_address::no>(
+      inputs,
+      output,
+      num_items,
+      cub::detail::transform::always_true_predicate{},
+      transform_op,
+      launch.get_stream()
+#if !TUNE_BASE
+        ,
+      policy_selector{}
+#endif // !TUNE_BASE
+    );
   });
 }
@@ -1,11 +1,17 @@
 // SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3-Clause
 
-// Because CUB cannot inspect the transformation function, we cannot add any tunings based on the results of this
-// benchmark. Its main use is to detect regressions.
-
+// %RANGE% TUNE_BIF_BIAS bif -16:16:4
+// %RANGE% TUNE_ALGORITHM alg 0:4:1
 // %RANGE% TUNE_THREADS tpb 128:1024:128
-// %RANGE% TUNE_ALGORITHM alg 0:1:1
+
+// those parameters only apply if TUNE_ALGORITHM == 1 (vectorized)
+// %RANGE% TUNE_VEC_SIZE_POW2 vsp2 1:6:1
+// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1
+
+#if !TUNE_BASE && TUNE_ALGORITHM != 1 && (TUNE_VEC_SIZE_POW2 != 1 || TUNE_VECTORS_PER_THREAD != 1)
+#  error "Non-vectorized algorithms require vector size and vectors per thread to be 1 since they ignore the parameters"
+#endif // !TUNE_BASE && TUNE_ALGORITHM != 1 && (TUNE_VEC_SIZE_POW2 != 1 || TUNE_VECTORS_PER_THREAD != 1)
 
 #include "common.h"
 

@@ -1,11 +1,17 @@
 // SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3-Clause
 
-// Because CUB cannot inspect the transformation function, we cannot add any tunings based on the results of this
-// benchmark. Its main use is to detect regressions.
-
+// %RANGE% TUNE_BIF_BIAS bif -16:16:4
+// %RANGE% TUNE_ALGORITHM alg 0:4:1
 // %RANGE% TUNE_THREADS tpb 128:1024:128
-// %RANGE% TUNE_ALGORITHM alg 0:1:1
+
+// those parameters only apply if TUNE_ALGORITHM == 1 (vectorized)
+// %RANGE% TUNE_VEC_SIZE_POW2 vsp2 1:6:1
+// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1
+
+#if !TUNE_BASE && TUNE_ALGORITHM != 1 && (TUNE_VEC_SIZE_POW2 != 1 || TUNE_VECTORS_PER_THREAD != 1)
+#  error "Non-vectorized algorithms require vector size and vectors per thread to be 1 since they ignore the parameters"
+#endif // !TUNE_BASE && TUNE_ALGORITHM != 1 && (TUNE_VEC_SIZE_POW2 != 1 || TUNE_VECTORS_PER_THREAD != 1)
 
 #include "common.h"
 

@@ -1,11 +1,25 @@
 // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-// Because CUB cannot inspect the transformation function, we cannot add any tunings based on the results of this
-// benchmark. Its main use is to detect regressions.
-
+// %RANGE% TUNE_BIF_BIAS bif -16:16:4
+// for filling, we can only use the prefetch and the vectorized algorithm
+// %RANGE% TUNE_ALGORITHM alg 0:2:1
 // %RANGE% TUNE_THREADS tpb 128:1024:128
-// %RANGE% TUNE_ALGORITHM alg 0:3:1
+
+// this parameter only applies if TUNE_ALGORITHM == 0 (prefetch)
+// %RANGE% TUNE_ITEMS_PER_THREAD_NO_INPUT ipt 1:32:1
+
+// those parameters only apply if TUNE_ALGORITHM == 1 (vectorized)
+// %RANGE% TUNE_VEC_SIZE_POW2 vsp2 1:6:1
+// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1
+
+#if !TUNE_BASE && TUNE_ALGORITHM != 0 && (TUNE_ITEMS_PER_THREAD_NO_INPUT != 1)
+#  error "Non-prefetch algorithms require the no input items per thread to be 1 since they ignore the parameters"
+#endif // !TUNE_BASE && TUNE_ALGORITHM != 0 && (TUNE_ITEMS_PER_THREAD_NO_INPUT != 1)
+
+#if !TUNE_BASE && TUNE_ALGORITHM != 1 && (TUNE_VEC_SIZE_POW2 != 1 || TUNE_VECTORS_PER_THREAD != 1)
+#  error "Non-vectorized algorithms require vector size and vectors per thread to be 1 since they ignore the parameters"
+#endif // !TUNE_BASE && TUNE_ALGORITHM != 1 && (TUNE_VEC_SIZE_POW2 != 1 || TUNE_VECTORS_PER_THREAD != 1)
 
 #include "common.h"
 

@@ -1,6 +1,18 @@
 // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: BSD-3-Clause
 
+// %RANGE% TUNE_BIF_BIAS bif -16:16:4
+// %RANGE% TUNE_ALGORITHM alg 0:4:1
+// %RANGE% TUNE_THREADS tpb 128:1024:128
+
+// those parameters only apply if TUNE_ALGORITHM == 1 (vectorized)
+// %RANGE% TUNE_VEC_SIZE_POW2 vsp2 1:6:1
+// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1
+
+#if !TUNE_BASE && TUNE_ALGORITHM != 1 && (TUNE_VEC_SIZE_POW2 != 1 || TUNE_VECTORS_PER_THREAD != 1)
+#  error "Non-vectorized algorithms require vector size and vectors per thread to be 1 since they ignore the parameters"
+#endif // !TUNE_BASE && TUNE_ALGORITHM != 1 && (TUNE_VEC_SIZE_POW2 != 1 || TUNE_VECTORS_PER_THREAD != 1)
+
 #include "common.h"
 
 template <typename T>

@@ -1,11 +1,17 @@
 // SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3-Clause
 
-// Because CUB cannot inspect the transformation function, we cannot add any tunings based on the results of this
-// benchmark. Its main use is to detect regressions.
-
+// %RANGE% TUNE_BIF_BIAS bif -16:16:4
+// %RANGE% TUNE_ALGORITHM alg 0:4:1
 // %RANGE% TUNE_THREADS tpb 128:1024:128
-// %RANGE% TUNE_ALGORITHM alg 0:1:1
+
+// those parameters only apply if TUNE_ALGORITHM == 1 (vectorized)
+// %RANGE% TUNE_VEC_SIZE_POW2 vsp2 1:6:1
+// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1
+
+#if !TUNE_BASE && TUNE_ALGORITHM != 1 && (TUNE_VEC_SIZE_POW2 != 1 || TUNE_VECTORS_PER_THREAD != 1)
+#  error "Non-vectorized algorithms require vector size and vectors per thread to be 1 since they ignore the parameters"
+#endif // !TUNE_BASE && TUNE_ALGORITHM != 1 && (TUNE_VEC_SIZE_POW2 != 1 || TUNE_VECTORS_PER_THREAD != 1)
 
 #include "common.h"