20 changes: 10 additions & 10 deletions c/parallel/src/reduce.cu
@@ -223,7 +223,7 @@ CUresult cccl_device_reduce_build_ex(

const auto offset_t = cccl_type_enum_to_name(cccl_type_enum::CCCL_UINT64);

const auto cub_arch_policies = [&] {
const auto policy_sel = [&] {
using namespace cub::detail::reduce;

auto accum_type = accum_type::other;
@@ -251,15 +251,15 @@ CUresult cccl_device_reduce_build_ex(
}

const int offset_size = int{sizeof(OffsetT)};
return arch_policies{accum_type, operation_t, offset_size, static_cast<int>(accum_t.size)};
return policy_selector{accum_type, operation_t, offset_size, static_cast<int>(accum_t.size)};
}();

// TODO(bgruber): drop this if tuning policies become formattable
std::stringstream cub_arch_policies_str;
cub_arch_policies_str << cub_arch_policies(cuda::to_arch_id(cuda::compute_capability{cc_major, cc_minor}));
std::stringstream policy_sel_str;
policy_sel_str << policy_sel(cuda::to_arch_id(cuda::compute_capability{cc_major, cc_minor}));

auto policy_hub_expr =
std::format("cub::detail::reduce::arch_policies_from_types<{}, {}, {}>", accum_cpp, offset_t, op_name);
std::format("cub::detail::reduce::policy_selector_from_types<{}, {}, {}>", accum_cpp, offset_t, op_name);

std::string final_src = std::format(
R"XXX(
@@ -284,7 +284,7 @@ static_assert(device_reduce_policy()(::cuda::arch_id{{CUB_PTX_ARCH / 10}}) == {7
output_iterator_src, // 4
op_src, // 5
policy_hub_expr, // 6
cub_arch_policies_str.view()); // 7
policy_sel_str.view()); // 7

#if false // CCCL_DEBUGGING_SWITCH
fflush(stderr);
@@ -371,7 +371,7 @@ static_assert(device_reduce_policy()(::cuda::arch_id{{CUB_PTX_ARCH / 10}}) == {7
build->cubin_size = result.size;
build->accumulator_size = accum_t.size;
build->determinism = determinism;
build->runtime_policy = new cub::detail::reduce::arch_policies{cub_arch_policies};
build->runtime_policy = new cub::detail::reduce::policy_selector{policy_sel};
}
catch (const std::exception& exc)
{
@@ -423,7 +423,7 @@ CUresult cccl_device_reduce(
indirect_arg_t{init},
stream,
::cuda::std::identity{},
*static_cast<cub::detail::reduce::arch_policies*>(build.runtime_policy),
*static_cast<cub::detail::reduce::policy_selector*>(build.runtime_policy),
reduce::reduce_kernel_source{build},
cub::detail::CudaDriverLauncherFactory{cu_device, build.cc});

@@ -478,7 +478,7 @@ CUresult cccl_device_reduce_nondeterministic(
indirect_arg_t{init},
stream,
::cuda::std::identity{},
*static_cast<cub::detail::reduce::arch_policies*>(build.runtime_policy),
*static_cast<cub::detail::reduce::policy_selector*>(build.runtime_policy),
reduce::reduce_kernel_source{build},
cub::detail::CudaDriverLauncherFactory{cu_device, build.cc});

@@ -512,7 +512,7 @@ CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* build_ptr

using namespace cub::detail::reduce;
std::unique_ptr<char[]> cubin(static_cast<char*>(build_ptr->cubin));
std::unique_ptr<arch_policies> policy(static_cast<arch_policies*>(build_ptr->runtime_policy));
std::unique_ptr<policy_selector> policy(static_cast<policy_selector*>(build_ptr->runtime_policy));
check(cuLibraryUnload(build_ptr->library));
}
catch (const std::exception& exc)
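Note on the rename in this file: the runtime policy object is a stateless callable mapping a cuda::arch_id to a cub::reduce_policy, which is why it can be stored type-erased in build->runtime_policy and recovered with a static_cast. A minimal sketch of such a selector, modeled on the benchmark structs further down in this diff (my_policy_selector is hypothetical, and the four-member aggregate order is assumed from the `return {{}, {}, {}, policy};` statement below):

// Sketch only; member order and constructor arguments are assumed from
// the benchmark code in this diff, not from the CUB headers themselves.
struct my_policy_selector
{
  // Must be stateless: dispatch_reduce.cuh static_asserts is_empty_v so
  // the type alone can be passed to the kernel.
  _CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_policy
  {
    // 256 threads per block, 16 items per thread, 4-way vectorized loads
    const auto policy = cub::agent_reduce_policy{
      256, 16, 4, cub::BLOCK_REDUCE_WARP_REDUCTIONS, cub::LOAD_DEFAULT};
    return {policy, policy, policy, policy}; // same tuning for every kernel variant
  }
};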
3 changes: 3 additions & 0 deletions cmake/CCCLBuildCompilerTargets.cmake
@@ -154,6 +154,9 @@ function(cccl_build_compiler_targets)
# See https://github.com/microsoft/STL/issues/696
append_option_if_available("/wd4494" cxx_compile_options)

# Get error messages with a caret indicating the exact error location
append_option_if_available("/diagnostics:caret" cxx_compile_options)

if (MSVC_TOOLSET_VERSION LESS 143)
# winbase.h(9572): warning C5105: macro expansion producing 'defined' has undefined behavior
append_option_if_available("/wd5105" cxx_compile_options)
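/diagnostics:caret is a standard MSVC flag (its siblings are /diagnostics:column and /diagnostics:classic); append_option_if_available keeps it from reaching compilers that would reject it. A stand-alone sketch of the same guard in plain CMake, with a hypothetical target name:

# Apply the caret diagnostics flag only when compiling C++ with MSVC.
target_compile_options(my_target PRIVATE
  $<$<CXX_COMPILER_ID:MSVC>:/diagnostics:caret>)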
10 changes: 5 additions & 5 deletions cub/benchmarks/bench/reduce/base.cuh
@@ -11,9 +11,9 @@
#include <nvbench_helper.cuh>

#if !TUNE_BASE
struct arch_policies
struct policy_selector
{
_CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_arch_policy
_CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_policy
{
const auto [items, threads] = cub::detail::scale_mem_bound(TUNE_THREADS_PER_BLOCK, TUNE_ITEMS_PER_THREAD);
const auto policy = cub::agent_reduce_policy{
@@ -60,7 +60,7 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
::cuda::std::execution::prop{
::cuda::execution::__get_tuning_t,
::cuda::std::execution::env{
::cuda::std::execution::prop{::cub::detail::reduce::get_tuning_query_t, arch_policies{}}}}
::cuda::std::execution::prop{::cub::detail::reduce::get_tuning_query_t, policy_selector{}}}}
# endif
};
static_assert(::cuda::std::execution::__queryable_with<decltype(env), ::cuda::mr::__get_memory_resource_t>);
@@ -84,7 +84,7 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
transform_op
#if !TUNE_BASE
,
arch_policies{}
policy_selector{}
#endif
);

@@ -104,7 +104,7 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
transform_op
#if !TUNE_BASE
,
arch_policies{}
policy_selector{}
#endif
);
});
10 changes: 5 additions & 5 deletions cub/benchmarks/bench/reduce/nondeterministic.cu
@@ -13,9 +13,9 @@
// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1

#if !TUNE_BASE
struct arch_policies
struct policy_selector
{
_CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_arch_policy
_CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_policy
{
const auto [items, threads] = cub::detail::scale_mem_bound(TUNE_THREADS_PER_BLOCK, TUNE_ITEMS_PER_THREAD);
const auto policy = cub::agent_reduce_policy{
@@ -24,7 +24,7 @@ struct arch_policies
1 << TUNE_ITEMS_PER_VEC_LOAD_POW2,
cub::BLOCK_REDUCE_WARP_REDUCTIONS_NONDETERMINISTIC,
cub::LOAD_DEFAULT};
return {{}, {}, {}, policy}; // Only reduce_nondeterministic_policy is used
return {{}, {}, {}, policy}; // Only reduce_nondeterministic is used
}
};
#endif // !TUNE_BASE
@@ -66,7 +66,7 @@ void nondeterministic_sum(nvbench::state& state, nvbench::type_list<T, OffsetT>)
transform_op
#if !TUNE_BASE
,
arch_policies{}
policy_selector{}
#endif
);

@@ -86,7 +86,7 @@ void nondeterministic_sum(nvbench::state& state, nvbench::type_list<T, OffsetT>)
transform_op
#if !TUNE_BASE
,
arch_policies{}
policy_selector{}
#endif
);
});
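The nondeterministic variant benchmarked above may combine partial results in a run-dependent order, which can change the answer for floating-point inputs because addition is not associative. A tiny host-side illustration (values chosen for effect, unrelated to this PR):

// 1.0f is absorbed when added to 1e20f first, so the two groupings differ:
static_assert((1e20f + -1e20f) + 1.0f == 1.0f);
static_assert(1e20f + (-1e20f + 1.0f) == 0.0f);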
8 changes: 4 additions & 4 deletions cub/benchmarks/bench/transform_reduce/sum.cu
@@ -14,9 +14,9 @@
#endif

#if !TUNE_BASE
struct arch_policies
struct policy_selector
{
_CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_arch_policy
_CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_policy
{
const auto [items, threads] = cub::detail::scale_mem_bound(TUNE_THREADS_PER_BLOCK, TUNE_ITEMS_PER_THREAD);
const auto policy = cub::agent_reduce_policy{
@@ -133,7 +133,7 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
transform_op_t{}
# if !TUNE_BASE
,
arch_policies{}
policy_selector{}
# endif
);

@@ -153,7 +153,7 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
transform_op_t{}
# if !TUNE_BASE
,
arch_policies{}
policy_selector{}
# endif
);
});
4 changes: 2 additions & 2 deletions cub/cub/device/device_reduce.cuh
@@ -147,7 +147,7 @@ private:
using reduce_tuning_t = ::cuda::std::execution::__query_result_or_t<
TuningEnvT,
detail::reduce::get_tuning_query_t,
detail::reduce::arch_policies_from_types<accum_t, offset_t, ReductionOpT>>;
detail::reduce::policy_selector_from_types<accum_t, offset_t, ReductionOpT>>;

return detail::reduce::dispatch<accum_t>(
d_temp_storage,
@@ -221,7 +221,7 @@ private:
using reduce_tuning_t = ::cuda::std::execution::__query_result_or_t<
TuningEnvT,
detail::reduce::get_tuning_query_t,
detail::reduce::arch_policies_from_types<accum_t, offset_t, ReductionOpT>>;
detail::reduce::policy_selector_from_types<accum_t, offset_t, ReductionOpT>>;

return detail::reduce::dispatch_nondeterministic<accum_t>(
d_temp_storage,
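The __query_result_or_t alias above resolves get_tuning_query_t against the tuning environment and falls back to the type-derived default when no tuning is attached. A sketch of attaching a custom selector, following the benchmark env construction earlier in this diff (my_policy_selector is the hypothetical functor sketched above; the surrounding call plumbing is omitted):

// Sketch only: wrap a selector so the dispatch layer's query finds it.
auto tuning_env = ::cuda::std::execution::env{
  ::cuda::std::execution::prop{::cub::detail::reduce::get_tuning_query_t, my_policy_selector{}}};
// Without such a prop, reduce_tuning_t falls back to
// detail::reduce::policy_selector_from_types<accum_t, offset_t, ReductionOpT>.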
58 changes: 29 additions & 29 deletions cub/cub/device/dispatch/dispatch_reduce.cuh
@@ -45,7 +45,7 @@ CUB_NAMESPACE_BEGIN

namespace detail::reduce
{
template <typename ArchPolicies,
template <typename PolicySelector,
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetT,
@@ -55,12 +55,12 @@ template <typename ArchPolicies,
typename TransformOpT>
struct DeviceReduceKernelSource
{
// ArchPolicies must be stateless, so we can pass the type to the kernel
static_assert(::cuda::std::is_empty_v<ArchPolicies>);
// PolicySelector must be stateless, so we can pass the type to the kernel
static_assert(::cuda::std::is_empty_v<PolicySelector>);

CUB_DEFINE_KERNEL_GETTER(
SingleTileKernel,
DeviceReduceSingleTileKernel<ArchPolicies,
DeviceReduceSingleTileKernel<PolicySelector,
InputIteratorT,
OutputIteratorT,
OffsetT,
@@ -70,11 +70,11 @@ struct DeviceReduceKernelSource
TransformOpT>)

CUB_DEFINE_KERNEL_GETTER(
ReductionKernel, DeviceReduceKernel<ArchPolicies, InputIteratorT, OffsetT, ReductionOpT, AccumT, TransformOpT>)
ReductionKernel, DeviceReduceKernel<PolicySelector, InputIteratorT, OffsetT, ReductionOpT, AccumT, TransformOpT>)

CUB_DEFINE_KERNEL_GETTER(
SingleTileSecondKernel,
DeviceReduceSingleTileKernel<ArchPolicies,
DeviceReduceSingleTileKernel<PolicySelector,
AccumT*,
OutputIteratorT,
int, // Always used with int offsets
@@ -90,15 +90,15 @@

// TODO(bgruber): remove in CCCL 4.0
template <typename PolicyHub>
struct arch_policies_from_hub
struct policy_selector_from_hub
{
// this is only called in device code, so we can ignore the arch parameter
_CCCL_DEVICE_API constexpr auto operator()(::cuda::arch_id /*arch*/) const -> reduce_arch_policy
_CCCL_DEVICE_API constexpr auto operator()(::cuda::arch_id /*arch*/) const -> reduce_policy
{
using ap = typename PolicyHub::MaxPolicy::ActivePolicy;
using ap_reduce = typename ap::ReducePolicy;
using ap_single_tile = typename ap::SingleTilePolicy;
return reduce_arch_policy{
return reduce_policy{
agent_reduce_policy{
ap_reduce::BLOCK_THREADS,
ap_reduce::ITEMS_PER_THREAD,
@@ -153,7 +153,7 @@ template <typename InputIteratorT,
typename TransformOpT = ::cuda::std::identity,
typename PolicyHub = detail::reduce::policy_hub<AccumT, OffsetT, ReductionOpT>,
typename KernelSource = detail::reduce::DeviceReduceKernelSource<
detail::reduce::arch_policies_from_hub<PolicyHub>,
detail::reduce::policy_selector_from_hub<PolicyHub>,
InputIteratorT,
OutputIteratorT,
OffsetT,
@@ -594,7 +594,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_passes(
InitT init,
cudaStream_t stream,
TransformOpT transform_op,
reduce_arch_policy active_policy,
reduce_policy active_policy,
KernelSource kernel_source,
KernelLauncherFactory launcher_factory)
{
@@ -606,10 +606,10 @@
}

// Init regular kernel configuration
const auto tile_size = active_policy.reduce_policy.block_threads * active_policy.reduce_policy.items_per_thread;
const auto tile_size = active_policy.reduce.block_threads * active_policy.reduce.items_per_thread;
int sm_occupancy;
if (const auto error = CubDebug(launcher_factory.MaxSmOccupancy(
sm_occupancy, kernel_source.ReductionKernel(), active_policy.reduce_policy.block_threads)))
sm_occupancy, kernel_source.ReductionKernel(), active_policy.reduce.block_threads)))
{
return error;
}
@@ -652,14 +652,14 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_passes(
_CubLog("Invoking DeviceReduceKernel<<<%lu, %d, 0, %lld>>>(), %d items "
"per thread, %d SM occupancy\n",
(unsigned long) reduce_grid_size,
active_policy.reduce_policy.block_threads,
active_policy.reduce.block_threads,
(long long) stream,
active_policy.reduce_policy.items_per_thread,
active_policy.reduce.items_per_thread,
sm_occupancy);
#endif // CUB_DEBUG_LOG

// Invoke DeviceReduceKernel
launcher_factory(reduce_grid_size, active_policy.reduce_policy.block_threads, 0, stream)
launcher_factory(reduce_grid_size, active_policy.reduce.block_threads, 0, stream)
.doit(kernel_source.ReductionKernel(), d_in, d_block_reductions, num_items, even_share, reduction_op, transform_op);

// Check for failure to launch
@@ -678,13 +678,13 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_passes(
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), "
"%d items per thread\n",
active_policy.single_tile_policy.block_threads,
active_policy.single_tile.block_threads,
(long long) stream,
active_policy.single_tile_policy.items_per_thread);
active_policy.single_tile.items_per_thread);
#endif // CUB_DEBUG_LOG

// Invoke DeviceReduceSingleTileKernel
launcher_factory(1, active_policy.single_tile_policy.block_threads, 0, stream)
launcher_factory(1, active_policy.single_tile.block_threads, 0, stream)
.doit(kernel_source.SingleTileSecondKernel(),
d_block_reductions,
d_out,
@@ -736,12 +736,12 @@ template <
typename TransformOpT = ::cuda::std::identity,
typename AccumT =
decltype(select_accum_t<InputIteratorT, InitT, ReductionOpT, TransformOpT>(static_cast<OverrideAccumT*>(nullptr))),
typename ArchPolicies = arch_policies_from_types<AccumT, OffsetT, ReductionOpT>,
typename PolicySelector = policy_selector_from_types<AccumT, OffsetT, ReductionOpT>,
typename KernelSource =
DeviceReduceKernelSource<ArchPolicies, InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, InitT, AccumT, TransformOpT>,
DeviceReduceKernelSource<PolicySelector, InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, InitT, AccumT, TransformOpT>,
typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY>
#if _CCCL_HAS_CONCEPTS()
requires reduce_policy_hub<ArchPolicies>
requires reduce_policy_selector<PolicySelector>
#endif // _CCCL_HAS_CONCEPTS()
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
void* d_temp_storage,
Expand All @@ -753,7 +753,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
InitT init,
cudaStream_t stream,
TransformOpT transform_op = {},
ArchPolicies arch_policies = {},
PolicySelector policy_selector = {},
KernelSource kernel_source = {},
KernelLauncherFactory launcher_factory = {})
{
@@ -764,16 +764,16 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
return error;
}

const reduce_arch_policy active_policy = arch_policies(arch_id);
const reduce_policy active_policy = policy_selector(arch_id);
#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
NV_IF_TARGET(NV_IS_HOST,
(std::stringstream ss; ss << active_policy;
_CubLog("Dispatching DeviceReduce to arch %d with tuning: %s\n", (int) arch_id, ss.str().c_str());))
#endif // !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)

// Check for small, single tile size
if (num_items <= static_cast<OffsetT>(
active_policy.single_tile_policy.block_threads * active_policy.single_tile_policy.items_per_thread))
if (num_items
<= static_cast<OffsetT>(active_policy.single_tile.block_threads * active_policy.single_tile.items_per_thread))
{
// Return if the caller is simply requesting the size of the storage allocation
if (d_temp_storage == nullptr)
@@ -786,13 +786,13 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), "
"%d items per thread\n",
active_policy.single_tile_policy.block_threads,
active_policy.single_tile.block_threads,
(long long) stream,
active_policy.single_tile_policy.items_per_thread);
active_policy.single_tile.items_per_thread);
#endif // CUB_DEBUG_LOG

// Invoke single_reduce_sweep_kernel
launcher_factory(1, active_policy.single_tile_policy.block_threads, 0, stream)
launcher_factory(1, active_policy.single_tile.block_threads, 0, stream)
.doit(kernel_source.SingleTileKernel(), d_in, d_out, num_items, reduction_op, init, transform_op);

// Check for failure to launch
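The single-tile cutoff that dispatch() compares against num_items is simply the product of the single-tile policy's two knobs. Worked through with assumed numbers (not taken from this PR):

// With 256 threads per block and 16 items per thread, one tile covers
// 4096 items, so inputs of at most 4096 elements skip the two-pass path
// and run DeviceReduceSingleTileKernel directly.
constexpr int block_threads      = 256;
constexpr int items_per_thread   = 16;
constexpr int single_tile_cutoff = block_threads * items_per_thread;
static_assert(single_tile_cutoff == 4096);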