From 8f191bc6682ac94d05597ae999a618a70390f118 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Wed, 7 Jan 2026 14:54:28 +0100
Subject: [PATCH 1/3] Rename new tuning API policies

arch policies -> policy selector
Drop _policy in members
reduce_arch_policy -> reduce_policy
Drop and adapt some comments
---
 c/parallel/src/reduce.cu                      | 20 +++----
 cub/benchmarks/bench/reduce/base.cuh          | 10 ++--
 .../bench/reduce/nondeterministic.cu          | 10 ++--
 cub/benchmarks/bench/transform_reduce/sum.cu  |  8 +--
 cub/cub/device/device_reduce.cuh              |  4 +-
 cub/cub/device/dispatch/dispatch_reduce.cuh   | 58 +++++++++----------
 .../dispatch_reduce_nondeterministic.cuh      | 38 ++++++------
 .../device/dispatch/kernels/kernel_reduce.cuh | 44 +++++++-------
 .../device/dispatch/tuning/tuning_reduce.cuh  | 49 +++++++---------
 cub/test/catch2_test_device_reduce_env.cu     | 14 ++---
 10 files changed, 125 insertions(+), 130 deletions(-)
diff --git a/c/parallel/src/reduce.cu b/c/parallel/src/reduce.cu
index 055455f982a..0a70c82893d 100644
--- a/c/parallel/src/reduce.cu
+++ b/c/parallel/src/reduce.cu
@@ -223,7 +223,7 @@ CUresult cccl_device_reduce_build_ex(
 
     const auto offset_t = cccl_type_enum_to_name(cccl_type_enum::CCCL_UINT64);
 
-    const auto cub_arch_policies = [&] {
+    const auto policy_sel = [&] {
       using namespace cub::detail::reduce;
 
       auto accum_type = accum_type::other;
@@ -251,15 +251,15 @@ CUresult cccl_device_reduce_build_ex(
       }
 
       const int offset_size = int{sizeof(OffsetT)};
-      return arch_policies{accum_type, operation_t, offset_size, static_cast<int>(accum_t.size)};
+      return policy_selector{accum_type, operation_t, offset_size, static_cast<int>(accum_t.size)};
     }();
 
     // TODO(bgruber): drop this if tuning policies become formattable
-    std::stringstream cub_arch_policies_str;
-    cub_arch_policies_str << cub_arch_policies(cuda::to_arch_id(cuda::compute_capability{cc_major, cc_minor}));
+    std::stringstream policy_sel_str;
+    policy_sel_str << policy_sel(cuda::to_arch_id(cuda::compute_capability{cc_major, cc_minor}));
 
     auto policy_hub_expr =
-      std::format("cub::detail::reduce::arch_policies_from_types<{}, {}, {}>", accum_cpp, offset_t, op_name);
+      std::format("cub::detail::reduce::policy_selector_from_types<{}, {}, {}>", accum_cpp, offset_t, op_name);
 
     std::string final_src = std::format(
       R"XXX(
@@ -284,7 +284,7 @@ static_assert(device_reduce_policy()(::cuda::arch_id{{CUB_PTX_ARCH / 10}}) == {7
       output_iterator_src, // 4
       op_src, // 5
       policy_hub_expr, // 6
-      cub_arch_policies_str.view()); // 7
+      policy_sel_str.view()); // 7
 
 #if false // CCCL_DEBUGGING_SWITCH
     fflush(stderr);
@@ -371,7 +371,7 @@ static_assert(device_reduce_policy()(::cuda::arch_id{{CUB_PTX_ARCH / 10}}) == {7
     build->cubin_size       = result.size;
     build->accumulator_size = accum_t.size;
     build->determinism      = determinism;
-    build->runtime_policy   = new cub::detail::reduce::arch_policies{cub_arch_policies};
+    build->runtime_policy   = new cub::detail::reduce::policy_selector{policy_sel};
   }
   catch (const std::exception& exc)
   {
@@ -423,7 +423,7 @@ CUresult cccl_device_reduce(
       indirect_arg_t{init},
       stream,
       ::cuda::std::identity{},
-      *static_cast<cub::detail::reduce::arch_policies*>(build.runtime_policy),
+      *static_cast<cub::detail::reduce::policy_selector*>(build.runtime_policy),
       reduce::reduce_kernel_source{build},
       cub::detail::CudaDriverLauncherFactory{cu_device, build.cc});
 
@@ -478,7 +478,7 @@ CUresult cccl_device_reduce_nondeterministic(
       indirect_arg_t{init},
       stream,
       ::cuda::std::identity{},
-      *static_cast<cub::detail::reduce::arch_policies*>(build.runtime_policy),
+      *static_cast<cub::detail::reduce::policy_selector*>(build.runtime_policy),
       reduce::reduce_kernel_source{build},
       cub::detail::CudaDriverLauncherFactory{cu_device, build.cc});
 
@@ -512,7 +512,7 @@ CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* build_ptr
 
     using namespace cub::detail::reduce;
     std::unique_ptr<char[]> cubin(static_cast<char*>(build_ptr->cubin));
-    std::unique_ptr<arch_policies> policy(static_cast<arch_policies*>(build_ptr->runtime_policy));
+    std::unique_ptr<policy_selector> policy(static_cast<policy_selector*>(build_ptr->runtime_policy));
     check(cuLibraryUnload(build_ptr->library));
   }
   catch (const std::exception& exc)
diff --git a/cub/benchmarks/bench/reduce/base.cuh b/cub/benchmarks/bench/reduce/base.cuh
index a6ca8f354ec..b4f431d6716 100644
--- a/cub/benchmarks/bench/reduce/base.cuh
+++ b/cub/benchmarks/bench/reduce/base.cuh
@@ -11,9 +11,9 @@
 #include <nvbench_helper.cuh>
 
 #if !TUNE_BASE
-struct arch_policies
+struct policy_selector
 {
-  _CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_arch_policy
+  _CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_policy
   {
     const auto [items, threads] = cub::detail::scale_mem_bound(TUNE_THREADS_PER_BLOCK, TUNE_ITEMS_PER_THREAD);
     const auto policy           = cub::agent_reduce_policy{
@@ -60,7 +60,7 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
       ::cuda::std::execution::prop{
         ::cuda::execution::__get_tuning_t,
         ::cuda::std::execution::env{
-          ::cuda::std::execution::prop{::cub::detail::reduce::get_tuning_query_t, arch_policies{}}}}
+          ::cuda::std::execution::prop{::cub::detail::reduce::get_tuning_query_t, policy_selector{}}}}
 #  endif
     };
     static_assert(::cuda::std::execution::__queryable_with<decltype(env), ::cuda::mr::__get_memory_resource_t>);
@@ -84,7 +84,7 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
     transform_op
 #if !TUNE_BASE
     ,
-    arch_policies{}
+    policy_selector{}
 #endif
   );
 
@@ -104,7 +104,7 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
       transform_op
 #if !TUNE_BASE
       ,
-      arch_policies{}
+      policy_selector{}
 #endif
     );
   });
diff --git a/cub/benchmarks/bench/reduce/nondeterministic.cu b/cub/benchmarks/bench/reduce/nondeterministic.cu
index 11c01df26e4..fca2faa4599 100644
--- a/cub/benchmarks/bench/reduce/nondeterministic.cu
+++ b/cub/benchmarks/bench/reduce/nondeterministic.cu
@@ -13,9 +13,9 @@
 // %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1
 
 #if !TUNE_BASE
-struct arch_policies
+struct policy_selector
 {
-  _CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_arch_policy
+  _CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_policy
   {
     const auto [items, threads] = cub::detail::scale_mem_bound(TUNE_THREADS_PER_BLOCK, TUNE_ITEMS_PER_THREAD);
     const auto policy           = cub::agent_reduce_policy{
@@ -24,7 +24,7 @@ struct arch_policies
       1 << TUNE_ITEMS_PER_VEC_LOAD_POW2,
       cub::BLOCK_REDUCE_WARP_REDUCTIONS_NONDETERMINISTIC,
       cub::LOAD_DEFAULT};
-    return {{}, {}, {}, policy}; // Only reduce_nondeterministic_policy is used
+    return {{}, {}, {}, policy}; // Only reduce_nondeterministic is used
   }
 };
 #endif // !TUNE_BASE
@@ -66,7 +66,7 @@ void nondeterministic_sum(nvbench::state& state, nvbench::type_list<T, OffsetT>)
     transform_op
 #if !TUNE_BASE
     ,
-    arch_policies{}
+    policy_selector{}
 #endif
   );
 
@@ -86,7 +86,7 @@ void nondeterministic_sum(nvbench::state& state, nvbench::type_list<T, OffsetT>)
       transform_op
 #if !TUNE_BASE
       ,
-      arch_policies{}
+      policy_selector{}
 #endif
     );
   });
diff --git a/cub/benchmarks/bench/transform_reduce/sum.cu b/cub/benchmarks/bench/transform_reduce/sum.cu
index ce88405878f..c5f543e0c25 100644
--- a/cub/benchmarks/bench/transform_reduce/sum.cu
+++ b/cub/benchmarks/bench/transform_reduce/sum.cu
@@ -14,9 +14,9 @@
 #endif
 
 #if !TUNE_BASE
-struct arch_policies
+struct policy_selector
 {
-  _CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_arch_policy
+  _CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_policy
   {
     const auto [items, threads] = cub::detail::scale_mem_bound(TUNE_THREADS_PER_BLOCK, TUNE_ITEMS_PER_THREAD);
     const auto policy           = cub::agent_reduce_policy{
@@ -133,7 +133,7 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
     transform_op_t{}
 #  if !TUNE_BASE
     ,
-    arch_policies{}
+    policy_selector{}
 #  endif
   );
 
@@ -153,7 +153,7 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
       transform_op_t{}
 #  if !TUNE_BASE
       ,
-      arch_policies{}
+      policy_selector{}
 #  endif
     );
   });
diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh
index 6d9470292b9..f5b1101983a 100644
--- a/cub/cub/device/device_reduce.cuh
+++ b/cub/cub/device/device_reduce.cuh
@@ -147,7 +147,7 @@ private:
     using reduce_tuning_t = ::cuda::std::execution::__query_result_or_t<
       TuningEnvT,
       detail::reduce::get_tuning_query_t,
-      detail::reduce::arch_policies_from_types<accum_t, offset_t, ReductionOpT>>;
+      detail::reduce::policy_selector_from_types<accum_t, offset_t, ReductionOpT>>;
 
     return detail::reduce::dispatch<accum_t>(
       d_temp_storage,
@@ -221,7 +221,7 @@ private:
     using reduce_tuning_t = ::cuda::std::execution::__query_result_or_t<
       TuningEnvT,
       detail::reduce::get_tuning_query_t,
-      detail::reduce::arch_policies_from_types<accum_t, offset_t, ReductionOpT>>;
+      detail::reduce::policy_selector_from_types<accum_t, offset_t, ReductionOpT>>;
 
     return detail::reduce::dispatch_nondeterministic<accum_t>(
       d_temp_storage,
diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh
index fe84212f11d..619a967f290 100644
--- a/cub/cub/device/dispatch/dispatch_reduce.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce.cuh
@@ -45,7 +45,7 @@ CUB_NAMESPACE_BEGIN
 
 namespace detail::reduce
 {
-template <typename ArchPolicies,
+template <typename PolicySelector,
           typename InputIteratorT,
           typename OutputIteratorT,
           typename OffsetT,
@@ -55,12 +55,12 @@ template <typename ArchPolicies,
           typename TransformOpT>
 struct DeviceReduceKernelSource
 {
-  // ArchPolicies must be stateless, so we can pass the type to the kernel
-  static_assert(::cuda::std::is_empty_v<ArchPolicies>);
+  // PolicySelector must be stateless, so we can pass the type to the kernel
+  static_assert(::cuda::std::is_empty_v<PolicySelector>);
 
   CUB_DEFINE_KERNEL_GETTER(
     SingleTileKernel,
-    DeviceReduceSingleTileKernel<ArchPolicies,
+    DeviceReduceSingleTileKernel<PolicySelector,
                                  InputIteratorT,
                                  OutputIteratorT,
                                  OffsetT,
@@ -70,11 +70,11 @@ struct DeviceReduceKernelSource
                                  TransformOpT>)
 
   CUB_DEFINE_KERNEL_GETTER(
-    ReductionKernel, DeviceReduceKernel<ArchPolicies, InputIteratorT, OffsetT, ReductionOpT, AccumT, TransformOpT>)
+    ReductionKernel, DeviceReduceKernel<PolicySelector, InputIteratorT, OffsetT, ReductionOpT, AccumT, TransformOpT>)
 
   CUB_DEFINE_KERNEL_GETTER(
     SingleTileSecondKernel,
-    DeviceReduceSingleTileKernel<ArchPolicies,
+    DeviceReduceSingleTileKernel<PolicySelector,
                                  AccumT*,
                                  OutputIteratorT,
                                  int, // Always used with int offsets
@@ -90,15 +90,15 @@ struct DeviceReduceKernelSource
 
 // TODO(bgruber): remove in CCCL 4.0
 template <typename PolicyHub>
-struct arch_policies_from_hub
+struct policy_selector_from_hub
 {
   // this is only called in device code, so we can ignore the arch parameter
-  _CCCL_DEVICE_API constexpr auto operator()(::cuda::arch_id /*arch*/) const -> reduce_arch_policy
+  _CCCL_DEVICE_API constexpr auto operator()(::cuda::arch_id /*arch*/) const -> reduce_policy
   {
     using ap             = typename PolicyHub::MaxPolicy::ActivePolicy;
     using ap_reduce      = typename ap::ReducePolicy;
     using ap_single_tile = typename ap::SingleTilePolicy;
-    return reduce_arch_policy{
+    return reduce_policy{
       agent_reduce_policy{
         ap_reduce::BLOCK_THREADS,
         ap_reduce::ITEMS_PER_THREAD,
@@ -153,7 +153,7 @@ template <typename InputIteratorT,
           typename TransformOpT = ::cuda::std::identity,
           typename PolicyHub    = detail::reduce::policy_hub<AccumT, OffsetT, ReductionOpT>,
           typename KernelSource = detail::reduce::DeviceReduceKernelSource<
-            detail::reduce::arch_policies_from_hub<PolicyHub>,
+            detail::reduce::policy_selector_from_hub<PolicyHub>,
             InputIteratorT,
             OutputIteratorT,
             OffsetT,
@@ -594,7 +594,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_passes(
   InitT init,
   cudaStream_t stream,
   TransformOpT transform_op,
-  reduce_arch_policy active_policy,
+  reduce_policy active_policy,
   KernelSource kernel_source,
   KernelLauncherFactory launcher_factory)
 {
@@ -606,10 +606,10 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_passes(
   }
 
   // Init regular kernel configuration
-  const auto tile_size = active_policy.reduce_policy.block_threads * active_policy.reduce_policy.items_per_thread;
+  const auto tile_size = active_policy.reduce.block_threads * active_policy.reduce.items_per_thread;
   int sm_occupancy;
   if (const auto error = CubDebug(launcher_factory.MaxSmOccupancy(
-        sm_occupancy, kernel_source.ReductionKernel(), active_policy.reduce_policy.block_threads)))
+        sm_occupancy, kernel_source.ReductionKernel(), active_policy.reduce.block_threads)))
   {
     return error;
   }
@@ -652,14 +652,14 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_passes(
   _CubLog("Invoking DeviceReduceKernel<<<%lu, %d, 0, %lld>>>(), %d items "
           "per thread, %d SM occupancy\n",
           (unsigned long) reduce_grid_size,
-          active_policy.reduce_policy.block_threads,
+          active_policy.reduce.block_threads,
           (long long) stream,
-          active_policy.reduce_policy.items_per_thread,
+          active_policy.reduce.items_per_thread,
           sm_occupancy);
 #endif // CUB_DEBUG_LOG
 
   // Invoke DeviceReduceKernel
-  launcher_factory(reduce_grid_size, active_policy.reduce_policy.block_threads, 0, stream)
+  launcher_factory(reduce_grid_size, active_policy.reduce.block_threads, 0, stream)
     .doit(kernel_source.ReductionKernel(), d_in, d_block_reductions, num_items, even_share, reduction_op, transform_op);
 
   // Check for failure to launch
@@ -678,13 +678,13 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_passes(
 #ifdef CUB_DEBUG_LOG
   _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), "
           "%d items per thread\n",
-          active_policy.single_tile_policy.block_threads,
+          active_policy.single_tile.block_threads,
           (long long) stream,
-          active_policy.single_tile_policy.items_per_thread);
+          active_policy.single_tile.items_per_thread);
 #endif // CUB_DEBUG_LOG
 
   // Invoke DeviceReduceSingleTileKernel
-  launcher_factory(1, active_policy.single_tile_policy.block_threads, 0, stream)
+  launcher_factory(1, active_policy.single_tile.block_threads, 0, stream)
     .doit(kernel_source.SingleTileSecondKernel(),
           d_block_reductions,
           d_out,
@@ -736,12 +736,12 @@ template <
   typename TransformOpT = ::cuda::std::identity,
   typename AccumT =
     decltype(select_accum_t<InputIteratorT, InitT, ReductionOpT, TransformOpT>(static_cast<OverrideAccumT*>(nullptr))),
-  typename ArchPolicies = arch_policies_from_types<AccumT, OffsetT, ReductionOpT>,
+  typename PolicySelector = policy_selector_from_types<AccumT, OffsetT, ReductionOpT>,
   typename KernelSource =
-    DeviceReduceKernelSource<ArchPolicies, InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, InitT, AccumT, TransformOpT>,
+    DeviceReduceKernelSource<PolicySelector, InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, InitT, AccumT, TransformOpT>,
   typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY>
 #if _CCCL_HAS_CONCEPTS()
-  requires reduce_policy_hub<ArchPolicies>
+  requires reduce_policy_selector<PolicySelector>
 #endif // _CCCL_HAS_CONCEPTS()
 CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
   void* d_temp_storage,
@@ -753,7 +753,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
   InitT init,
   cudaStream_t stream,
   TransformOpT transform_op              = {},
-  ArchPolicies arch_policies             = {},
+  PolicySelector policy_selector         = {},
   KernelSource kernel_source             = {},
   KernelLauncherFactory launcher_factory = {})
 {
@@ -764,7 +764,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
     return error;
   }
 
-  const reduce_arch_policy active_policy = arch_policies(arch_id);
+  const reduce_policy active_policy = policy_selector(arch_id);
 #if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
   NV_IF_TARGET(NV_IS_HOST,
                (std::stringstream ss; ss << active_policy;
@@ -772,8 +772,8 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
 #endif // !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
 
   // Check for small, single tile size
-  if (num_items <= static_cast<OffsetT>(
-        active_policy.single_tile_policy.block_threads * active_policy.single_tile_policy.items_per_thread))
+  if (num_items
+      <= static_cast<OffsetT>(active_policy.single_tile.block_threads * active_policy.single_tile.items_per_thread))
   {
     // Return if the caller is simply requesting the size of the storage allocation
     if (d_temp_storage == nullptr)
@@ -786,13 +786,13 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
 #ifdef CUB_DEBUG_LOG
     _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), "
             "%d items per thread\n",
-            active_policy.single_tile_policy.block_threads,
+            active_policy.single_tile.block_threads,
             (long long) stream,
-            active_policy.single_tile_policy.items_per_thread);
+            active_policy.single_tile.items_per_thread);
 #endif // CUB_DEBUG_LOG
 
     // Invoke single_reduce_sweep_kernel
-    launcher_factory(1, active_policy.single_tile_policy.block_threads, 0, stream)
+    launcher_factory(1, active_policy.single_tile.block_threads, 0, stream)
       .doit(kernel_source.SingleTileKernel(), d_in, d_out, num_items, reduction_op, init, transform_op);
 
     // Check for failure to launch
diff --git a/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh b/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh
index 61bcb7c9810..9bc8f654579 100644
--- a/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh
@@ -148,19 +148,19 @@ template <typename OverrideAccumT = nondeterministic_no_override,
           typename TransformOpT = ::cuda::std::identity,
           typename AccumT = decltype(select_nondeterministic_accum_t<InputIteratorT, InitT, ReductionOpT, TransformOpT>(
             static_cast<OverrideAccumT*>(nullptr))),
-          typename ArchPolicies = arch_policies_from_types<AccumT, OffsetT, ReductionOpT>,
-          typename KernelSource = DeviceReduceNondeterministicKernelSource<
-            ArchPolicies,
-            InputIteratorT,
-            OutputIteratorT,
-            OffsetT,
-            ReductionOpT,
-            InitT,
-            AccumT,
-            TransformOpT>,
+          typename PolicySelector = policy_selector_from_types<AccumT, OffsetT, ReductionOpT>,
+          typename KernelSource   = DeviceReduceNondeterministicKernelSource<
+              PolicySelector,
+              InputIteratorT,
+              OutputIteratorT,
+              OffsetT,
+              ReductionOpT,
+              InitT,
+              AccumT,
+              TransformOpT>,
           typename KernelLauncherFactory = TripleChevronFactory>
 #if _CCCL_HAS_CONCEPTS()
-  requires reduce_policy_hub<ArchPolicies>
+  requires reduce_policy_selector<PolicySelector>
 #endif // _CCCL_HAS_CONCEPTS()
 CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_nondeterministic(
   void* d_temp_storage,
@@ -172,7 +172,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_nondeterministic(
   InitT init,
   cudaStream_t stream,
   TransformOpT transform_op              = {},
-  ArchPolicies arch_policies             = {},
+  PolicySelector policy_selector         = {},
   KernelSource kernel_source             = {},
   KernelLauncherFactory launcher_factory = {})
 {
@@ -183,7 +183,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_nondeterministic(
     return error;
   }
 
-  const reduce_arch_policy active_policy = arch_policies(arch_id);
+  const reduce_policy active_policy = policy_selector(arch_id);
 #if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
   NV_IF_TARGET(
     NV_IS_HOST,
@@ -199,13 +199,13 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_nondeterministic(
   }
 
   // Init kernel configuration
-  const int tile_size = active_policy.reduce_nondeterministic_policy.block_threads
-                      * active_policy.reduce_nondeterministic_policy.items_per_thread;
+  const int tile_size =
+    active_policy.reduce_nondeterministic.block_threads * active_policy.reduce_nondeterministic.items_per_thread;
   int sm_occupancy;
   if (const auto error = CubDebug(launcher_factory.MaxSmOccupancy(
         sm_occupancy,
         kernel_source.NondeterministicAtomicKernel(),
-        active_policy.reduce_nondeterministic_policy.block_threads)))
+        active_policy.reduce_nondeterministic.block_threads)))
   {
     return error;
   }
@@ -232,14 +232,14 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_nondeterministic(
   _CubLog("Invoking NondeterministicDeviceReduceAtomicKernel<<<%llu, %d, 0, %p>>>(), %d items "
           "per thread, %d SM occupancy\n",
           (unsigned long long) reduce_grid_size,
-          active_policy.reduce_nondeterministic_policy.block_threads,
+          active_policy.reduce_nondeterministic.block_threads,
           (void*) stream,
-          active_policy.reduce_nondeterministic_policy.items_per_thread,
+          active_policy.reduce_nondeterministic.items_per_thread,
           sm_occupancy);
 #endif // CUB_DEBUG_LOG
 
   // Invoke NondeterministicDeviceReduceAtomicKernel
-  launcher_factory(reduce_grid_size, active_policy.reduce_nondeterministic_policy.block_threads, 0, stream)
+  launcher_factory(reduce_grid_size, active_policy.reduce_nondeterministic.block_threads, 0, stream)
     .doit(
       kernel_source.NondeterministicAtomicKernel(), d_in, d_out, num_items, even_share, reduction_op, init, transform_op);
 
diff --git a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
index 97e413ad349..053aaf21298 100644
--- a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
@@ -88,7 +88,7 @@ finalize_and_store_aggregate(OutputIteratorT d_out, ReductionOpT, empty_problem_
  * @brief Reduce region kernel entry point (multi-block). Computes privatized
  *        reductions, one per thread block.
  *
- * @tparam ArchPolicies
+ * @tparam PolicySelector
  *   The tuning polices
  *
  * @tparam InputIteratorT
@@ -123,25 +123,25 @@ finalize_and_store_aggregate(OutputIteratorT d_out, ReductionOpT, empty_problem_
  * @param[in] reduction_op
  *   Binary reduction functor
  */
-template <typename ArchPolicies,
+template <typename PolicySelector,
           typename InputIteratorT,
           typename OffsetT,
           typename ReductionOpT,
           typename AccumT,
           typename TransformOpT>
 #if _CCCL_HAS_CONCEPTS()
-  requires reduce_policy_hub<ArchPolicies>
+  requires reduce_policy_selector<PolicySelector>
 #endif // _CCCL_HAS_CONCEPTS()
-CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(
-  ArchPolicies{}(::cuda::arch_id{CUB_PTX_ARCH / 10})
-    .reduce_policy.block_threads)) void DeviceReduceKernel(InputIteratorT d_in,
-                                                           AccumT* d_out,
-                                                           OffsetT num_items,
-                                                           GridEvenShare<OffsetT> even_share,
-                                                           ReductionOpT reduction_op,
-                                                           TransformOpT transform_op)
+CUB_DETAIL_KERNEL_ATTRIBUTES
+__launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).reduce.block_threads)) void DeviceReduceKernel(
+  InputIteratorT d_in,
+  AccumT* d_out,
+  OffsetT num_items,
+  GridEvenShare<OffsetT> even_share,
+  ReductionOpT reduction_op,
+  TransformOpT transform_op)
 {
-  static constexpr agent_reduce_policy policy = ArchPolicies{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).reduce_policy;
+  static constexpr agent_reduce_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).reduce;
   // TODO(bgruber): pass policy directly as template argument to AgentReduce in C++20
   using agent_policy_t =
     AgentReducePolicy<policy.block_threads,
@@ -177,7 +177,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(
  *        to aggregate privatized thread block reductions from a previous
  *        multi-block reduction pass.
  *
- * @tparam ArchPolicies
+ * @tparam PolicySelector
  *   The tuning polices
  *
  * @tparam InputIteratorT
@@ -214,7 +214,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(
  * @param[in] init
  *   The initial value of the reduction
  */
-template <typename ArchPolicies,
+template <typename PolicySelector,
           typename InputIteratorT,
           typename OutputIteratorT,
           typename OffsetT,
@@ -223,10 +223,10 @@ template <typename ArchPolicies,
           typename AccumT,
           typename TransformOpT = ::cuda::std::identity>
 #if _CCCL_HAS_CONCEPTS()
-  requires reduce_policy_hub<ArchPolicies>
+  requires reduce_policy_selector<PolicySelector>
 #endif // _CCCL_HAS_CONCEPTS()
 CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(
-  int(ArchPolicies{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).single_tile_policy.block_threads),
+  int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).single_tile.block_threads),
   1) void DeviceReduceSingleTileKernel(InputIteratorT d_in,
                                        OutputIteratorT d_out,
                                        OffsetT num_items,
@@ -234,7 +234,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(
                                        InitT init,
                                        TransformOpT transform_op)
 {
-  static constexpr agent_reduce_policy policy = ArchPolicies{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).single_tile_policy;
+  static constexpr agent_reduce_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).single_tile;
   // TODO(bgruber): pass policy directly as template argument to AgentReduce in C++20
   using agent_policy_t =
     AgentReducePolicy<policy.block_threads,
@@ -506,7 +506,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(
   }
 }
 
-template <typename ArchPolicies,
+template <typename PolicySelector,
           typename InputIteratorT,
           typename OutputIteratorT,
           typename OffsetT,
@@ -515,11 +515,11 @@ template <typename ArchPolicies,
           typename InitT,
           typename TransformOpT>
 #if _CCCL_HAS_CONCEPTS()
-  requires reduce_policy_hub<ArchPolicies>
+  requires reduce_policy_selector<PolicySelector>
 #endif // _CCCL_HAS_CONCEPTS()
 CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(
-  ArchPolicies{}(::cuda::arch_id{CUB_PTX_ARCH / 10})
-    .reduce_nondeterministic_policy
+  PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10})
+    .reduce_nondeterministic
     .block_threads)) void NondeterministicDeviceReduceAtomicKernel(InputIteratorT d_in,
                                                                    OutputIteratorT d_out,
                                                                    OffsetT num_items,
@@ -549,7 +549,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(
 
   // Thread block type for reducing input tiles
   static constexpr agent_reduce_policy policy =
-    ArchPolicies{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).reduce_nondeterministic_policy;
+    PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).reduce_nondeterministic;
   // TODO(bgruber): pass policy directly as template argument to AgentReduce in C++20
   using agent_policy_t =
     AgentReducePolicy<policy.block_threads,
diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce.cuh
index 5cf4f6ed375..b77f168638c 100644
--- a/cub/cub/device/dispatch/tuning/tuning_reduce.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_reduce.cuh
@@ -64,32 +64,30 @@ struct agent_reduce_policy // equivalent of AgentReducePolicy
 #endif // !_CCCL_COMPILER(NVRTC)
 };
 
-// TODO(bgruber): bikeshed name before we make the tuning API public
-struct reduce_arch_policy // equivalent of a policy for a single CUDA architecture
+struct reduce_policy
 {
-  agent_reduce_policy reduce_policy;
-  agent_reduce_policy single_tile_policy;
-  agent_reduce_policy segmented_reduce_policy;
-  agent_reduce_policy reduce_nondeterministic_policy;
+  agent_reduce_policy reduce;
+  agent_reduce_policy single_tile;
+  agent_reduce_policy segmented_reduce;
+  agent_reduce_policy reduce_nondeterministic;
 
-  _CCCL_API constexpr friend bool operator==(const reduce_arch_policy& lhs, const reduce_arch_policy& rhs)
+  _CCCL_API constexpr friend bool operator==(const reduce_policy& lhs, const reduce_policy& rhs)
   {
-    return lhs.reduce_policy == rhs.reduce_policy && lhs.single_tile_policy == rhs.single_tile_policy
-        && lhs.segmented_reduce_policy == rhs.segmented_reduce_policy
-        && lhs.reduce_nondeterministic_policy == rhs.reduce_nondeterministic_policy;
+    return lhs.reduce == rhs.reduce && lhs.single_tile == rhs.single_tile
+        && lhs.segmented_reduce == rhs.segmented_reduce && lhs.reduce_nondeterministic == rhs.reduce_nondeterministic;
   }
 
-  _CCCL_API constexpr friend bool operator!=(const reduce_arch_policy& lhs, const reduce_arch_policy& rhs)
+  _CCCL_API constexpr friend bool operator!=(const reduce_policy& lhs, const reduce_policy& rhs)
   {
     return !(lhs == rhs);
   }
 
 #if !_CCCL_COMPILER(NVRTC)
-  friend ::std::ostream& operator<<(::std::ostream& os, const reduce_arch_policy& p)
+  friend ::std::ostream& operator<<(::std::ostream& os, const reduce_policy& p)
   {
-    return os << "reduce_arch_policy { .reduce_policy = " << p.reduce_policy << ", .single_tile_policy = "
-              << p.single_tile_policy << ", .segmented_reduce_policy = " << p.segmented_reduce_policy
-              << ", .reduce_nondeterministic_policy = " << p.reduce_nondeterministic_policy << " }";
+    return os
+        << "reduce_policy { .reduce = " << p.reduce << ", .single_tile = " << p.single_tile << ", .segmented_reduce = "
+        << p.segmented_reduce << ", .reduce_nondeterministic = " << p.reduce_nondeterministic << " }";
   }
 #endif // !_CCCL_COMPILER(NVRTC)
 };
@@ -97,10 +95,10 @@ struct reduce_arch_policy // equivalent of a policy for a single CUDA architectu
 #if _CCCL_HAS_CONCEPTS()
 _CCCL_API consteval void __needs_a_constexpr_value(auto) {}
 
-// TODO(bgruber): bikeshed name before we make the tuning API public
+// TODO(bgruber): replace this by a more general concept
 template <typename T>
-concept reduce_policy_hub = requires(T hub, ::cuda::arch_id arch) {
-  { hub(arch) } -> _CCCL_CONCEPT_VSTD::same_as<reduce_arch_policy>;
+concept reduce_policy_selector = requires(T hub, ::cuda::arch_id arch) {
+  { hub(arch) } -> _CCCL_CONCEPT_VSTD::same_as<reduce_policy>;
   { __needs_a_constexpr_value(hub(arch)) };
 };
 #endif // _CCCL_HAS_CONCEPTS()
@@ -419,17 +417,14 @@ struct policy_hub
   using MaxPolicy = Policy1000;
 };
 
-struct arch_policies // equivalent to the policy_hub, holds policies for a bunch of CUDA architectures
+struct policy_selector
 {
   accum_type accum_t; // TODO(bgruber): accum_type should become some CCCL global enum
   op_type operation_t; // TODO(bgruber): op_type should become some CCCL global enum
   int offset_size;
   int accum_size;
 
-  // IDEA(bgruber): instead of the constexpr function, we could also provide a map<int, reduce_arch_policy> and move the
-  // selection mechanism elsewhere
-
-  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> reduce_arch_policy
+  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> reduce_policy
   {
     // if we don't have a tuning for sm100, fall through
     auto sm100_tuning = get_sm100_tuning(accum_t, operation_t, offset_size, accum_size);
@@ -478,16 +473,16 @@ struct arch_policies // equivalent to the policy_hub, holds policies for a bunch
 };
 
 #if _CCCL_HAS_CONCEPTS()
-static_assert(reduce_policy_hub<arch_policies>);
+static_assert(reduce_policy_selector<policy_selector>);
 #endif // _CCCL_HAS_CONCEPTS()
 
 // stateless version which can be passed to kernels
 template <typename AccumT, typename OffsetT, typename ReductionOpT>
-struct arch_policies_from_types
+struct policy_selector_from_types
 {
-  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> reduce_arch_policy
+  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> reduce_policy
   {
-    constexpr auto policies = arch_policies{
+    constexpr auto policies = policy_selector{
       classify_accum_type<AccumT>(), classify_op<ReductionOpT>(), int{sizeof(OffsetT)}, int{sizeof(AccumT)}};
     return policies(arch);
   }
diff --git a/cub/test/catch2_test_device_reduce_env.cu b/cub/test/catch2_test_device_reduce_env.cu
index cd39f4ca14e..cb2e390b19d 100644
--- a/cub/test/catch2_test_device_reduce_env.cu
+++ b/cub/test/catch2_test_device_reduce_env.cu
@@ -67,8 +67,8 @@ TEST_CASE("Device reduce works with default environment", "[reduce][device]")
   REQUIRE(cudaSuccess == cub::detail::ptx_arch_id(arch_id, current_device));
 
   int target_block_size =
-    cub::detail::reduce::arch_policies_from_types<value_t, offset_t, block_size_check_t>{}(arch_id)
-      .single_tile_policy.block_threads;
+    cub::detail::reduce::policy_selector_from_types<value_t, offset_t, block_size_check_t>{}(arch_id)
+      .single_tile.block_threads;
 
   num_items_t num_items = 1;
   c2h::device_vector<int> d_block_size(1);
@@ -107,7 +107,7 @@ TEST_CASE("Device sum works with default environment", "[reduce][device]")
 template <int BlockThreads>
 struct reduce_tuning : cub::detail::reduce::tuning<reduce_tuning<BlockThreads>>
 {
-  _CCCL_API constexpr auto operator()(cuda::arch_id /*arch*/) const -> cub::detail::reduce::reduce_arch_policy
+  _CCCL_API constexpr auto operator()(cuda::arch_id /*arch*/) const -> cub::detail::reduce::reduce_policy
   {
     const auto policy = cub::detail::reduce::agent_reduce_policy{
       BlockThreads, 1, 1, cub::BLOCK_REDUCE_WARP_REDUCTIONS, cub::LOAD_DEFAULT};
@@ -197,7 +197,7 @@ C2H_TEST("Device reduce uses environment", "[reduce][device]", requirements)
         cudaSuccess
         == cub::DeviceReduce::Reduce(nullptr, expected_bytes_allocated, d_in, d_out.begin(), num_items, op_t{}, init));
 
-      using policy_t = cub::detail::reduce::arch_policies_from_types<accumulator_t, offset_t, op_t>;
+      using policy_t = cub::detail::reduce::policy_selector_from_types<accumulator_t, offset_t, op_t>;
       return cuda::std::array<void*, 3>{
         reinterpret_cast<void*>(
           cub::detail::reduce::DeviceReduceSingleTileKernel<
@@ -223,7 +223,7 @@ C2H_TEST("Device reduce uses environment", "[reduce][device]", requirements)
     }
     else if constexpr (cub::detail::is_non_deterministic_v<determinism_t>)
     {
-      using policy_t = cub::detail::reduce::arch_policies_from_types<accumulator_t, offset_t, op_t>;
+      using policy_t = cub::detail::reduce::policy_selector_from_types<accumulator_t, offset_t, op_t>;
       auto* raw_ptr  = thrust::raw_pointer_cast(d_out.data());
 
       REQUIRE(
@@ -316,7 +316,7 @@ C2H_TEST("Device sum uses environment", "[reduce][device]", requirements)
     {
       REQUIRE(cudaSuccess == cub::DeviceReduce::Sum(nullptr, expected_bytes_allocated, d_in, d_out.begin(), num_items));
 
-      using policy_t = cub::detail::reduce::arch_policies_from_types<accumulator_t, offset_t, op_t>;
+      using policy_t = cub::detail::reduce::policy_selector_from_types<accumulator_t, offset_t, op_t>;
       return cuda::std::array<void*, 3>{
         reinterpret_cast<void*>(
           cub::detail::reduce::DeviceReduceSingleTileKernel<
@@ -342,7 +342,7 @@ C2H_TEST("Device sum uses environment", "[reduce][device]", requirements)
     }
     else if constexpr (cub::detail::is_non_deterministic_v<determinism_t>)
     {
-      using policy_t = cub::detail::reduce::arch_policies_from_types<accumulator_t, offset_t, op_t>;
+      using policy_t = cub::detail::reduce::policy_selector_from_types<accumulator_t, offset_t, op_t>;
       auto* raw_ptr  = thrust::raw_pointer_cast(d_out.data());
 
       REQUIRE(

From a43a607633358f2ecba8a4af990684eb07587276 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Fri, 9 Jan 2026 23:46:24 +0100
Subject: [PATCH 2/3] Add /diagnostics:caret

---
 cmake/CCCLBuildCompilerTargets.cmake | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cmake/CCCLBuildCompilerTargets.cmake b/cmake/CCCLBuildCompilerTargets.cmake
index dd0aae8dcc5..302efc6bbe1 100644
--- a/cmake/CCCLBuildCompilerTargets.cmake
+++ b/cmake/CCCLBuildCompilerTargets.cmake
@@ -154,6 +154,9 @@ function(cccl_build_compiler_targets)
     # See https://github.com/microsoft/STL/issues/696
     append_option_if_available("/wd4494" cxx_compile_options)
 
+    # Get error messages with a little arrow indicating the error location more exactly
+    append_option_if_available("/diagnostics:caret" cxx_compile_options)
+
     if (MSVC_TOOLSET_VERSION LESS 143)
       # winbase.h(9572): warning C5105: macro expansion producing 'defined' has undefined behavior
       append_option_if_available("/wd5105" cxx_compile_options)

From e6dcc837467cca69af522fc713f444b2aa6308b4 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Fri, 9 Jan 2026 23:49:57 +0100
Subject: [PATCH 3/3] Add extra parentheses to please MSVC

---
 libcudacxx/include/cuda/std/__complex/roots.h | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/libcudacxx/include/cuda/std/__complex/roots.h b/libcudacxx/include/cuda/std/__complex/roots.h
index 37a4f012ee7..17e5286b256 100644
--- a/libcudacxx/include/cuda/std/__complex/roots.h
+++ b/libcudacxx/include/cuda/std/__complex/roots.h
@@ -76,26 +76,20 @@ template <class _Tp>
   // pre-check to see if we over/underflow:
   _Tp __x_abs_sq = ::cuda::std::fma(__re, __re, __im * __im);
 
-  // NVCC 12.9 seems to be eliminating some parentheses which makes MSVC unhappy.
-  _CCCL_DIAG_PUSH
-#if _CCCL_CUDA_COMPILER(NVCC, <, 13, 0)
-  _CCCL_DIAG_SUPPRESS_MSVC(4554) // warning C4554: '<<': check operator precedence for possible error; use parentheses
-                                 // to clarify precedence
-#endif // _CCCL_CUDA_COMPILER(NVCC, <, 13, 0)
-
   // Get some bounds where __re +- |__x| won't overflow.
   // Doesn't need to be too exact, enough to cover extremal cases.
   // overflow bound = sqrt(MAX_FLOAT / 2)
   // underflow bound similar, but tweaked to allow for normalizing denormal calculation.
+  // The static_casts have extra parentheses around them to avoid MSVC's:
+  //   warning C4554: '<<': check operator precedence for possible error; use parentheses to clarify precedence
   constexpr __uint_t __overflow_bound_exp =
-    (static_cast<__uint_t>((static_cast<__uint_t>(__max_exponent - 1) >> 1) + __exp_bias) << __mant_nbits)
+    ((static_cast<__uint_t>(((static_cast<__uint_t>(__max_exponent - 1)) >> 1) + __exp_bias)) << __mant_nbits)
     | __fp_explicit_bit_mask_of_v<_Tp>;
   constexpr __uint_t __underflow_bound_exp =
-    (static_cast<__uint_t>((static_cast<__uint_t>(-__max_exponent + __mant_nbits) >> 1) + __exp_bias) << __mant_nbits)
+    ((static_cast<__uint_t>(((static_cast<__uint_t>(-__max_exponent + __mant_nbits)) >> 1) + __exp_bias))
+     << __mant_nbits)
     | __fp_explicit_bit_mask_of_v<_Tp>;
 
-  _CCCL_DIAG_POP
-
   _Tp __overflow_bound  = ::cuda::std::__fp_from_storage<_Tp>(__overflow_bound_exp);
   _Tp __underflow_bound = ::cuda::std::__fp_from_storage<_Tp>(__underflow_bound_exp);