From 8f191bc6682ac94d05597ae999a618a70390f118 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 7 Jan 2026 14:54:28 +0100 Subject: [PATCH 1/3] Rename new tuning API policies arch policies -> policy selector Drop _policy in members reduce_arch_policy -> reduce_policy Drop and adapt some comments --- c/parallel/src/reduce.cu | 20 +++---- cub/benchmarks/bench/reduce/base.cuh | 10 ++-- .../bench/reduce/nondeterministic.cu | 10 ++-- cub/benchmarks/bench/transform_reduce/sum.cu | 8 +-- cub/cub/device/device_reduce.cuh | 4 +- cub/cub/device/dispatch/dispatch_reduce.cuh | 58 +++++++++---------- .../dispatch_reduce_nondeterministic.cuh | 38 ++++++------ .../device/dispatch/kernels/kernel_reduce.cuh | 44 +++++++------- .../device/dispatch/tuning/tuning_reduce.cuh | 49 +++++++--------- cub/test/catch2_test_device_reduce_env.cu | 14 ++--- 10 files changed, 125 insertions(+), 130 deletions(-) diff --git a/c/parallel/src/reduce.cu b/c/parallel/src/reduce.cu index 055455f982a..0a70c82893d 100644 --- a/c/parallel/src/reduce.cu +++ b/c/parallel/src/reduce.cu @@ -223,7 +223,7 @@ CUresult cccl_device_reduce_build_ex( const auto offset_t = cccl_type_enum_to_name(cccl_type_enum::CCCL_UINT64); - const auto cub_arch_policies = [&] { + const auto policy_sel = [&] { using namespace cub::detail::reduce; auto accum_type = accum_type::other; @@ -251,15 +251,15 @@ CUresult cccl_device_reduce_build_ex( } const int offset_size = int{sizeof(OffsetT)}; - return arch_policies{accum_type, operation_t, offset_size, static_cast(accum_t.size)}; + return policy_selector{accum_type, operation_t, offset_size, static_cast(accum_t.size)}; }(); // TODO(bgruber): drop this if tuning policies become formattable - std::stringstream cub_arch_policies_str; - cub_arch_policies_str << cub_arch_policies(cuda::to_arch_id(cuda::compute_capability{cc_major, cc_minor})); + std::stringstream policy_sel_str; + policy_sel_str << policy_sel(cuda::to_arch_id(cuda::compute_capability{cc_major, cc_minor})); auto policy_hub_expr = - std::format("cub::detail::reduce::arch_policies_from_types<{}, {}, {}>", accum_cpp, offset_t, op_name); + std::format("cub::detail::reduce::policy_selector_from_types<{}, {}, {}>", accum_cpp, offset_t, op_name); std::string final_src = std::format( R"XXX( @@ -284,7 +284,7 @@ static_assert(device_reduce_policy()(::cuda::arch_id{{CUB_PTX_ARCH / 10}}) == {7 output_iterator_src, // 4 op_src, // 5 policy_hub_expr, // 6 - cub_arch_policies_str.view()); // 7 + policy_sel_str.view()); // 7 #if false // CCCL_DEBUGGING_SWITCH fflush(stderr); @@ -371,7 +371,7 @@ static_assert(device_reduce_policy()(::cuda::arch_id{{CUB_PTX_ARCH / 10}}) == {7 build->cubin_size = result.size; build->accumulator_size = accum_t.size; build->determinism = determinism; - build->runtime_policy = new cub::detail::reduce::arch_policies{cub_arch_policies}; + build->runtime_policy = new cub::detail::reduce::policy_selector{policy_sel}; } catch (const std::exception& exc) { @@ -423,7 +423,7 @@ CUresult cccl_device_reduce( indirect_arg_t{init}, stream, ::cuda::std::identity{}, - *static_cast(build.runtime_policy), + *static_cast(build.runtime_policy), reduce::reduce_kernel_source{build}, cub::detail::CudaDriverLauncherFactory{cu_device, build.cc}); @@ -478,7 +478,7 @@ CUresult cccl_device_reduce_nondeterministic( indirect_arg_t{init}, stream, ::cuda::std::identity{}, - *static_cast(build.runtime_policy), + *static_cast(build.runtime_policy), reduce::reduce_kernel_source{build}, cub::detail::CudaDriverLauncherFactory{cu_device, build.cc}); @@ -512,7 +512,7 @@ CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* build_ptr using namespace cub::detail::reduce; std::unique_ptr cubin(static_cast(build_ptr->cubin)); - std::unique_ptr policy(static_cast(build_ptr->runtime_policy)); + std::unique_ptr policy(static_cast(build_ptr->runtime_policy)); check(cuLibraryUnload(build_ptr->library)); } catch (const std::exception& exc) diff --git a/cub/benchmarks/bench/reduce/base.cuh b/cub/benchmarks/bench/reduce/base.cuh index a6ca8f354ec..b4f431d6716 100644 --- a/cub/benchmarks/bench/reduce/base.cuh +++ b/cub/benchmarks/bench/reduce/base.cuh @@ -11,9 +11,9 @@ #include #if !TUNE_BASE -struct arch_policies +struct policy_selector { - _CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_arch_policy + _CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_policy { const auto [items, threads] = cub::detail::scale_mem_bound(TUNE_THREADS_PER_BLOCK, TUNE_ITEMS_PER_THREAD); const auto policy = cub::agent_reduce_policy{ @@ -60,7 +60,7 @@ void reduce(nvbench::state& state, nvbench::type_list) ::cuda::std::execution::prop{ ::cuda::execution::__get_tuning_t, ::cuda::std::execution::env{ - ::cuda::std::execution::prop{::cub::detail::reduce::get_tuning_query_t, arch_policies{}}}} + ::cuda::std::execution::prop{::cub::detail::reduce::get_tuning_query_t, policy_selector{}}}} # endif }; static_assert(::cuda::std::execution::__queryable_with); @@ -84,7 +84,7 @@ void reduce(nvbench::state& state, nvbench::type_list) transform_op #if !TUNE_BASE , - arch_policies{} + policy_selector{} #endif ); @@ -104,7 +104,7 @@ void reduce(nvbench::state& state, nvbench::type_list) transform_op #if !TUNE_BASE , - arch_policies{} + policy_selector{} #endif ); }); diff --git a/cub/benchmarks/bench/reduce/nondeterministic.cu b/cub/benchmarks/bench/reduce/nondeterministic.cu index 11c01df26e4..fca2faa4599 100644 --- a/cub/benchmarks/bench/reduce/nondeterministic.cu +++ b/cub/benchmarks/bench/reduce/nondeterministic.cu @@ -13,9 +13,9 @@ // %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1 #if !TUNE_BASE -struct arch_policies +struct policy_selector { - _CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_arch_policy + _CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_policy { const auto [items, threads] = cub::detail::scale_mem_bound(TUNE_THREADS_PER_BLOCK, TUNE_ITEMS_PER_THREAD); const auto policy = cub::agent_reduce_policy{ @@ -24,7 +24,7 @@ struct arch_policies 1 << TUNE_ITEMS_PER_VEC_LOAD_POW2, cub::BLOCK_REDUCE_WARP_REDUCTIONS_NONDETERMINISTIC, cub::LOAD_DEFAULT}; - return {{}, {}, {}, policy}; // Only reduce_nondeterministic_policy is used + return {{}, {}, {}, policy}; // Only reduce_nondeterministic is used } }; #endif // !TUNE_BASE @@ -66,7 +66,7 @@ void nondeterministic_sum(nvbench::state& state, nvbench::type_list) transform_op #if !TUNE_BASE , - arch_policies{} + policy_selector{} #endif ); @@ -86,7 +86,7 @@ void nondeterministic_sum(nvbench::state& state, nvbench::type_list) transform_op #if !TUNE_BASE , - arch_policies{} + policy_selector{} #endif ); }); diff --git a/cub/benchmarks/bench/transform_reduce/sum.cu b/cub/benchmarks/bench/transform_reduce/sum.cu index ce88405878f..c5f543e0c25 100644 --- a/cub/benchmarks/bench/transform_reduce/sum.cu +++ b/cub/benchmarks/bench/transform_reduce/sum.cu @@ -14,9 +14,9 @@ #endif #if !TUNE_BASE -struct arch_policies +struct policy_selector { - _CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_arch_policy + _CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_policy { const auto [items, threads] = cub::detail::scale_mem_bound(TUNE_THREADS_PER_BLOCK, TUNE_ITEMS_PER_THREAD); const auto policy = cub::agent_reduce_policy{ @@ -133,7 +133,7 @@ void reduce(nvbench::state& state, nvbench::type_list) transform_op_t{} # if !TUNE_BASE , - arch_policies{} + policy_selector{} # endif ); @@ -153,7 +153,7 @@ void reduce(nvbench::state& state, nvbench::type_list) transform_op_t{} # if !TUNE_BASE , - arch_policies{} + policy_selector{} # endif ); }); diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh index 6d9470292b9..f5b1101983a 100644 --- a/cub/cub/device/device_reduce.cuh +++ b/cub/cub/device/device_reduce.cuh @@ -147,7 +147,7 @@ private: using reduce_tuning_t = ::cuda::std::execution::__query_result_or_t< TuningEnvT, detail::reduce::get_tuning_query_t, - detail::reduce::arch_policies_from_types>; + detail::reduce::policy_selector_from_types>; return detail::reduce::dispatch( d_temp_storage, @@ -221,7 +221,7 @@ private: using reduce_tuning_t = ::cuda::std::execution::__query_result_or_t< TuningEnvT, detail::reduce::get_tuning_query_t, - detail::reduce::arch_policies_from_types>; + detail::reduce::policy_selector_from_types>; return detail::reduce::dispatch_nondeterministic( d_temp_storage, diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index fe84212f11d..619a967f290 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -45,7 +45,7 @@ CUB_NAMESPACE_BEGIN namespace detail::reduce { -template struct DeviceReduceKernelSource { - // ArchPolicies must be stateless, so we can pass the type to the kernel - static_assert(::cuda::std::is_empty_v); + // PolicySelector must be stateless, so we can pass the type to the kernel + static_assert(::cuda::std::is_empty_v); CUB_DEFINE_KERNEL_GETTER( SingleTileKernel, - DeviceReduceSingleTileKernel) CUB_DEFINE_KERNEL_GETTER( - ReductionKernel, DeviceReduceKernel) + ReductionKernel, DeviceReduceKernel) CUB_DEFINE_KERNEL_GETTER( SingleTileSecondKernel, - DeviceReduceSingleTileKernel -struct arch_policies_from_hub +struct policy_selector_from_hub { // this is only called in device code, so we can ignore the arch parameter - _CCCL_DEVICE_API constexpr auto operator()(::cuda::arch_id /*arch*/) const -> reduce_arch_policy + _CCCL_DEVICE_API constexpr auto operator()(::cuda::arch_id /*arch*/) const -> reduce_policy { using ap = typename PolicyHub::MaxPolicy::ActivePolicy; using ap_reduce = typename ap::ReducePolicy; using ap_single_tile = typename ap::SingleTilePolicy; - return reduce_arch_policy{ + return reduce_policy{ agent_reduce_policy{ ap_reduce::BLOCK_THREADS, ap_reduce::ITEMS_PER_THREAD, @@ -153,7 +153,7 @@ template , typename KernelSource = detail::reduce::DeviceReduceKernelSource< - detail::reduce::arch_policies_from_hub, + detail::reduce::policy_selector_from_hub, InputIteratorT, OutputIteratorT, OffsetT, @@ -594,7 +594,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_passes( InitT init, cudaStream_t stream, TransformOpT transform_op, - reduce_arch_policy active_policy, + reduce_policy active_policy, KernelSource kernel_source, KernelLauncherFactory launcher_factory) { @@ -606,10 +606,10 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_passes( } // Init regular kernel configuration - const auto tile_size = active_policy.reduce_policy.block_threads * active_policy.reduce_policy.items_per_thread; + const auto tile_size = active_policy.reduce.block_threads * active_policy.reduce.items_per_thread; int sm_occupancy; if (const auto error = CubDebug(launcher_factory.MaxSmOccupancy( - sm_occupancy, kernel_source.ReductionKernel(), active_policy.reduce_policy.block_threads))) + sm_occupancy, kernel_source.ReductionKernel(), active_policy.reduce.block_threads))) { return error; } @@ -652,14 +652,14 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_passes( _CubLog("Invoking DeviceReduceKernel<<<%lu, %d, 0, %lld>>>(), %d items " "per thread, %d SM occupancy\n", (unsigned long) reduce_grid_size, - active_policy.reduce_policy.block_threads, + active_policy.reduce.block_threads, (long long) stream, - active_policy.reduce_policy.items_per_thread, + active_policy.reduce.items_per_thread, sm_occupancy); #endif // CUB_DEBUG_LOG // Invoke DeviceReduceKernel - launcher_factory(reduce_grid_size, active_policy.reduce_policy.block_threads, 0, stream) + launcher_factory(reduce_grid_size, active_policy.reduce.block_threads, 0, stream) .doit(kernel_source.ReductionKernel(), d_in, d_block_reductions, num_items, even_share, reduction_op, transform_op); // Check for failure to launch @@ -678,13 +678,13 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_passes( #ifdef CUB_DEBUG_LOG _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), " "%d items per thread\n", - active_policy.single_tile_policy.block_threads, + active_policy.single_tile.block_threads, (long long) stream, - active_policy.single_tile_policy.items_per_thread); + active_policy.single_tile.items_per_thread); #endif // CUB_DEBUG_LOG // Invoke DeviceReduceSingleTileKernel - launcher_factory(1, active_policy.single_tile_policy.block_threads, 0, stream) + launcher_factory(1, active_policy.single_tile.block_threads, 0, stream) .doit(kernel_source.SingleTileSecondKernel(), d_block_reductions, d_out, @@ -736,12 +736,12 @@ template < typename TransformOpT = ::cuda::std::identity, typename AccumT = decltype(select_accum_t(static_cast(nullptr))), - typename ArchPolicies = arch_policies_from_types, + typename PolicySelector = policy_selector_from_types, typename KernelSource = - DeviceReduceKernelSource, + DeviceReduceKernelSource, typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> #if _CCCL_HAS_CONCEPTS() - requires reduce_policy_hub + requires reduce_policy_selector #endif // _CCCL_HAS_CONCEPTS() CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( void* d_temp_storage, @@ -753,7 +753,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( InitT init, cudaStream_t stream, TransformOpT transform_op = {}, - ArchPolicies arch_policies = {}, + PolicySelector policy_selector = {}, KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { @@ -764,7 +764,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( return error; } - const reduce_arch_policy active_policy = arch_policies(arch_id); + const reduce_policy active_policy = policy_selector(arch_id); #if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) NV_IF_TARGET(NV_IS_HOST, (std::stringstream ss; ss << active_policy; @@ -772,8 +772,8 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( #endif // !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) // Check for small, single tile size - if (num_items <= static_cast( - active_policy.single_tile_policy.block_threads * active_policy.single_tile_policy.items_per_thread)) + if (num_items + <= static_cast(active_policy.single_tile.block_threads * active_policy.single_tile.items_per_thread)) { // Return if the caller is simply requesting the size of the storage allocation if (d_temp_storage == nullptr) @@ -786,13 +786,13 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( #ifdef CUB_DEBUG_LOG _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), " "%d items per thread\n", - active_policy.single_tile_policy.block_threads, + active_policy.single_tile.block_threads, (long long) stream, - active_policy.single_tile_policy.items_per_thread); + active_policy.single_tile.items_per_thread); #endif // CUB_DEBUG_LOG // Invoke single_reduce_sweep_kernel - launcher_factory(1, active_policy.single_tile_policy.block_threads, 0, stream) + launcher_factory(1, active_policy.single_tile.block_threads, 0, stream) .doit(kernel_source.SingleTileKernel(), d_in, d_out, num_items, reduction_op, init, transform_op); // Check for failure to launch diff --git a/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh b/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh index 61bcb7c9810..9bc8f654579 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh @@ -148,19 +148,19 @@ template ( static_cast(nullptr))), - typename ArchPolicies = arch_policies_from_types, - typename KernelSource = DeviceReduceNondeterministicKernelSource< - ArchPolicies, - InputIteratorT, - OutputIteratorT, - OffsetT, - ReductionOpT, - InitT, - AccumT, - TransformOpT>, + typename PolicySelector = policy_selector_from_types, + typename KernelSource = DeviceReduceNondeterministicKernelSource< + PolicySelector, + InputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT, + InitT, + AccumT, + TransformOpT>, typename KernelLauncherFactory = TripleChevronFactory> #if _CCCL_HAS_CONCEPTS() - requires reduce_policy_hub + requires reduce_policy_selector #endif // _CCCL_HAS_CONCEPTS() CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_nondeterministic( void* d_temp_storage, @@ -172,7 +172,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_nondeterministic( InitT init, cudaStream_t stream, TransformOpT transform_op = {}, - ArchPolicies arch_policies = {}, + PolicySelector policy_selector = {}, KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { @@ -183,7 +183,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_nondeterministic( return error; } - const reduce_arch_policy active_policy = arch_policies(arch_id); + const reduce_policy active_policy = policy_selector(arch_id); #if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) NV_IF_TARGET( NV_IS_HOST, @@ -199,13 +199,13 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_nondeterministic( } // Init kernel configuration - const int tile_size = active_policy.reduce_nondeterministic_policy.block_threads - * active_policy.reduce_nondeterministic_policy.items_per_thread; + const int tile_size = + active_policy.reduce_nondeterministic.block_threads * active_policy.reduce_nondeterministic.items_per_thread; int sm_occupancy; if (const auto error = CubDebug(launcher_factory.MaxSmOccupancy( sm_occupancy, kernel_source.NondeterministicAtomicKernel(), - active_policy.reduce_nondeterministic_policy.block_threads))) + active_policy.reduce_nondeterministic.block_threads))) { return error; } @@ -232,14 +232,14 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_nondeterministic( _CubLog("Invoking NondeterministicDeviceReduceAtomicKernel<<<%llu, %d, 0, %p>>>(), %d items " "per thread, %d SM occupancy\n", (unsigned long long) reduce_grid_size, - active_policy.reduce_nondeterministic_policy.block_threads, + active_policy.reduce_nondeterministic.block_threads, (void*) stream, - active_policy.reduce_nondeterministic_policy.items_per_thread, + active_policy.reduce_nondeterministic.items_per_thread, sm_occupancy); #endif // CUB_DEBUG_LOG // Invoke NondeterministicDeviceReduceAtomicKernel - launcher_factory(reduce_grid_size, active_policy.reduce_nondeterministic_policy.block_threads, 0, stream) + launcher_factory(reduce_grid_size, active_policy.reduce_nondeterministic.block_threads, 0, stream) .doit( kernel_source.NondeterministicAtomicKernel(), d_in, d_out, num_items, even_share, reduction_op, init, transform_op); diff --git a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh index 97e413ad349..053aaf21298 100644 --- a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh @@ -88,7 +88,7 @@ finalize_and_store_aggregate(OutputIteratorT d_out, ReductionOpT, empty_problem_ * @brief Reduce region kernel entry point (multi-block). Computes privatized * reductions, one per thread block. * - * @tparam ArchPolicies + * @tparam PolicySelector * The tuning polices * * @tparam InputIteratorT @@ -123,25 +123,25 @@ finalize_and_store_aggregate(OutputIteratorT d_out, ReductionOpT, empty_problem_ * @param[in] reduction_op * Binary reduction functor */ -template #if _CCCL_HAS_CONCEPTS() - requires reduce_policy_hub + requires reduce_policy_selector #endif // _CCCL_HAS_CONCEPTS() -CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int( - ArchPolicies{}(::cuda::arch_id{CUB_PTX_ARCH / 10}) - .reduce_policy.block_threads)) void DeviceReduceKernel(InputIteratorT d_in, - AccumT* d_out, - OffsetT num_items, - GridEvenShare even_share, - ReductionOpT reduction_op, - TransformOpT transform_op) +CUB_DETAIL_KERNEL_ATTRIBUTES +__launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).reduce.block_threads)) void DeviceReduceKernel( + InputIteratorT d_in, + AccumT* d_out, + OffsetT num_items, + GridEvenShare even_share, + ReductionOpT reduction_op, + TransformOpT transform_op) { - static constexpr agent_reduce_policy policy = ArchPolicies{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).reduce_policy; + static constexpr agent_reduce_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).reduce; // TODO(bgruber): pass policy directly as template argument to AgentReduce in C++20 using agent_policy_t = AgentReducePolicy #if _CCCL_HAS_CONCEPTS() - requires reduce_policy_hub + requires reduce_policy_selector #endif // _CCCL_HAS_CONCEPTS() CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__( - int(ArchPolicies{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).single_tile_policy.block_threads), + int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).single_tile.block_threads), 1) void DeviceReduceSingleTileKernel(InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, @@ -234,7 +234,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__( InitT init, TransformOpT transform_op) { - static constexpr agent_reduce_policy policy = ArchPolicies{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).single_tile_policy; + static constexpr agent_reduce_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).single_tile; // TODO(bgruber): pass policy directly as template argument to AgentReduce in C++20 using agent_policy_t = AgentReducePolicy #if _CCCL_HAS_CONCEPTS() - requires reduce_policy_hub + requires reduce_policy_selector #endif // _CCCL_HAS_CONCEPTS() CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int( - ArchPolicies{}(::cuda::arch_id{CUB_PTX_ARCH / 10}) - .reduce_nondeterministic_policy + PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}) + .reduce_nondeterministic .block_threads)) void NondeterministicDeviceReduceAtomicKernel(InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, @@ -549,7 +549,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int( // Thread block type for reducing input tiles static constexpr agent_reduce_policy policy = - ArchPolicies{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).reduce_nondeterministic_policy; + PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).reduce_nondeterministic; // TODO(bgruber): pass policy directly as template argument to AgentReduce in C++20 using agent_policy_t = AgentReducePolicy -concept reduce_policy_hub = requires(T hub, ::cuda::arch_id arch) { - { hub(arch) } -> _CCCL_CONCEPT_VSTD::same_as; +concept reduce_policy_selector = requires(T hub, ::cuda::arch_id arch) { + { hub(arch) } -> _CCCL_CONCEPT_VSTD::same_as; { __needs_a_constexpr_value(hub(arch)) }; }; #endif // _CCCL_HAS_CONCEPTS() @@ -419,17 +417,14 @@ struct policy_hub using MaxPolicy = Policy1000; }; -struct arch_policies // equivalent to the policy_hub, holds policies for a bunch of CUDA architectures +struct policy_selector { accum_type accum_t; // TODO(bgruber): accum_type should become some CCCL global enum op_type operation_t; // TODO(bgruber): op_type should become some CCCL global enum int offset_size; int accum_size; - // IDEA(bgruber): instead of the constexpr function, we could also provide a map and move the - // selection mechanism elsewhere - - [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> reduce_arch_policy + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> reduce_policy { // if we don't have a tuning for sm100, fall through auto sm100_tuning = get_sm100_tuning(accum_t, operation_t, offset_size, accum_size); @@ -478,16 +473,16 @@ struct arch_policies // equivalent to the policy_hub, holds policies for a bunch }; #if _CCCL_HAS_CONCEPTS() -static_assert(reduce_policy_hub); +static_assert(reduce_policy_selector); #endif // _CCCL_HAS_CONCEPTS() // stateless version which can be passed to kernels template -struct arch_policies_from_types +struct policy_selector_from_types { - [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> reduce_arch_policy + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> reduce_policy { - constexpr auto policies = arch_policies{ + constexpr auto policies = policy_selector{ classify_accum_type(), classify_op(), int{sizeof(OffsetT)}, int{sizeof(AccumT)}}; return policies(arch); } diff --git a/cub/test/catch2_test_device_reduce_env.cu b/cub/test/catch2_test_device_reduce_env.cu index cd39f4ca14e..cb2e390b19d 100644 --- a/cub/test/catch2_test_device_reduce_env.cu +++ b/cub/test/catch2_test_device_reduce_env.cu @@ -67,8 +67,8 @@ TEST_CASE("Device reduce works with default environment", "[reduce][device]") REQUIRE(cudaSuccess == cub::detail::ptx_arch_id(arch_id, current_device)); int target_block_size = - cub::detail::reduce::arch_policies_from_types{}(arch_id) - .single_tile_policy.block_threads; + cub::detail::reduce::policy_selector_from_types{}(arch_id) + .single_tile.block_threads; num_items_t num_items = 1; c2h::device_vector d_block_size(1); @@ -107,7 +107,7 @@ TEST_CASE("Device sum works with default environment", "[reduce][device]") template struct reduce_tuning : cub::detail::reduce::tuning> { - _CCCL_API constexpr auto operator()(cuda::arch_id /*arch*/) const -> cub::detail::reduce::reduce_arch_policy + _CCCL_API constexpr auto operator()(cuda::arch_id /*arch*/) const -> cub::detail::reduce::reduce_policy { const auto policy = cub::detail::reduce::agent_reduce_policy{ BlockThreads, 1, 1, cub::BLOCK_REDUCE_WARP_REDUCTIONS, cub::LOAD_DEFAULT}; @@ -197,7 +197,7 @@ C2H_TEST("Device reduce uses environment", "[reduce][device]", requirements) cudaSuccess == cub::DeviceReduce::Reduce(nullptr, expected_bytes_allocated, d_in, d_out.begin(), num_items, op_t{}, init)); - using policy_t = cub::detail::reduce::arch_policies_from_types; + using policy_t = cub::detail::reduce::policy_selector_from_types; return cuda::std::array{ reinterpret_cast( cub::detail::reduce::DeviceReduceSingleTileKernel< @@ -223,7 +223,7 @@ C2H_TEST("Device reduce uses environment", "[reduce][device]", requirements) } else if constexpr (cub::detail::is_non_deterministic_v) { - using policy_t = cub::detail::reduce::arch_policies_from_types; + using policy_t = cub::detail::reduce::policy_selector_from_types; auto* raw_ptr = thrust::raw_pointer_cast(d_out.data()); REQUIRE( @@ -316,7 +316,7 @@ C2H_TEST("Device sum uses environment", "[reduce][device]", requirements) { REQUIRE(cudaSuccess == cub::DeviceReduce::Sum(nullptr, expected_bytes_allocated, d_in, d_out.begin(), num_items)); - using policy_t = cub::detail::reduce::arch_policies_from_types; + using policy_t = cub::detail::reduce::policy_selector_from_types; return cuda::std::array{ reinterpret_cast( cub::detail::reduce::DeviceReduceSingleTileKernel< @@ -342,7 +342,7 @@ C2H_TEST("Device sum uses environment", "[reduce][device]", requirements) } else if constexpr (cub::detail::is_non_deterministic_v) { - using policy_t = cub::detail::reduce::arch_policies_from_types; + using policy_t = cub::detail::reduce::policy_selector_from_types; auto* raw_ptr = thrust::raw_pointer_cast(d_out.data()); REQUIRE( From a43a607633358f2ecba8a4af990684eb07587276 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 9 Jan 2026 23:46:24 +0100 Subject: [PATCH 2/3] Add /diagnostics:caret --- cmake/CCCLBuildCompilerTargets.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/CCCLBuildCompilerTargets.cmake b/cmake/CCCLBuildCompilerTargets.cmake index dd0aae8dcc5..302efc6bbe1 100644 --- a/cmake/CCCLBuildCompilerTargets.cmake +++ b/cmake/CCCLBuildCompilerTargets.cmake @@ -154,6 +154,9 @@ function(cccl_build_compiler_targets) # See https://github.com/microsoft/STL/issues/696 append_option_if_available("/wd4494" cxx_compile_options) + # Get error messages with a little arrow indicating the error location more exactly + append_option_if_available("/diagnostics:caret" cxx_compile_options) + if (MSVC_TOOLSET_VERSION LESS 143) # winbase.h(9572): warning C5105: macro expansion producing 'defined' has undefined behavior append_option_if_available("/wd5105" cxx_compile_options) From e6dcc837467cca69af522fc713f444b2aa6308b4 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 9 Jan 2026 23:49:57 +0100 Subject: [PATCH 3/3] Add extra parentheses to please MSVC --- libcudacxx/include/cuda/std/__complex/roots.h | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/libcudacxx/include/cuda/std/__complex/roots.h b/libcudacxx/include/cuda/std/__complex/roots.h index 37a4f012ee7..17e5286b256 100644 --- a/libcudacxx/include/cuda/std/__complex/roots.h +++ b/libcudacxx/include/cuda/std/__complex/roots.h @@ -76,26 +76,20 @@ template // pre-check to see if we over/underflow: _Tp __x_abs_sq = ::cuda::std::fma(__re, __re, __im * __im); - // NVCC 12.9 seems to be eliminating some parentheses which makes MSVC unhappy. - _CCCL_DIAG_PUSH -#if _CCCL_CUDA_COMPILER(NVCC, <, 13, 0) - _CCCL_DIAG_SUPPRESS_MSVC(4554) // warning C4554: '<<': check operator precedence for possible error; use parentheses - // to clarify precedence -#endif // _CCCL_CUDA_COMPILER(NVCC, <, 13, 0) - // Get some bounds where __re +- |__x| won't overflow. // Doesn't need to be too exact, enough to cover extremal cases. // overflow bound = sqrt(MAX_FLOAT / 2) // underflow bound similar, but tweaked to allow for normalizing denormal calculation. + // The static_casts have extra parentheses around them to avoid MSVC's: + // warning C4554: '<<': check operator precedence for possible error; use parentheses to clarify precedence constexpr __uint_t __overflow_bound_exp = - (static_cast<__uint_t>((static_cast<__uint_t>(__max_exponent - 1) >> 1) + __exp_bias) << __mant_nbits) + ((static_cast<__uint_t>(((static_cast<__uint_t>(__max_exponent - 1)) >> 1) + __exp_bias)) << __mant_nbits) | __fp_explicit_bit_mask_of_v<_Tp>; constexpr __uint_t __underflow_bound_exp = - (static_cast<__uint_t>((static_cast<__uint_t>(-__max_exponent + __mant_nbits) >> 1) + __exp_bias) << __mant_nbits) + ((static_cast<__uint_t>(((static_cast<__uint_t>(-__max_exponent + __mant_nbits)) >> 1) + __exp_bias)) + << __mant_nbits) | __fp_explicit_bit_mask_of_v<_Tp>; - _CCCL_DIAG_POP - _Tp __overflow_bound = ::cuda::std::__fp_from_storage<_Tp>(__overflow_bound_exp); _Tp __underflow_bound = ::cuda::std::__fp_from_storage<_Tp>(__underflow_bound_exp);