20 changes: 10 additions & 10 deletions c/parallel/src/reduce.cu
@@ -223,7 +223,7 @@ CUresult cccl_device_reduce_build_ex(

const auto offset_t = cccl_type_enum_to_name(cccl_type_enum::CCCL_UINT64);

const auto cub_arch_policies = [&] {
const auto policy_sel = [&] {
using namespace cub::detail::reduce;

auto accum_type = accum_type::other;
@@ -251,15 +251,15 @@ CUresult cccl_device_reduce_build_ex(
}

const int offset_size = int{sizeof(OffsetT)};
return arch_policies{accum_type, operation_t, offset_size, static_cast<int>(accum_t.size)};
return policy_selector{accum_type, operation_t, offset_size, static_cast<int>(accum_t.size)};
}();

// TODO(bgruber): drop this if tuning policies become formattable
std::stringstream cub_arch_policies_str;
cub_arch_policies_str << cub_arch_policies(cuda::to_arch_id(cuda::compute_capability{cc_major, cc_minor}));
std::stringstream policy_sel_str;
policy_sel_str << policy_sel(cuda::to_arch_id(cuda::compute_capability{cc_major, cc_minor}));

auto policy_hub_expr =
std::format("cub::detail::reduce::arch_policies_from_types<{}, {}, {}>", accum_cpp, offset_t, op_name);
std::format("cub::detail::reduce::policy_selector_from_types<{}, {}, {}>", accum_cpp, offset_t, op_name);

std::string final_src = std::format(
R"XXX(
@@ -284,7 +284,7 @@ static_assert(device_reduce_policy()(::cuda::arch_id{{CUB_PTX_ARCH / 10}}) == {7
output_iterator_src, // 4
op_src, // 5
policy_hub_expr, // 6
cub_arch_policies_str.view()); // 7
policy_sel_str.view()); // 7

#if false // CCCL_DEBUGGING_SWITCH
fflush(stderr);
@@ -371,7 +371,7 @@ static_assert(device_reduce_policy()(::cuda::arch_id{{CUB_PTX_ARCH / 10}}) == {7
build->cubin_size = result.size;
build->accumulator_size = accum_t.size;
build->determinism = determinism;
build->runtime_policy = new cub::detail::reduce::arch_policies{cub_arch_policies};
build->runtime_policy = new cub::detail::reduce::policy_selector{policy_sel};
}
catch (const std::exception& exc)
{
@@ -423,7 +423,7 @@ CUresult cccl_device_reduce(
indirect_arg_t{init},
stream,
::cuda::std::identity{},
*static_cast<cub::detail::reduce::arch_policies*>(build.runtime_policy),
*static_cast<cub::detail::reduce::policy_selector*>(build.runtime_policy),
reduce::reduce_kernel_source{build},
cub::detail::CudaDriverLauncherFactory{cu_device, build.cc});

@@ -478,7 +478,7 @@ CUresult cccl_device_reduce_nondeterministic(
indirect_arg_t{init},
stream,
::cuda::std::identity{},
*static_cast<cub::detail::reduce::arch_policies*>(build.runtime_policy),
*static_cast<cub::detail::reduce::policy_selector*>(build.runtime_policy),
reduce::reduce_kernel_source{build},
cub::detail::CudaDriverLauncherFactory{cu_device, build.cc});

@@ -512,7 +512,7 @@ CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* build_ptr

using namespace cub::detail::reduce;
std::unique_ptr<char[]> cubin(static_cast<char*>(build_ptr->cubin));
std::unique_ptr<arch_policies> policy(static_cast<arch_policies*>(build_ptr->runtime_policy));
std::unique_ptr<policy_selector> policy(static_cast<policy_selector*>(build_ptr->runtime_policy));
check(cuLibraryUnload(build_ptr->library));
}
catch (const std::exception& exc)
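Note on the rename in this file: the runtime policy object is a stateless callable mapping a cuda::arch_id to a cub::reduce_policy, which is why it can be stored type-erased in build->runtime_policy and recovered with a static_cast. A minimal sketch of such a selector, modeled on the benchmark structs further down in this diff (my_policy_selector is hypothetical, and the four-member aggregate order is assumed from the `return {{}, {}, {}, policy};` statement below):

// Sketch only; member order and constructor arguments are assumed from
// the benchmark code in this diff, not from the CUB headers themselves.
struct my_policy_selector
{
  // Must be stateless: dispatch_reduce.cuh static_asserts is_empty_v so
  // the type alone can be passed to the kernel.
  _CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_policy
  {
    // 256 threads per block, 16 items per thread, 4-way vectorized loads
    const auto policy = cub::agent_reduce_policy{
      256, 16, 4, cub::BLOCK_REDUCE_WARP_REDUCTIONS, cub::LOAD_DEFAULT};
    return {policy, policy, policy, policy}; // same tuning for every kernel variant
  }
};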
3 changes: 3 additions & 0 deletions cmake/CCCLBuildCompilerTargets.cmake
@@ -154,6 +154,9 @@ function(cccl_build_compiler_targets)
# See https://github.com/microsoft/STL/issues/696
append_option_if_available("/wd4494" cxx_compile_options)

# Get error messages with a caret indicating the exact error location
append_option_if_available("/diagnostics:caret" cxx_compile_options)

if (MSVC_TOOLSET_VERSION LESS 143)
# winbase.h(9572): warning C5105: macro expansion producing 'defined' has undefined behavior
append_option_if_available("/wd5105" cxx_compile_options)
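/diagnostics:caret is a standard MSVC flag (its siblings are /diagnostics:column and /diagnostics:classic); append_option_if_available keeps it from reaching compilers that would reject it. A stand-alone sketch of the same guard in plain CMake, with a hypothetical target name:

# Apply the caret diagnostics flag only when compiling C++ with MSVC.
target_compile_options(my_target PRIVATE
  $<$<CXX_COMPILER_ID:MSVC>:/diagnostics:caret>)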
10 changes: 5 additions & 5 deletions cub/benchmarks/bench/reduce/base.cuh
@@ -11,9 +11,9 @@
#include <nvbench_helper.cuh>

#if !TUNE_BASE
struct arch_policies
struct policy_selector
{
_CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_arch_policy
_CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_policy
{
const auto [items, threads] = cub::detail::scale_mem_bound(TUNE_THREADS_PER_BLOCK, TUNE_ITEMS_PER_THREAD);
const auto policy = cub::agent_reduce_policy{
@@ -60,7 +60,7 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
::cuda::std::execution::prop{
::cuda::execution::__get_tuning_t,
::cuda::std::execution::env{
::cuda::std::execution::prop{::cub::detail::reduce::get_tuning_query_t, arch_policies{}}}}
::cuda::std::execution::prop{::cub::detail::reduce::get_tuning_query_t, policy_selector{}}}}
# endif
};
static_assert(::cuda::std::execution::__queryable_with<decltype(env), ::cuda::mr::__get_memory_resource_t>);
@@ -84,7 +84,7 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
transform_op
#if !TUNE_BASE
,
arch_policies{}
policy_selector{}
#endif
);

@@ -104,7 +104,7 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
transform_op
#if !TUNE_BASE
,
arch_policies{}
policy_selector{}
#endif
);
});
10 changes: 5 additions & 5 deletions cub/benchmarks/bench/reduce/nondeterministic.cu
@@ -13,9 +13,9 @@
// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1

#if !TUNE_BASE
struct arch_policies
struct policy_selector
{
_CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_arch_policy
_CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_policy
{
const auto [items, threads] = cub::detail::scale_mem_bound(TUNE_THREADS_PER_BLOCK, TUNE_ITEMS_PER_THREAD);
const auto policy = cub::agent_reduce_policy{
@@ -24,7 +24,7 @@ struct arch_policies
1 << TUNE_ITEMS_PER_VEC_LOAD_POW2,
cub::BLOCK_REDUCE_WARP_REDUCTIONS_NONDETERMINISTIC,
cub::LOAD_DEFAULT};
return {{}, {}, {}, policy}; // Only reduce_nondeterministic_policy is used
return {{}, {}, {}, policy}; // Only reduce_nondeterministic is used
}
};
#endif // !TUNE_BASE
@@ -66,7 +66,7 @@ void nondeterministic_sum(nvbench::state& state, nvbench::type_list<T, OffsetT>)
transform_op
#if !TUNE_BASE
,
arch_policies{}
policy_selector{}
#endif
);

@@ -86,7 +86,7 @@ void nondeterministic_sum(nvbench::state& state, nvbench::type_list<T, OffsetT>)
transform_op
#if !TUNE_BASE
,
arch_policies{}
policy_selector{}
#endif
);
});
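The nondeterministic variant benchmarked above may combine partial results in a run-dependent order, which can change the answer for floating-point inputs because addition is not associative. A tiny host-side illustration (values chosen for effect, unrelated to this PR):

// 1.0f is absorbed when added to 1e20f first, so the two groupings differ:
static_assert((1e20f + -1e20f) + 1.0f == 1.0f);
static_assert(1e20f + (-1e20f + 1.0f) == 0.0f);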
8 changes: 4 additions & 4 deletions cub/benchmarks/bench/transform_reduce/sum.cu
@@ -14,9 +14,9 @@
#endif

#if !TUNE_BASE
struct arch_policies
struct policy_selector
{
_CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_arch_policy
_CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_policy
{
const auto [items, threads] = cub::detail::scale_mem_bound(TUNE_THREADS_PER_BLOCK, TUNE_ITEMS_PER_THREAD);
const auto policy = cub::agent_reduce_policy{
@@ -133,7 +133,7 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
transform_op_t{}
# if !TUNE_BASE
,
arch_policies{}
policy_selector{}
# endif
);

@@ -153,7 +153,7 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
transform_op_t{}
# if !TUNE_BASE
,
arch_policies{}
policy_selector{}
# endif
);
});
4 changes: 2 additions & 2 deletions cub/cub/device/device_reduce.cuh
@@ -147,7 +147,7 @@ private:
using reduce_tuning_t = ::cuda::std::execution::__query_result_or_t<
TuningEnvT,
detail::reduce::get_tuning_query_t,
detail::reduce::arch_policies_from_types<accum_t, offset_t, ReductionOpT>>;
detail::reduce::policy_selector_from_types<accum_t, offset_t, ReductionOpT>>;

return detail::reduce::dispatch<accum_t>(
d_temp_storage,
@@ -221,7 +221,7 @@ private:
using reduce_tuning_t = ::cuda::std::execution::__query_result_or_t<
TuningEnvT,
detail::reduce::get_tuning_query_t,
detail::reduce::arch_policies_from_types<accum_t, offset_t, ReductionOpT>>;
detail::reduce::policy_selector_from_types<accum_t, offset_t, ReductionOpT>>;

return detail::reduce::dispatch_nondeterministic<accum_t>(
d_temp_storage,
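The __query_result_or_t alias above resolves get_tuning_query_t against the tuning environment and falls back to the type-derived default when no tuning is attached. A sketch of attaching a custom selector, following the benchmark env construction earlier in this diff (my_policy_selector is the hypothetical functor sketched above; the surrounding call plumbing is omitted):

// Sketch only: wrap a selector so the dispatch layer's query finds it.
auto tuning_env = ::cuda::std::execution::env{
  ::cuda::std::execution::prop{::cub::detail::reduce::get_tuning_query_t, my_policy_selector{}}};
// Without such a prop, reduce_tuning_t falls back to
// detail::reduce::policy_selector_from_types<accum_t, offset_t, ReductionOpT>.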
58 changes: 29 additions & 29 deletions cub/cub/device/dispatch/dispatch_reduce.cuh
@@ -45,7 +45,7 @@ CUB_NAMESPACE_BEGIN

namespace detail::reduce
{
template <typename ArchPolicies,
template <typename PolicySelector,
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetT,
@@ -55,12 +55,12 @@ template <typename ArchPolicies,
typename TransformOpT>
struct DeviceReduceKernelSource
{
// ArchPolicies must be stateless, so we can pass the type to the kernel
static_assert(::cuda::std::is_empty_v<ArchPolicies>);
// PolicySelector must be stateless, so we can pass the type to the kernel
static_assert(::cuda::std::is_empty_v<PolicySelector>);

CUB_DEFINE_KERNEL_GETTER(
SingleTileKernel,
DeviceReduceSingleTileKernel<ArchPolicies,
DeviceReduceSingleTileKernel<PolicySelector,
InputIteratorT,
OutputIteratorT,
OffsetT,
@@ -70,11 +70,11 @@ struct DeviceReduceKernelSource
TransformOpT>)

CUB_DEFINE_KERNEL_GETTER(
ReductionKernel, DeviceReduceKernel<ArchPolicies, InputIteratorT, OffsetT, ReductionOpT, AccumT, TransformOpT>)
ReductionKernel, DeviceReduceKernel<PolicySelector, InputIteratorT, OffsetT, ReductionOpT, AccumT, TransformOpT>)

CUB_DEFINE_KERNEL_GETTER(
SingleTileSecondKernel,
DeviceReduceSingleTileKernel<ArchPolicies,
DeviceReduceSingleTileKernel<PolicySelector,
AccumT*,
OutputIteratorT,
int, // Always used with int offsets
@@ -90,15 +90,15 @@

// TODO(bgruber): remove in CCCL 4.0
template <typename PolicyHub>
struct arch_policies_from_hub
struct policy_selector_from_hub
{
// this is only called in device code, so we can ignore the arch parameter
_CCCL_DEVICE_API constexpr auto operator()(::cuda::arch_id /*arch*/) const -> reduce_arch_policy
_CCCL_DEVICE_API constexpr auto operator()(::cuda::arch_id /*arch*/) const -> reduce_policy
{
using ap = typename PolicyHub::MaxPolicy::ActivePolicy;
using ap_reduce = typename ap::ReducePolicy;
using ap_single_tile = typename ap::SingleTilePolicy;
return reduce_arch_policy{
return reduce_policy{
agent_reduce_policy{
ap_reduce::BLOCK_THREADS,
ap_reduce::ITEMS_PER_THREAD,
@@ -153,7 +153,7 @@ template <typename InputIteratorT,
typename TransformOpT = ::cuda::std::identity,
typename PolicyHub = detail::reduce::policy_hub<AccumT, OffsetT, ReductionOpT>,
typename KernelSource = detail::reduce::DeviceReduceKernelSource<
detail::reduce::arch_policies_from_hub<PolicyHub>,
detail::reduce::policy_selector_from_hub<PolicyHub>,
InputIteratorT,
OutputIteratorT,
OffsetT,
@@ -594,7 +594,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_passes(
InitT init,
cudaStream_t stream,
TransformOpT transform_op,
reduce_arch_policy active_policy,
reduce_policy active_policy,
KernelSource kernel_source,
KernelLauncherFactory launcher_factory)
{
@@ -606,10 +606,10 @@
}

// Init regular kernel configuration
const auto tile_size = active_policy.reduce_policy.block_threads * active_policy.reduce_policy.items_per_thread;
const auto tile_size = active_policy.reduce.block_threads * active_policy.reduce.items_per_thread;
int sm_occupancy;
if (const auto error = CubDebug(launcher_factory.MaxSmOccupancy(
sm_occupancy, kernel_source.ReductionKernel(), active_policy.reduce_policy.block_threads)))
sm_occupancy, kernel_source.ReductionKernel(), active_policy.reduce.block_threads)))
{
return error;
}
@@ -652,14 +652,14 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_passes(
_CubLog("Invoking DeviceReduceKernel<<<%lu, %d, 0, %lld>>>(), %d items "
"per thread, %d SM occupancy\n",
(unsigned long) reduce_grid_size,
active_policy.reduce_policy.block_threads,
active_policy.reduce.block_threads,
(long long) stream,
active_policy.reduce_policy.items_per_thread,
active_policy.reduce.items_per_thread,
sm_occupancy);
#endif // CUB_DEBUG_LOG

// Invoke DeviceReduceKernel
launcher_factory(reduce_grid_size, active_policy.reduce_policy.block_threads, 0, stream)
launcher_factory(reduce_grid_size, active_policy.reduce.block_threads, 0, stream)
.doit(kernel_source.ReductionKernel(), d_in, d_block_reductions, num_items, even_share, reduction_op, transform_op);

// Check for failure to launch
@@ -678,13 +678,13 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_passes(
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), "
"%d items per thread\n",
active_policy.single_tile_policy.block_threads,
active_policy.single_tile.block_threads,
(long long) stream,
active_policy.single_tile_policy.items_per_thread);
active_policy.single_tile.items_per_thread);
#endif // CUB_DEBUG_LOG

// Invoke DeviceReduceSingleTileKernel
launcher_factory(1, active_policy.single_tile_policy.block_threads, 0, stream)
launcher_factory(1, active_policy.single_tile.block_threads, 0, stream)
.doit(kernel_source.SingleTileSecondKernel(),
d_block_reductions,
d_out,
@@ -736,12 +736,12 @@ template <
typename TransformOpT = ::cuda::std::identity,
typename AccumT =
decltype(select_accum_t<InputIteratorT, InitT, ReductionOpT, TransformOpT>(static_cast<OverrideAccumT*>(nullptr))),
typename ArchPolicies = arch_policies_from_types<AccumT, OffsetT, ReductionOpT>,
typename PolicySelector = policy_selector_from_types<AccumT, OffsetT, ReductionOpT>,
typename KernelSource =
DeviceReduceKernelSource<ArchPolicies, InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, InitT, AccumT, TransformOpT>,
DeviceReduceKernelSource<PolicySelector, InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, InitT, AccumT, TransformOpT>,
typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY>
#if _CCCL_HAS_CONCEPTS()
requires reduce_policy_hub<ArchPolicies>
requires reduce_policy_selector<PolicySelector>
#endif // _CCCL_HAS_CONCEPTS()
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
void* d_temp_storage,
Expand All @@ -753,7 +753,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
InitT init,
cudaStream_t stream,
TransformOpT transform_op = {},
ArchPolicies arch_policies = {},
PolicySelector policy_selector = {},
KernelSource kernel_source = {},
KernelLauncherFactory launcher_factory = {})
{
@@ -764,16 +764,16 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
return error;
}

const reduce_arch_policy active_policy = arch_policies(arch_id);
const reduce_policy active_policy = policy_selector(arch_id);
#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
NV_IF_TARGET(NV_IS_HOST,
(std::stringstream ss; ss << active_policy;
_CubLog("Dispatching DeviceReduce to arch %d with tuning: %s\n", (int) arch_id, ss.str().c_str());))
#endif // !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)

// Check for small, single tile size
if (num_items <= static_cast<OffsetT>(
active_policy.single_tile_policy.block_threads * active_policy.single_tile_policy.items_per_thread))
if (num_items
<= static_cast<OffsetT>(active_policy.single_tile.block_threads * active_policy.single_tile.items_per_thread))
{
// Return if the caller is simply requesting the size of the storage allocation
if (d_temp_storage == nullptr)
@@ -786,13 +786,13 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), "
"%d items per thread\n",
active_policy.single_tile_policy.block_threads,
active_policy.single_tile.block_threads,
(long long) stream,
active_policy.single_tile_policy.items_per_thread);
active_policy.single_tile.items_per_thread);
#endif // CUB_DEBUG_LOG

// Invoke single_reduce_sweep_kernel
launcher_factory(1, active_policy.single_tile_policy.block_threads, 0, stream)
launcher_factory(1, active_policy.single_tile.block_threads, 0, stream)
.doit(kernel_source.SingleTileKernel(), d_in, d_out, num_items, reduction_op, init, transform_op);

// Check for failure to launch
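The single-tile cutoff that dispatch() compares against num_items is simply the product of the single-tile policy's two knobs. Worked through with assumed numbers (not taken from this PR):

// With 256 threads per block and 16 items per thread, one tile covers
// 4096 items, so inputs of at most 4096 elements skip the two-pass path
// and run DeviceReduceSingleTileKernel directly.
constexpr int block_threads      = 256;
constexpr int items_per_thread   = 16;
constexpr int single_tile_cutoff = block_threads * items_per_thread;
static_assert(single_tile_cutoff == 4096);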