Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion c/parallel/src/kernels/iterators.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ struct {0}_proxy_t {{
struct {0} {{
using iterator_category = cuda::std::random_access_iterator_tag;
using difference_type = DIFF_T;
using value_type = void;
using value_type = VALUE_T;
using pointer = {0}_proxy_t*;
using reference = {0}_proxy_t;
__device__ {0}_proxy_t operator*() const {{ return {{state}}; }}
Expand Down
225 changes: 88 additions & 137 deletions c/parallel/src/transform.cu

Large diffs are not rendered by default.

20 changes: 13 additions & 7 deletions cub/benchmarks/bench/transform/babelstream.cu
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// Because CUB cannot inspect the transformation function, we cannot add any tunings based on the results of this
// benchmark. Its main use is to detect regressions.

// %RANGE% TUNE_BIF_BIAS bif -16:16:4
// %RANGE% TUNE_ALGORITHM alg 0:4:1
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:2:1

// the following parameters only apply if TUNE_ALGORITHM == 1 (vectorized)
// %RANGE% TUNE_VEC_SIZE_POW2 vsp2 1:6:1
// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1

#if !TUNE_BASE && TUNE_ALGORITHM != 1 && (TUNE_VEC_SIZE_POW2 != 1 || TUNE_VECTORS_PER_THREAD != 1)
# error "Non-vectorized algorithms require vector size and vectors per thread to be 1 since they ignore the parameters"
#endif // !TUNE_BASE && TUNE_ALGORITHM != 1 && (TUNE_VEC_SIZE_POW2 != 1 || TUNE_VECTORS_PER_THREAD != 1)

#include "common.h"

Expand All @@ -25,9 +31,9 @@ using element_types =
#endif

// BabelStream uses 2^25, H200 can fit 2^31 int128s
// 2^20 chars / 2^16 int128 saturate V100 (min_bif =12 * SM count =80)
// 2^21 chars / 2^17 int128 saturate A100 (min_bif =16 * SM count =108)
// 2^23 chars / 2^19 int128 saturate H100/H200 HBM3 (min_bif =32or48 * SM count =132)
// 2^20 chars / 2^16 int128 saturate V100 (min_bytes_in_flight =12 * SM count =80)
// 2^21 chars / 2^17 int128 saturate A100 (min_bytes_in_flight =16 * SM count =108)
// 2^23 chars / 2^19 int128 saturate H100/H200 HBM3 (min_bytes_in_flight =32 or 48 * SM count =132)
// inline auto array_size_powers = std::vector<nvbench::int64_t>{28};
inline auto array_size_powers = nvbench::range(16, 32, 4);

Expand Down
95 changes: 58 additions & 37 deletions cub/benchmarks/bench/transform/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@

// keep checks at the top so compilation of discarded variants fails really fast
#include <cub/device/dispatch/dispatch_transform.cuh>
#if !TUNE_BASE && TUNE_ALGORITHM == 2
#if !TUNE_BASE
# if _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1
# error "When tuning, this benchmark does not support being compiled for multiple architectures"
# endif
# if (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 2 (ublkcp) below sm90"
# endif
#endif
# if TUNE_ALGORITHM == 3
# if (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 3 (ublkcp) below sm90"
# endif
# endif // TUNE_ALGORITHM == 3
#endif // !TUNE_BASE

#include <cub/util_namespace.cuh>

Expand All @@ -23,34 +25,51 @@

#include <nvbench_helper.cuh>

template <typename RandomAccessIteratorOut, typename... RandomAccessIteratorsIn>
#if TUNE_BASE
using policy_hub_t =
cub::detail::transform::policy_hub</* stable address */ false,
/* dense output */ true,
::cuda::std::tuple<RandomAccessIteratorsIn...>,
RandomAccessIteratorOut>;
#else
struct policy_hub_t
#if !TUNE_BASE
struct policy_selector
{
struct max_policy : cub::ChainedPolicy<500, max_policy, max_policy>
_CCCL_API constexpr auto operator()(cuda::arch_id) const -> cub::detail::transform::transform_policy
{
static constexpr int min_bif = cub::detail::transform::arch_to_min_bytes_in_flight(__CUDA_ARCH_LIST__);
const int min_bytes_in_flight =
cub::detail::transform::arch_to_min_bytes_in_flight(::cuda::arch_id{__CUDA_ARCH_LIST__ / 10}) + TUNE_BIF_BIAS;
# if TUNE_ALGORITHM == 0
static constexpr auto algorithm = cub::detail::transform::Algorithm::prefetch;
constexpr auto algorithm = cub::detail::transform::Algorithm::prefetch;
constexpr auto policy = cub::detail::transform::prefetch_policy{
TUNE_THREADS
# ifdef TUNE_ITEMS_PER_THREAD_NO_INPUT
,
TUNE_ITEMS_PER_THREAD_NO_INPUT
# endif // TUNE_ITEMS_PER_THREAD_NO_INPUT
};
return {min_bytes_in_flight, algorithm, policy, {}, {}};
# elif TUNE_ALGORITHM == 1
static constexpr auto algorithm = cub::detail::transform::Algorithm::ublkcp;
# else
constexpr auto algorithm = cub::detail::transform::Algorithm::vectorized;
constexpr auto policy = cub::detail::transform::vectorized_policy{
TUNE_THREADS,
(1 << TUNE_VEC_SIZE_POW2) * TUNE_VECTORS_PER_THREAD,
(1 << TUNE_VEC_SIZE_POW2)
# ifdef TUNE_ITEMS_PER_THREAD_NO_INPUT
,
TUNE_ITEMS_PER_THREAD_NO_INPUT
# endif // TUNE_ITEMS_PER_THREAD_NO_INPUT
};
return {min_bytes_in_flight, algorithm, {}, policy, {}};
# elif TUNE_ALGORITHM == 2
constexpr auto algorithm = cub::detail::transform::Algorithm::memcpy_async;
constexpr auto policy =
cub::detail::transform::async_copy_policy{TUNE_THREADS, cub::detail::transform::ldgsts_size_and_align};
return {min_bytes_in_flight, algorithm, {}, {}, policy};
# elif TUNE_ALGORITHM == 3
constexpr auto algorithm = cub::detail::transform::Algorithm::ublkcp;
constexpr auto policy = cub::detail::transform::async_copy_policy{
TUNE_THREADS, cub::detail::transform::bulk_copy_alignment(::cuda::arch_id{__CUDA_ARCH_LIST__ / 10})};
return {min_bytes_in_flight, algorithm, {}, {}, policy};
# else // TUNE_ALGORITHM
# error Policy hub does not yet implement the specified value for algorithm
# endif

using algo_policy =
::cuda::std::_If<algorithm == cub::detail::transform::Algorithm::prefetch,
cub::detail::transform::prefetch_policy_t<TUNE_THREADS>,
cub::detail::transform::async_copy_policy_t<TUNE_THREADS, __CUDA_ARCH_LIST__ == 900 ? 128 : 16>>;
};
# endif // TUNE_ALGORITHM
}
};
#endif
#endif // !TUNE_BASE

template <typename OffsetT, typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename TransformOp>
void bench_transform(nvbench::state& state,
Expand All @@ -60,15 +79,17 @@ void bench_transform(nvbench::state& state,
TransformOp transform_op)
{
state.exec(nvbench::exec_tag::gpu, [&](const nvbench::launch& launch) {
cub::detail::transform::dispatch_t<
cub::detail::transform::requires_stable_address::no,
OffsetT,
::cuda::std::tuple<RandomAccessIteratorsIn...>,
RandomAccessIteratorOut,
cub::detail::transform::always_true_predicate,
TransformOp,
policy_hub_t<RandomAccessIteratorOut, RandomAccessIteratorsIn...>>::
dispatch(
inputs, output, num_items, cub::detail::transform::always_true_predicate{}, transform_op, launch.get_stream());
cub::detail::transform::dispatch<cub::detail::transform::requires_stable_address::no>(
inputs,
output,
num_items,
cub::detail::transform::always_true_predicate{},
transform_op,
launch.get_stream()
#if !TUNE_BASE
,
policy_selector{}
#endif // !TUNE_BASE
);
});
}
14 changes: 10 additions & 4 deletions cub/benchmarks/bench/transform/complex_cmp.cu
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// Because CUB cannot inspect the transformation function, we cannot add any tunings based on the results of this
// benchmark. Its main use is to detect regressions.

// %RANGE% TUNE_BIF_BIAS bif -16:16:4
// %RANGE% TUNE_ALGORITHM alg 0:4:1
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// the following parameters only apply if TUNE_ALGORITHM == 1 (vectorized)
// %RANGE% TUNE_VEC_SIZE_POW2 vsp2 1:6:1
// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1

#if !TUNE_BASE && TUNE_ALGORITHM != 1 && (TUNE_VEC_SIZE_POW2 != 1 || TUNE_VECTORS_PER_THREAD != 1)
# error "Non-vectorized algorithms require vector size and vectors per thread to be 1 since they ignore the parameters"
#endif // !TUNE_BASE && TUNE_ALGORITHM != 1 && (TUNE_VEC_SIZE_POW2 != 1 || TUNE_VECTORS_PER_THREAD != 1)

#include "common.h"

Expand Down
14 changes: 10 additions & 4 deletions cub/benchmarks/bench/transform/fib.cu
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// Because CUB cannot inspect the transformation function, we cannot add any tunings based on the results of this
// benchmark. Its main use is to detect regressions.

// %RANGE% TUNE_BIF_BIAS bif -16:16:4
// %RANGE% TUNE_ALGORITHM alg 0:4:1
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// the following parameters only apply if TUNE_ALGORITHM == 1 (vectorized)
// %RANGE% TUNE_VEC_SIZE_POW2 vsp2 1:6:1
// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1

#if !TUNE_BASE && TUNE_ALGORITHM != 1 && (TUNE_VEC_SIZE_POW2 != 1 || TUNE_VECTORS_PER_THREAD != 1)
# error "Non-vectorized algorithms require vector size and vectors per thread to be 1 since they ignore the parameters"
#endif // !TUNE_BASE && TUNE_ALGORITHM != 1 && (TUNE_VEC_SIZE_POW2 != 1 || TUNE_VECTORS_PER_THREAD != 1)

#include "common.h"

Expand Down
22 changes: 18 additions & 4 deletions cub/benchmarks/bench/transform/fill.cu
Original file line number Diff line number Diff line change
@@ -1,11 +1,25 @@
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// Because CUB cannot inspect the transformation function, we cannot add any tunings based on the results of this
// benchmark. Its main use is to detect regressions.

// %RANGE% TUNE_BIF_BIAS bif -16:16:4
// for filling, we can only use the prefetch and the vectorized algorithms
// %RANGE% TUNE_ALGORITHM alg 0:2:1
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:3:1

// the following parameters only apply if TUNE_ALGORITHM == 0 (prefetch)
// %RANGE% TUNE_ITEMS_PER_THREAD_NO_INPUT ipt 1:32:1

// the following parameters only apply if TUNE_ALGORITHM == 1 (vectorized)
// %RANGE% TUNE_VEC_SIZE_POW2 vsp2 1:6:1
// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1

#if !TUNE_BASE && TUNE_ALGORITHM != 0 && (TUNE_ITEMS_PER_THREAD_NO_INPUT != 1)
# error "Non-prefetch algorithms require the no input items per thread to be 1 since they ignore the parameters"
#endif // !TUNE_BASE && TUNE_ALGORITHM != 0 && (TUNE_ITEMS_PER_THREAD_NO_INPUT != 1)

#if !TUNE_BASE && TUNE_ALGORITHM != 1 && (TUNE_VEC_SIZE_POW2 != 1 || TUNE_VECTORS_PER_THREAD != 1)
# error "Non-vectorized algorithms require vector size and vectors per thread to be 1 since they ignore the parameters"
#endif // !TUNE_BASE && TUNE_ALGORITHM != 1 && (TUNE_VEC_SIZE_POW2 != 1 || TUNE_VECTORS_PER_THREAD != 1)

#include "common.h"

Expand Down
12 changes: 12 additions & 0 deletions cub/benchmarks/bench/transform/grayscale.cu
Original file line number Diff line number Diff line change
@@ -1,6 +1,18 @@
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// %RANGE% TUNE_BIF_BIAS bif -16:16:4
// %RANGE% TUNE_ALGORITHM alg 0:4:1
// %RANGE% TUNE_THREADS tpb 128:1024:128

// the following parameters only apply if TUNE_ALGORITHM == 1 (vectorized)
// %RANGE% TUNE_VEC_SIZE_POW2 vsp2 1:6:1
// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1

#if !TUNE_BASE && TUNE_ALGORITHM != 1 && (TUNE_VEC_SIZE_POW2 != 1 || TUNE_VECTORS_PER_THREAD != 1)
# error "Non-vectorized algorithms require vector size and vectors per thread to be 1 since they ignore the parameters"
#endif // !TUNE_BASE && TUNE_ALGORITHM != 1 && (TUNE_VEC_SIZE_POW2 != 1 || TUNE_VECTORS_PER_THREAD != 1)

#include "common.h"

template <typename T>
Expand Down
14 changes: 10 additions & 4 deletions cub/benchmarks/bench/transform/heavy.cu
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// Because CUB cannot inspect the transformation function, we cannot add any tunings based on the results of this
// benchmark. Its main use is to detect regressions.

// %RANGE% TUNE_BIF_BIAS bif -16:16:4
// %RANGE% TUNE_ALGORITHM alg 0:4:1
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// the following parameters only apply if TUNE_ALGORITHM == 1 (vectorized)
// %RANGE% TUNE_VEC_SIZE_POW2 vsp2 1:6:1
// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1

#if !TUNE_BASE && TUNE_ALGORITHM != 1 && (TUNE_VEC_SIZE_POW2 != 1 || TUNE_VECTORS_PER_THREAD != 1)
# error "Non-vectorized algorithms require vector size and vectors per thread to be 1 since they ignore the parameters"
#endif // !TUNE_BASE && TUNE_ALGORITHM != 1 && (TUNE_VEC_SIZE_POW2 != 1 || TUNE_VECTORS_PER_THREAD != 1)

#include "common.h"

Expand Down
Loading
Loading