diff --git a/cub/benchmarks/CMakeLists.txt b/cub/benchmarks/CMakeLists.txt
index 461154fe9e..1c3102d0d7 100644
--- a/cub/benchmarks/CMakeLists.txt
+++ b/cub/benchmarks/CMakeLists.txt
@@ -95,6 +95,7 @@ function(add_bench_dir bench_dir)
   add_bench(base_bench_target ${base_bench_name} "${bench_src}")
   add_dependencies(${benches_meta_target} ${base_bench_target})
   target_compile_definitions(${base_bench_target} PRIVATE TUNE_BASE=1)
+  target_compile_options(${base_bench_target} PRIVATE "--extended-lambda")
 
   if (CUB_ENABLE_TUNING)
     # tuning
@@ -105,7 +106,7 @@ function(add_bench_dir bench_dir)
       add_bench(bench_target ${tuning_name} "${bench_src}")
       # for convenience, make tuning variant buildable by default
       file(WRITE "${tuning_path}" "#pragma once\n#define TUNE_BASE 1\n")
-      target_compile_options(${bench_target} PRIVATE "-include${tuning_path}")
+      target_compile_options(${bench_target} PRIVATE "--extended-lambda" "-include${tuning_path}")
     else()
       # benchmarking
       register_cccl_benchmark("${bench_name}" "")
diff --git a/thrust/benchmarks/CMakeLists.txt b/thrust/benchmarks/CMakeLists.txt
index 6a0abd6836..a131f64bd9 100644
--- a/thrust/benchmarks/CMakeLists.txt
+++ b/thrust/benchmarks/CMakeLists.txt
@@ -77,6 +77,10 @@ function(add_bench_dir bench_dir)
       add_bench(base_bench_target ${bench_name} "${real_bench_src}")
       target_link_libraries(${bench_name} PRIVATE ${thrust_target})
       thrust_clone_target_properties(${bench_name} ${thrust_target})
+
+      if ("CUDA" STREQUAL "${config_device}")
+        target_compile_options(${bench_name} PRIVATE "--extended-lambda")
+      endif()
     endforeach()
   endforeach()
 endfunction()
diff --git a/thrust/benchmarks/bench/transform/basic.cu b/thrust/benchmarks/bench/transform/basic.cu
index 1253e3a28f..e2014e5080 100644
--- a/thrust/benchmarks/bench/transform/basic.cu
+++ b/thrust/benchmarks/bench/transform/basic.cu
@@ -29,6 +29,9 @@
 #include
 #include
 #include
+#include <thrust/iterator/zip_iterator.h> // NOTE(review): include targets were lost in transcription; reconstructed from usage below -- confirm against upstream
+#include <thrust/zip_function.h>
+#include <vector>
 
 #include
 
@@ -90,3 +93,120 @@ NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(types))
.set_name("base") .set_type_axes_names({"T{ct}"}) .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)); + +namespace babelstream +{ +// The benchmarks in this namespace are inspired by the BabelStream thrust version: +// https://github.com/UoB-HPC/BabelStream/blob/main/src/thrust/ThrustStream.cu + +// Modified from BabelStream to also work for integers +constexpr auto startA = 1; // BabelStream: 0.1 +constexpr auto startB = 2; // BabelStream: 0.2 +constexpr auto startC = 3; // BabelStream: 0.1 +constexpr auto startScalar = 4; // BabelStream: 0.4 + +using element_types = nvbench::type_list; +auto array_size_powers = std::vector{25}; + +template +static void mul(nvbench::state& state, nvbench::type_list) +{ + const auto n = static_cast(state.get_int64("Elements")); + thrust::device_vector b(n, startB); + thrust::device_vector c(n, startC); + + state.add_element_count(n); + state.add_global_memory_reads(n); + state.add_global_memory_writes(n); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) { + const T scalar = startScalar; + thrust::transform(c.begin(), c.end(), b.begin(), [=] __device__ __host__(const T& ci) { + return ci * scalar; + }); + }); +} + +NVBENCH_BENCH_TYPES(mul, NVBENCH_TYPE_AXES(element_types)) + .set_name("mul") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements", array_size_powers); + +template +static void add(nvbench::state& state, nvbench::type_list) +{ + const auto n = static_cast(state.get_int64("Elements")); + thrust::device_vector a(n, startA); + thrust::device_vector b(n, startB); + thrust::device_vector c(n, startC); + + state.add_element_count(n); + state.add_global_memory_reads(2 * n); + state.add_global_memory_writes(n); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) { + thrust::transform(a.begin(), a.end(), b.begin(), c.begin(), [] __device__ __host__(const T& ai, const T& bi) { + return ai + bi; + }); + }); +} + 
+NVBENCH_BENCH_TYPES(add, NVBENCH_TYPE_AXES(element_types))
+  .set_name("add")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements", array_size_powers);
+
+// Classic STREAM triad: a = b + scalar * c (2 reads + 1 write per element).
+template <typename T>
+static void triad(nvbench::state& state, nvbench::type_list<T>)
+{
+  const auto n = static_cast<std::size_t>(state.get_int64("Elements"));
+  thrust::device_vector<T> a(n, startA);
+  thrust::device_vector<T> b(n, startB);
+  thrust::device_vector<T> c(n, startC);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(2 * n);
+  state.add_global_memory_writes<T>(n);
+
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) {
+    const T scalar = startScalar;
+    thrust::transform(b.begin(), b.end(), c.begin(), a.begin(), [=] __device__ __host__(const T& bi, const T& ci) {
+      return bi + scalar * ci;
+    });
+  });
+}
+
+NVBENCH_BENCH_TYPES(triad, NVBENCH_TYPE_AXES(element_types))
+  .set_name("triad")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements", array_size_powers);
+
+// nstream: a += b + scalar * c, reading all three vectors through a zip iterator
+// (3 reads + 1 write per element; a is both input and output).
+template <typename T>
+static void nstream(nvbench::state& state, nvbench::type_list<T>)
+{
+  const auto n = static_cast<std::size_t>(state.get_int64("Elements"));
+  thrust::device_vector<T> a(n, startA);
+  thrust::device_vector<T> b(n, startB);
+  thrust::device_vector<T> c(n, startC);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(3 * n);
+  state.add_global_memory_writes<T>(n);
+
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) {
+    const T scalar = startScalar;
+    thrust::transform(
+      thrust::make_zip_iterator(a.begin(), b.begin(), c.begin()),
+      thrust::make_zip_iterator(a.end(), b.end(), c.end()),
+      a.begin(),
+      thrust::make_zip_function([=] __device__ __host__(const T& ai, const T& bi, const T& ci) {
+        return ai + bi + scalar * ci;
+      }));
+  });
+}
+
+NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(element_types))
+  .set_name("nstream")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements", array_size_powers);
+} // namespace babelstream