From c64da3141680ad2fa9a1e826e1488872df8cf085 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber
Date: Thu, 4 Jul 2024 02:32:10 +0200
Subject: [PATCH] Add BabelStream flavors as thrust::transform benchmarks
 (#1921)

See BabelStream Thrust implementation:
https://github.com/UoB-HPC/BabelStream/blob/main/src/thrust/ThrustStream.cu

Co-authored-by: Georgii Evtushenko
---
 cub/benchmarks/CMakeLists.txt              |   3 +-
 thrust/benchmarks/CMakeLists.txt           |   4 +
 thrust/benchmarks/bench/transform/basic.cu | 120 +++++++++++++++++++++
 3 files changed, 126 insertions(+), 1 deletion(-)

diff --git a/cub/benchmarks/CMakeLists.txt b/cub/benchmarks/CMakeLists.txt
index 461154fe9e..1c3102d0d7 100644
--- a/cub/benchmarks/CMakeLists.txt
+++ b/cub/benchmarks/CMakeLists.txt
@@ -95,6 +95,7 @@ function(add_bench_dir bench_dir)
     add_bench(base_bench_target ${base_bench_name} "${bench_src}")
     add_dependencies(${benches_meta_target} ${base_bench_target})
     target_compile_definitions(${base_bench_target} PRIVATE TUNE_BASE=1)
+    target_compile_options(${base_bench_target} PRIVATE "--extended-lambda")
 
     if (CUB_ENABLE_TUNING)
       # tuning
@@ -105,7 +106,7 @@ function(add_bench_dir bench_dir)
       add_bench(bench_target ${tuning_name} "${bench_src}")
       # for convenience, make tuning variant buildable by default
       file(WRITE "${tuning_path}" "#pragma once\n#define TUNE_BASE 1\n")
-      target_compile_options(${bench_target} PRIVATE "-include${tuning_path}")
+      target_compile_options(${bench_target} PRIVATE "--extended-lambda" "-include${tuning_path}")
     else()
       # benchmarking
       register_cccl_benchmark("${bench_name}" "")
diff --git a/thrust/benchmarks/CMakeLists.txt b/thrust/benchmarks/CMakeLists.txt
index 6a0abd6836..a131f64bd9 100644
--- a/thrust/benchmarks/CMakeLists.txt
+++ b/thrust/benchmarks/CMakeLists.txt
@@ -77,6 +77,10 @@ function(add_bench_dir bench_dir)
       add_bench(base_bench_target ${bench_name} "${real_bench_src}")
       target_link_libraries(${bench_name} PRIVATE ${thrust_target})
      thrust_clone_target_properties(${bench_name} ${thrust_target})
+
+      if ("CUDA" STREQUAL "${config_device}")
+        target_compile_options(${bench_name} PRIVATE "--extended-lambda")
+      endif()
     endforeach()
   endforeach()
 endfunction()
diff --git a/thrust/benchmarks/bench/transform/basic.cu b/thrust/benchmarks/bench/transform/basic.cu
index 1253e3a28f..e2014e5080 100644
--- a/thrust/benchmarks/bench/transform/basic.cu
+++ b/thrust/benchmarks/bench/transform/basic.cu
@@ -29,6 +29,9 @@
 #include <thrust/device_vector.h>
 #include <thrust/execution_policy.h>
 #include <thrust/transform.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/zip_function.h>
+#include <vector>
 
 #include <nvbench_helper.cuh>
 
@@ -90,3 +93,120 @@ NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(types))
   .set_name("base")
   .set_type_axes_names({"T{ct}"})
   .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
+
+namespace babelstream
+{
+// The benchmarks in this namespace are inspired by the BabelStream thrust version:
+// https://github.com/UoB-HPC/BabelStream/blob/main/src/thrust/ThrustStream.cu
+
+// Modified from BabelStream to also work for integers
+constexpr auto startA      = 1; // BabelStream: 0.1
+constexpr auto startB      = 2; // BabelStream: 0.2
+constexpr auto startC      = 3; // BabelStream: 0.1
+constexpr auto startScalar = 4; // BabelStream: 0.4
+
+using element_types    = nvbench::type_list<std::int8_t, std::int16_t, float, double>;
+auto array_size_powers = std::vector<nvbench::int64_t>{25};
+
+template <typename T>
+static void mul(nvbench::state& state, nvbench::type_list<T>)
+{
+  const auto n = static_cast<std::size_t>(state.get_int64("Elements"));
+  thrust::device_vector<T> b(n, startB);
+  thrust::device_vector<T> c(n, startC);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(n);
+  state.add_global_memory_writes<T>(n);
+
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) {
+    const T scalar = startScalar;
+    thrust::transform(c.begin(), c.end(), b.begin(), [=] __device__ __host__(const T& ci) {
+      return ci * scalar;
+    });
+  });
+}
+
+NVBENCH_BENCH_TYPES(mul, NVBENCH_TYPE_AXES(element_types))
+  .set_name("mul")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements", array_size_powers);
+
+template <typename T>
+static void add(nvbench::state& state, nvbench::type_list<T>)
+{
+  const auto n = static_cast<std::size_t>(state.get_int64("Elements"));
+  thrust::device_vector<T> a(n, startA);
+  thrust::device_vector<T> b(n, startB);
+  thrust::device_vector<T> c(n, startC);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(2 * n);
+  state.add_global_memory_writes<T>(n);
+
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) {
+    thrust::transform(a.begin(), a.end(), b.begin(), c.begin(), [] __device__ __host__(const T& ai, const T& bi) {
+      return ai + bi;
+    });
+  });
+}
+
+NVBENCH_BENCH_TYPES(add, NVBENCH_TYPE_AXES(element_types))
+  .set_name("add")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements", array_size_powers);
+
+template <typename T>
+static void triad(nvbench::state& state, nvbench::type_list<T>)
+{
+  const auto n = static_cast<std::size_t>(state.get_int64("Elements"));
+  thrust::device_vector<T> a(n, startA);
+  thrust::device_vector<T> b(n, startB);
+  thrust::device_vector<T> c(n, startC);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(2 * n);
+  state.add_global_memory_writes<T>(n);
+
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) {
+    const T scalar = startScalar;
+    thrust::transform(b.begin(), b.end(), c.begin(), a.begin(), [=] __device__ __host__(const T& bi, const T& ci) {
+      return bi + scalar * ci;
+    });
+  });
+}
+
+NVBENCH_BENCH_TYPES(triad, NVBENCH_TYPE_AXES(element_types))
+  .set_name("triad")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements", array_size_powers);
+
+template <typename T>
+static void nstream(nvbench::state& state, nvbench::type_list<T>)
+{
+  const auto n = static_cast<std::size_t>(state.get_int64("Elements"));
+  thrust::device_vector<T> a(n, startA);
+  thrust::device_vector<T> b(n, startB);
+  thrust::device_vector<T> c(n, startC);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(3 * n);
+  state.add_global_memory_writes<T>(n);
+
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) {
+    const T scalar = startScalar;
+    thrust::transform(
+      thrust::make_zip_iterator(a.begin(), b.begin(), c.begin()),
+      thrust::make_zip_iterator(a.end(), b.end(), c.end()),
+      a.begin(),
+      thrust::make_zip_function([=] __device__ __host__(const T& ai, const T& bi, const T& ci) {
+        return ai + bi + scalar * ci;
+      }));
+  });
+}
+
+NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(element_types))
+  .set_name("nstream")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements", array_size_powers);
+} // namespace babelstream
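
For readers unfamiliar with the BabelStream kernels, a minimal standalone sketch of the "triad" pattern benchmarked above (a[i] = b[i] + scalar * c[i]) expressed with thrust::transform could look as follows. The element type, problem size, file name, and the printed check are illustrative assumptions and are not part of the patch; the device lambda needs nvcc's --extended-lambda flag, which the CMake changes above enable for the benchmark targets.

// triad_sketch.cu -- illustrative standalone example, not part of the patch.
// Build (assumption): nvcc --extended-lambda triad_sketch.cu -o triad_sketch
#include <thrust/device_vector.h>
#include <thrust/transform.h>

#include <cstdio>

int main()
{
  using T             = double;  // assumed element type
  const std::size_t n = 1 << 20; // assumed problem size

  // Same start values as the benchmarks in the patch
  const T startB = 2, startC = 3, startScalar = 4;

  thrust::device_vector<T> a(n); // output
  thrust::device_vector<T> b(n, startB);
  thrust::device_vector<T> c(n, startC);

  const T scalar = startScalar;
  // a[i] = b[i] + scalar * c[i]
  thrust::transform(b.begin(), b.end(), c.begin(), a.begin(), [=] __host__ __device__(T bi, T ci) {
    return bi + scalar * ci;
  });

  // Expect 2 + 4 * 3 = 14 for every element
  std::printf("a[0] = %g (expected %g)\n", static_cast<double>(a[0]), static_cast<double>(startB + scalar * startC));
  return 0;
}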