From 25febbcade60d5eefb5568cdc036c845d29dc932 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Tue, 2 Jul 2024 16:05:38 -0700 Subject: [PATCH] Add throughput metrics for REDUCTION_BENCH/REDUCTION_NVBENCH benchmarks (#16126) This PR addresses https://github.com/rapidsai/cudf/issues/13735 for reduction benchmarks. There are 3 new utils added. - `int64_t estimate_size(cudf::table_view)` returns a size estimate for the given table. https://github.com/rapidsai/cudf/pull/13984 was a previous attempt to add a similar utility, but this implementation uses `cudf::row_bit_count()` as suggested in https://github.com/rapidsai/cudf/pull/13984#issuecomment-2189916570 instead of manually estimating the size. - `void set_items_processed(State& state, int64_t items_processed_per_iteration)` is a thin wrapper of `State.SetItemsProcessed()`. This wrapper takes `items_processed_per_iteration` as a parameter instead of `total_items_processed`. This could be useful to avoid repeating `State.iterations() * items_processed_per_iteration` in each benchmark class. - `void set_throughputs(nvbench::state& state)` is added as a workaround for https://github.com/NVIDIA/nvbench/issues/175. We sometimes want to set throughput statistics after `state.exec()` calls especially when it is hard to estimate the result size upfront. Here are snippets of reduction benchmarks after this change. ``` $ cpp/build/benchmarks/REDUCTION_BENCH ... ----------------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... 
----------------------------------------------------------------------------------------------------------------- Reduction/bool_all/10000/manual_time 10257 ns 26845 ns 68185 bytes_per_second=929.907M/s items_per_second=975.078M/s Reduction/bool_all/100000/manual_time 11000 ns 27454 ns 63634 bytes_per_second=8.46642G/s items_per_second=9.09075G/s Reduction/bool_all/1000000/manual_time 12671 ns 28658 ns 55261 bytes_per_second=73.5018G/s items_per_second=78.922G/s ... $ cpp/build/benchmarks/REDUCTION_NVBENCH ... ## rank_scan ### [0] NVIDIA RTX A5500 | T | null_probability | data_size | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWUtil | |-----------------|------------------|-----------|---------|------------|--------|------------|-------|----------|--------------|-----------| | I32 | 0 | 10000 | 16992x | 33.544 us | 14.95% | 29.446 us | 5.58% | 82.321M | 5.596 TB/s | 728.54% | | I32 | 0.1 | 10000 | 16512x | 34.358 us | 13.66% | 30.292 us | 2.87% | 80.020M | 5.286 TB/s | 688.17% | | I32 | 0.5 | 10000 | 16736x | 34.058 us | 14.31% | 29.890 us | 3.40% | 81.097M | 5.430 TB/s | 706.89% | ... ``` Note that, when the data type is a 1-byte-width type in the google benchmark result summary, `bytes_per_second` appears to be smaller than `items_per_second`. This is because the former is a multiple of 1024 whereas the latter is a multiple of 1000. They are in fact the same number. Implementation-wise, these are what I'm not sure if I made a best decision. - Each of new utils above is declared and defined in different files. I did this because I could not find a good place to have them all, and they seem to belong to different utilities. Please let me know if there is a better place for them. - All the new utils are defined in the global namespace since other util functions seem to have been defined in the same way. Please let me know if this is not the convention. 
Authors: - Jihoon Son (https://github.com/jihoonson) Approvers: - Mark Harris (https://github.com/harrism) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/16126 --- cpp/benchmarks/CMakeLists.txt | 9 ++- cpp/benchmarks/common/benchmark_utilities.cpp | 27 +++++++++ cpp/benchmarks/common/benchmark_utilities.hpp | 41 +++++++++++++ cpp/benchmarks/common/nvbench_utilities.cpp | 60 +++++++++++++++++++ cpp/benchmarks/common/nvbench_utilities.hpp | 31 ++++++++++ cpp/benchmarks/common/table_utilities.cpp | 41 +++++++++++++ cpp/benchmarks/common/table_utilities.hpp | 41 +++++++++++++ cpp/benchmarks/reduction/anyall.cpp | 8 ++- cpp/benchmarks/reduction/dictionary.cpp | 10 +++- cpp/benchmarks/reduction/minmax.cpp | 13 +++- cpp/benchmarks/reduction/rank.cpp | 13 +++- cpp/benchmarks/reduction/reduce.cpp | 8 ++- cpp/benchmarks/reduction/scan.cpp | 11 +++- cpp/benchmarks/reduction/scan_structs.cpp | 16 ++++- 14 files changed, 314 insertions(+), 15 deletions(-) create mode 100644 cpp/benchmarks/common/benchmark_utilities.cpp create mode 100644 cpp/benchmarks/common/benchmark_utilities.hpp create mode 100644 cpp/benchmarks/common/nvbench_utilities.cpp create mode 100644 cpp/benchmarks/common/nvbench_utilities.hpp create mode 100644 cpp/benchmarks/common/table_utilities.cpp create mode 100644 cpp/benchmarks/common/table_utilities.hpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 8a48126e195..a5b248135c1 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -40,8 +40,13 @@ target_include_directories( # Use an OBJECT library so we only compile these helper source files only once add_library( - cudf_benchmark_common OBJECT "${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp" - synchronization/synchronization.cpp io/cuio_common.cpp + cudf_benchmark_common OBJECT + "${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp" + synchronization/synchronization.cpp + io/cuio_common.cpp + 
common/table_utilities.cpp + common/benchmark_utilities.cpp + common/nvbench_utilities.cpp ) target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen $) add_custom_command( diff --git a/cpp/benchmarks/common/benchmark_utilities.cpp b/cpp/benchmarks/common/benchmark_utilities.cpp new file mode 100644 index 00000000000..0b9fc17e779 --- /dev/null +++ b/cpp/benchmarks/common/benchmark_utilities.cpp @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "benchmark_utilities.hpp" + +void set_items_processed(::benchmark::State& state, int64_t items_processed_per_iteration) +{ + state.SetItemsProcessed(state.iterations() * items_processed_per_iteration); +} + +void set_bytes_processed(::benchmark::State& state, int64_t bytes_processed_per_iteration) +{ + state.SetBytesProcessed(state.iterations() * bytes_processed_per_iteration); +} diff --git a/cpp/benchmarks/common/benchmark_utilities.hpp b/cpp/benchmarks/common/benchmark_utilities.hpp new file mode 100644 index 00000000000..c5c80e73674 --- /dev/null +++ b/cpp/benchmarks/common/benchmark_utilities.hpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +/** + * @brief Sets the number of items processed during the benchmark. + * + * This function could be used instead of ::benchmark::State.SetItemsProcessed() + * to avoid repeatedly computing ::benchmark::State.iterations() * items_processed_per_iteration. + * + * @param state the benchmark state + * @param items_processed_per_iteration number of items processed per iteration + */ +void set_items_processed(::benchmark::State& state, int64_t items_processed_per_iteration); + +/** + * @brief Sets the number of bytes processed during the benchmark. + * + * This function could be used instead of ::benchmark::State.SetBytesProcessed() + * to avoid repeatedly computing ::benchmark::State.iterations() * bytes_processed_per_iteration. + * + * @param state the benchmark state + * @param bytes_processed_per_iteration number of bytes processed per iteration + */ +void set_bytes_processed(::benchmark::State& state, int64_t bytes_processed_per_iteration); diff --git a/cpp/benchmarks/common/nvbench_utilities.cpp b/cpp/benchmarks/common/nvbench_utilities.cpp new file mode 100644 index 00000000000..c740eaa52f4 --- /dev/null +++ b/cpp/benchmarks/common/nvbench_utilities.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nvbench_utilities.hpp" + +#include + +// This function is copied over from +// https://github.com/NVIDIA/nvbench/blob/a171514056e5d6a7f52a035dd6c812fa301d4f4f/nvbench/detail/measure_cold.cu#L190-L224. +void set_throughputs(nvbench::state& state) +{ + double avg_cuda_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + + if (const auto items = state.get_element_count(); items != 0) { + auto& summ = state.add_summary("nv/cold/bw/item_rate"); + summ.set_string("name", "Elem/s"); + summ.set_string("hint", "item_rate"); + summ.set_string("description", "Number of input elements processed per second"); + summ.set_float64("value", static_cast(items) / avg_cuda_time); + } + + if (const auto bytes = state.get_global_memory_rw_bytes(); bytes != 0) { + const auto avg_used_gmem_bw = static_cast(bytes) / avg_cuda_time; + { + auto& summ = state.add_summary("nv/cold/bw/global/bytes_per_second"); + summ.set_string("name", "GlobalMem BW"); + summ.set_string("hint", "byte_rate"); + summ.set_string("description", + "Number of bytes read/written per second to the CUDA " + "device's global memory"); + summ.set_float64("value", avg_used_gmem_bw); + } + + { + const auto peak_gmem_bw = + static_cast(state.get_device()->get_global_memory_bus_bandwidth()); + + auto& summ = state.add_summary("nv/cold/bw/global/utilization"); + summ.set_string("name", "BWUtil"); + summ.set_string("hint", "percentage"); + summ.set_string("description", + "Global device memory utilization as a percentage of the " + "device's peak bandwidth"); + 
summ.set_float64("value", avg_used_gmem_bw / peak_gmem_bw); + } + } +} diff --git a/cpp/benchmarks/common/nvbench_utilities.hpp b/cpp/benchmarks/common/nvbench_utilities.hpp new file mode 100644 index 00000000000..98d879efac5 --- /dev/null +++ b/cpp/benchmarks/common/nvbench_utilities.hpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace nvbench { +struct state; +} + +/** + * @brief Sets throughput statistics, such as "Elem/s", "GlobalMem BW", and "BWUtil" for the + * nvbench results summary. + * + * This function could be used to work around a known issue that the throughput statistics + * should be added before the nvbench::state.exec() call, otherwise they will not be printed + * in the summary. See https://github.com/NVIDIA/nvbench/issues/175 for more details. + */ +void set_throughputs(nvbench::state& state); diff --git a/cpp/benchmarks/common/table_utilities.cpp b/cpp/benchmarks/common/table_utilities.cpp new file mode 100644 index 00000000000..a6fbdac9fb8 --- /dev/null +++ b/cpp/benchmarks/common/table_utilities.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "table_utilities.hpp" + +#include +#include + +#include + +int64_t estimate_size(cudf::column_view const& col) +{ + return estimate_size(cudf::table_view({col})); +} + +int64_t estimate_size(cudf::table_view const& view) +{ + // Compute the size in bits for each row. + auto const row_sizes = cudf::row_bit_count(view); + // Accumulate the row sizes to compute a sum. + auto const agg = cudf::make_sum_aggregation(); + cudf::data_type sum_dtype{cudf::type_id::INT64}; + auto const total_size_scalar = cudf::reduce(*row_sizes, *agg, sum_dtype); + auto const total_size_in_bits = + static_cast*>(total_size_scalar.get())->value(); + // Convert the size in bits to the size in bytes. + return static_cast(std::ceil(static_cast(total_size_in_bits) / 8)); +} diff --git a/cpp/benchmarks/common/table_utilities.hpp b/cpp/benchmarks/common/table_utilities.hpp new file mode 100644 index 00000000000..04ee847d397 --- /dev/null +++ b/cpp/benchmarks/common/table_utilities.hpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +/** + * @brief Estimates the column size in bytes. + * + * @remark As this function internally uses cudf::row_bit_count() to estimate each row size + * and accumulates them, the returned estimate may be an inexact approximation in some + * cases. See cudf::row_bit_count() for more details. + * + * @param view The column view to estimate its size + */ +int64_t estimate_size(cudf::column_view const& view); + +/** + * @brief Estimates the table size in bytes. + * + * @remark As this function internally uses cudf::row_bit_count() to estimate each row size + * and accumulates them, the returned estimate may be an inexact approximation in some + * cases. See cudf::row_bit_count() for more details. + * + * @param view The table view to estimate its size + */ +int64_t estimate_size(cudf::table_view const& view); diff --git a/cpp/benchmarks/reduction/anyall.cpp b/cpp/benchmarks/reduction/anyall.cpp index 8b1e71c1585..e9d23881764 100644 --- a/cpp/benchmarks/reduction/anyall.cpp +++ b/cpp/benchmarks/reduction/anyall.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,9 @@ * limitations under the License. */ +#include #include +#include #include #include @@ -42,6 +44,10 @@ void BM_reduction_anyall(benchmark::State& state, cuda_event_timer timer(state, true); auto result = cudf::reduce(*values, *agg, output_dtype); } + + // The benchmark takes a column and produces one scalar. 
+ set_items_processed(state, column_size + 1); + set_bytes_processed(state, estimate_size(values->view()) + cudf::size_of(output_dtype)); } #define concat(a, b, c) a##b##c diff --git a/cpp/benchmarks/reduction/dictionary.cpp b/cpp/benchmarks/reduction/dictionary.cpp index c1c44c919ac..5095337dbb3 100644 --- a/cpp/benchmarks/reduction/dictionary.cpp +++ b/cpp/benchmarks/reduction/dictionary.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include @@ -52,6 +53,13 @@ void BM_reduction_dictionary(benchmark::State& state, cuda_event_timer timer(state, true); auto result = cudf::reduce(*values, *agg, output_dtype); } + + // The benchmark takes a column and produces one scalar. + set_items_processed(state, column_size + 1); + + // We don't set the metrics for the size read/written as row_bit_count() doesn't + // support the dictionary type yet (and neither does estimate_size()). + // See https://github.com/rapidsai/cudf/issues/16121 for details. } #define concat(a, b, c) a##b##c diff --git a/cpp/benchmarks/reduction/minmax.cpp b/cpp/benchmarks/reduction/minmax.cpp index 963c26692e7..050f2887221 100644 --- a/cpp/benchmarks/reduction/minmax.cpp +++ b/cpp/benchmarks/reduction/minmax.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,9 @@ * limitations under the License. 
*/ +#include #include +#include #include #include @@ -28,14 +30,19 @@ template void BM_reduction(benchmark::State& state) { cudf::size_type const column_size{(cudf::size_type)state.range(0)}; - auto const dtype = cudf::type_to_id(); + auto const dtype_id = cudf::type_to_id(); auto const input_column = - create_random_column(dtype, row_count{column_size}, data_profile_builder().no_validity()); + create_random_column(dtype_id, row_count{column_size}, data_profile_builder().no_validity()); for (auto _ : state) { cuda_event_timer timer(state, true); auto result = cudf::minmax(*input_column); } + + // The benchmark takes a column and produces two scalars. + set_items_processed(state, column_size + 2); + cudf::data_type dtype = cudf::data_type{dtype_id}; + set_bytes_processed(state, estimate_size(input_column->view()) + 2 * cudf::size_of(dtype)); } #define concat(a, b, c) a##b##c diff --git a/cpp/benchmarks/reduction/rank.cpp b/cpp/benchmarks/reduction/rank.cpp index e55f3b9e09f..14876c80d3e 100644 --- a/cpp/benchmarks/reduction/rank.cpp +++ b/cpp/benchmarks/reduction/rank.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,6 +15,8 @@ */ #include +#include +#include #include #include @@ -39,11 +41,18 @@ static void nvbench_reduction_scan(nvbench::state& state, nvbench::type_listview(), 2); cudf::column_view input(new_tbl->view().column(0)); + std::unique_ptr result = nullptr; state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { rmm::cuda_stream_view stream_view{launch.get_stream()}; - auto result = cudf::detail::inclusive_dense_rank_scan( + result = cudf::detail::inclusive_dense_rank_scan( input, stream_view, rmm::mr::get_current_device_resource()); }); + + state.add_element_count(input.size()); + state.add_global_memory_reads(estimate_size(input)); + state.add_global_memory_writes(estimate_size(result->view())); + + set_throughputs(state); } using data_type = nvbench::type_list; diff --git a/cpp/benchmarks/reduction/reduce.cpp b/cpp/benchmarks/reduction/reduce.cpp index 5bd3e2e3bba..63c96f4fe9e 100644 --- a/cpp/benchmarks/reduction/reduce.cpp +++ b/cpp/benchmarks/reduction/reduce.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,9 @@ * limitations under the License. */ +#include #include +#include #include #include @@ -46,6 +48,10 @@ void BM_reduction(benchmark::State& state, std::unique_ptrview()) + cudf::size_of(output_dtype)); } #define concat(a, b, c) a##b##c diff --git a/cpp/benchmarks/reduction/scan.cpp b/cpp/benchmarks/reduction/scan.cpp index 8c9883ece9c..dc05aad9807 100644 --- a/cpp/benchmarks/reduction/scan.cpp +++ b/cpp/benchmarks/reduction/scan.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -14,7 +14,9 @@ * limitations under the License. */ +#include #include +#include #include #include @@ -34,11 +36,16 @@ static void BM_reduction_scan(benchmark::State& state, bool include_nulls) auto const column = create_random_column(dtype, row_count{n_rows}); if (!include_nulls) column->set_null_mask(rmm::device_buffer{}, 0); + std::unique_ptr result = nullptr; for (auto _ : state) { cuda_event_timer timer(state, true); - auto result = cudf::scan( + result = cudf::scan( *column, *cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE); } + + // The benchmark takes a column and produces a new column of the same size as input. + set_items_processed(state, n_rows * 2); + set_bytes_processed(state, estimate_size(column->view()) + estimate_size(result->view())); } #define SCAN_BENCHMARK_DEFINE(name, type, nulls) \ diff --git a/cpp/benchmarks/reduction/scan_structs.cpp b/cpp/benchmarks/reduction/scan_structs.cpp index ee97b54fbef..a781f75a314 100644 --- a/cpp/benchmarks/reduction/scan_structs.cpp +++ b/cpp/benchmarks/reduction/scan_structs.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,6 +15,8 @@ */ #include +#include +#include #include #include @@ -45,16 +47,24 @@ static void nvbench_structs_scan(nvbench::state& state) auto [null_mask, null_count] = create_random_null_mask(size, null_probability); auto const input = cudf::make_structs_column( size, std::move(data_table->release()), null_count, std::move(null_mask)); + auto input_view = input->view(); auto const agg = cudf::make_min_aggregation(); auto const null_policy = static_cast(state.get_int64("null_policy")); auto const stream = cudf::get_default_stream(); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + std::unique_ptr result = nullptr; state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto const result = cudf::detail::scan_inclusive( - *input, *agg, null_policy, stream, rmm::mr::get_current_device_resource()); + result = cudf::detail::scan_inclusive( + input_view, *agg, null_policy, stream, rmm::mr::get_current_device_resource()); }); + + state.add_element_count(input_view.size()); + state.add_global_memory_reads(estimate_size(input_view)); + state.add_global_memory_writes(estimate_size(result->view())); + + set_throughputs(state); } NVBENCH_BENCH(nvbench_structs_scan)