
Introduce CUB transform reduce (#1091)
* Introduce CUB transform reduce

* Enable graph capture for transform reduce

* Fix typo in transform reduce docs
gevtushenko authored Nov 18, 2023
1 parent aeaa3d3 commit 61c328a
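
This commit adds a fused transform+reduce dispatch: each input element is passed through a unary transform as it is loaded, before it enters the reduction, so the transformed values are never materialized in memory. A minimal usage sketch follows — the template-parameter order and the Dispatch call shape are inferred from the benchmark reproduced below (cub/benchmarks/bench/transform_reduce/sum.cu); the header path is an assumption, and this is a sketch rather than an official API example:

#include <cub/device/dispatch/dispatch_reduce.cuh> // header path assumed
#include <thrust/device_vector.h>

#include <cstddef>

struct square
{
  __host__ __device__ float operator()(float x) const { return x * x; }
};

// Computes sum(x * x) on the default stream. Per the usual CUB protocol,
// Dispatch is called twice: first with a null workspace to query its size,
// then again to run the reduction.
inline float sum_of_squares(const thrust::device_vector<float>& in)
{
  using offset_t   = cub::detail::ChooseOffsetT<int>::Type;
  using dispatch_t =
    cub::DispatchTransformReduce<const float*, float*, offset_t, cub::Sum, square, float, float>;

  thrust::device_vector<float> out(1);
  const float* d_in = thrust::raw_pointer_cast(in.data());
  float* d_out      = thrust::raw_pointer_cast(out.data());
  const auto n      = static_cast<offset_t>(in.size());

  std::size_t temp_bytes{};
  dispatch_t::Dispatch(nullptr, temp_bytes, d_in, d_out, n, cub::Sum{}, 0.0f, 0 /* stream */, square{});

  thrust::device_vector<unsigned char> temp(temp_bytes);
  dispatch_t::Dispatch(thrust::raw_pointer_cast(temp.data()), temp_bytes,
                       d_in, d_out, n, cub::Sum{}, 0.0f, 0 /* stream */, square{});

  return out[0]; // synchronous device-to-host copy of the result
}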
Showing 9 changed files with 847 additions and 54 deletions. (Only the first four files are reproduced below.)
1 change: 1 addition & 0 deletions .clang-format
@@ -155,6 +155,7 @@ SpacesInParentheses: false
 SpacesInSquareBrackets: false
 Standard: c++20
 StatementMacros: [
+  '__thrust_exec_check_disable__',
   'CUB_NAMESPACE_BEGIN',
   'CUB_NAMESPACE_END',
   'THRUST_NAMESPACE_BEGIN',
205 changes: 205 additions & 0 deletions cub/benchmarks/bench/transform_reduce/sum.cu
@@ -0,0 +1,205 @@
/******************************************************************************
* Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/

#include "thrust/iterator/transform_iterator.h"
#include <nvbench_helper.cuh>

// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1
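// (Each %RANGE% line above declares an axis for CUB's benchmark tuning
// scripts: the macro to define, a short flag name, and a start:end:step
// sweep. TUNE_BASE builds bypass the custom policy below entirely.)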

#ifndef TUNE_BASE
#define TUNE_ITEMS_PER_VEC_LOAD (1 << TUNE_ITEMS_PER_VEC_LOAD_POW2)
#endif

#if !TUNE_BASE
template <typename AccumT, typename OffsetT>
struct policy_hub_t
{
  struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
  {
    static constexpr int threads_per_block  = TUNE_THREADS_PER_BLOCK;
    static constexpr int items_per_thread   = TUNE_ITEMS_PER_THREAD;
    static constexpr int items_per_vec_load = TUNE_ITEMS_PER_VEC_LOAD;

    using ReducePolicy = cub::AgentReducePolicy<threads_per_block,
                                                items_per_thread,
                                                AccumT,
                                                items_per_vec_load,
                                                cub::BLOCK_REDUCE_WARP_REDUCTIONS,
                                                cub::LOAD_DEFAULT>;

    // SingleTilePolicy
    using SingleTilePolicy = ReducePolicy;

    // SegmentedReducePolicy
    using SegmentedReducePolicy = ReducePolicy;
  };

  using MaxPolicy = policy_t;
};
#endif // !TUNE_BASE

template <class T>
struct square_t
{
  __host__ __device__ T operator()(const T& x) const
  {
    return x * x;
  }
};

#define USE_TRANSFORM_ITERATOR 0
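// 0 (the default): raw-pointer input with the transform fused into the
// reduction via the new cub::DispatchTransformReduce (second branch below).
// 1: a thrust::transform_iterator feeding the existing cub::DispatchReduce
// (first branch below), kept for comparison.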

#if USE_TRANSFORM_ITERATOR
template <typename T, typename OffsetT>
void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
  using accum_t        = T;
  using input_it_t     = thrust::transform_iterator<square_t<T>, typename thrust::device_vector<T>::iterator>;
  using output_it_t    = T*;
  using offset_t       = typename cub::detail::ChooseOffsetT<OffsetT>::Type;
  using output_t       = T;
  using init_t         = T;
  using reduction_op_t = cub::Sum;
  using transform_op_t = square_t<T>;

#if !TUNE_BASE
  using policy_t   = policy_hub_t<accum_t, offset_t>;
  using dispatch_t =
    cub::DispatchReduce<input_it_t, output_it_t, offset_t, reduction_op_t, init_t, accum_t, policy_t>;
#else // TUNE_BASE
  using dispatch_t = cub::DispatchReduce<input_it_t, output_it_t, offset_t, reduction_op_t, init_t, accum_t>;
#endif // TUNE_BASE

  // Retrieve axis parameters
  const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
  thrust::device_vector<T> in = generate(elements);
  thrust::device_vector<T> out(1);

  input_it_t d_in   = thrust::make_transform_iterator(in.begin(), square_t<T>{});
  output_it_t d_out = thrust::raw_pointer_cast(out.data());

  // Enable throughput calculations and add "Size" column to results.
  state.add_element_count(elements);
  state.add_global_memory_reads<T>(elements, "Size");
  state.add_global_memory_writes<T>(1);

  // Query the required amount of temporary storage:
  std::size_t temp_size{};
  dispatch_t::Dispatch(nullptr,
                       temp_size,
                       d_in,
                       d_out,
                       static_cast<offset_t>(elements),
                       reduction_op_t{},
                       init_t{},
                       0 /* stream */);

  // Allocate temporary storage:
  thrust::device_vector<nvbench::uint8_t> temp(temp_size);
  auto* temp_storage = thrust::raw_pointer_cast(temp.data());

  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
    dispatch_t::Dispatch(temp_storage,
                         temp_size,
                         d_in,
                         d_out,
                         static_cast<offset_t>(elements),
                         reduction_op_t{},
                         init_t{},
                         launch.get_stream());
  });
}
#else
template <typename T, typename OffsetT>
void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
  using accum_t        = T;
  using input_it_t     = const T*;
  using output_it_t    = T*;
  using offset_t       = typename cub::detail::ChooseOffsetT<OffsetT>::Type;
  using output_t       = T;
  using init_t         = T;
  using reduction_op_t = cub::Sum;
  using transform_op_t = square_t<T>;

#if !TUNE_BASE
  using policy_t   = policy_hub_t<accum_t, offset_t>;
  using dispatch_t = cub::DispatchTransformReduce<input_it_t,
                                                  output_it_t,
                                                  offset_t,
                                                  reduction_op_t,
                                                  transform_op_t,
                                                  init_t,
                                                  accum_t,
                                                  policy_t>;
#else // TUNE_BASE
  using dispatch_t =
    cub::DispatchTransformReduce<input_it_t, output_it_t, offset_t, reduction_op_t, transform_op_t, init_t, accum_t>;
#endif // TUNE_BASE

  // Retrieve axis parameters
  const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
  thrust::device_vector<T> in = generate(elements);
  thrust::device_vector<T> out(1);

  input_it_t d_in   = thrust::raw_pointer_cast(in.data());
  output_it_t d_out = thrust::raw_pointer_cast(out.data());

  // Enable throughput calculations and add "Size" column to results.
  state.add_element_count(elements);
  state.add_global_memory_reads<T>(elements, "Size");
  state.add_global_memory_writes<T>(1);

  // Query the required amount of temporary storage:
  std::size_t temp_size{};
  dispatch_t::Dispatch(nullptr,
                       temp_size,
                       d_in,
                       d_out,
                       static_cast<offset_t>(elements),
                       reduction_op_t{},
                       init_t{},
                       0 /* stream */,
                       transform_op_t{});

  // Allocate temporary storage:
  thrust::device_vector<nvbench::uint8_t> temp(temp_size);
  auto* temp_storage = thrust::raw_pointer_cast(temp.data());

  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
    dispatch_t::Dispatch(temp_storage,
                         temp_size,
                         d_in,
                         d_out,
                         static_cast<offset_t>(elements),
                         reduction_op_t{},
                         init_t{},
                         launch.get_stream(),
                         transform_op_t{});
  });
}
#endif // USE_TRANSFORM_ITERATOR

NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(all_types, offset_types))
  .set_name("base")
  .set_type_axes_names({"T{ct}", "OffsetT{ct}"})
  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
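
The second commit bullet, "Enable graph capture for transform reduce", indicates that the dispatch avoids operations that are illegal during CUDA stream capture; the benchmark above exercises this by launching on launch.get_stream(). A sketch of capturing and replaying one launch with the standard CUDA graph APIs — same assumptions as the sum_of_squares sketch near the top of this page, error handling omitted:

#include <cub/device/dispatch/dispatch_reduce.cuh> // header path assumed
#include <thrust/device_vector.h>

#include <cuda_runtime.h>

#include <cstddef>

struct square
{
  __host__ __device__ float operator()(float x) const { return x * x; }
};

int main()
{
  using offset_t   = cub::detail::ChooseOffsetT<int>::Type;
  using dispatch_t =
    cub::DispatchTransformReduce<const float*, float*, offset_t, cub::Sum, square, float, float>;

  thrust::device_vector<float> in(1 << 20, 2.0f);
  thrust::device_vector<float> out(1);
  const float* d_in = thrust::raw_pointer_cast(in.data());
  float* d_out      = thrust::raw_pointer_cast(out.data());
  const auto n      = static_cast<offset_t>(in.size());

  // Workspace query and allocation happen before capture begins.
  std::size_t temp_bytes{};
  dispatch_t::Dispatch(nullptr, temp_bytes, d_in, d_out, n, cub::Sum{}, 0.0f, 0, square{});
  thrust::device_vector<unsigned char> temp(temp_bytes);

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Record the reduction into a graph instead of executing it eagerly.
  cudaGraph_t graph;
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
  dispatch_t::Dispatch(thrust::raw_pointer_cast(temp.data()), temp_bytes,
                       d_in, d_out, n, cub::Sum{}, 0.0f, stream, square{});
  cudaStreamEndCapture(stream, &graph);

  cudaGraphExec_t exec;
  cudaGraphInstantiateWithFlags(&exec, graph, 0);
  cudaGraphLaunch(exec, stream); // replaying the graph is cheap
  cudaStreamSynchronize(stream);
}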
21 changes: 13 additions & 8 deletions cub/cub/agent/agent_reduce.cuh
@@ -53,6 +53,8 @@
 #include <cub/iterator/cache_modified_input_iterator.cuh>
 #include <cub/util_type.cuh>
 
+#include <cuda/std/functional>
+
 CUB_NAMESPACE_BEGIN
 
 /******************************************************************************
@@ -125,7 +127,8 @@ template <typename AgentReducePolicy,
           typename OutputIteratorT,
           typename OffsetT,
           typename ReductionOp,
-          typename AccumT>
+          typename AccumT,
+          typename TransformOp = ::cuda::std::__identity>
 struct AgentReduce
 {
   //---------------------------------------------------------------------
@@ -189,6 +192,7 @@ struct AgentReduce
   InputIteratorT d_in;                ///< Input data to reduce
   WrappedInputIteratorT d_wrapped_in; ///< Wrapped input data to reduce
   ReductionOp reduction_op;           ///< Binary reduction operator
+  TransformOp transform_op;           ///< Transform operator
 
   //---------------------------------------------------------------------
   // Utility
@@ -224,11 +228,13 @@
    */
   __device__ __forceinline__ AgentReduce(TempStorage &temp_storage,
                                          InputIteratorT d_in,
-                                         ReductionOp reduction_op)
+                                         ReductionOp reduction_op,
+                                         TransformOp transform_op = {})
       : temp_storage(temp_storage.Alias())
       , d_in(d_in)
       , d_wrapped_in(d_in)
      , reduction_op(reduction_op)
+      , transform_op(transform_op)
   {}
 
   //---------------------------------------------------------------------
@@ -252,9 +258,8 @@
     AccumT items[ITEMS_PER_THREAD];
 
     // Load items in striped fashion
-    LoadDirectStriped<BLOCK_THREADS>(threadIdx.x,
-                                     d_wrapped_in + block_offset,
-                                     items);
+    cub::detail::load_transform_direct_striped<BLOCK_THREADS>(
+      threadIdx.x, d_wrapped_in + block_offset, items, transform_op);
 
     // Reduce items within each thread stripe
     thread_aggregate =
@@ -303,7 +308,7 @@
 #pragma unroll
     for (int i = 0; i < ITEMS_PER_THREAD; ++i)
     {
-      items[i] = input_items[i];
+      items[i] = transform_op(input_items[i]);
     }
 
     // Reduce items within each thread stripe
@@ -334,7 +339,7 @@
     // Read first item
     if ((IS_FIRST_TILE) && (thread_offset < valid_items))
     {
-      thread_aggregate = d_wrapped_in[block_offset + thread_offset];
+      thread_aggregate = transform_op(d_wrapped_in[block_offset + thread_offset]);
       thread_offset += BLOCK_THREADS;
     }
 
@@ -343,7 +348,7 @@
     {
       InputT item(d_wrapped_in[block_offset + thread_offset]);
 
-      thread_aggregate = reduction_op(thread_aggregate, item);
+      thread_aggregate = reduction_op(thread_aggregate, transform_op(item));
       thread_offset += BLOCK_THREADS;
     }
   }
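
Net effect of the AgentReduce changes: every element passes through transform_op before it joins the reduction, in all three load paths (striped full tile, vectorized full tile, and the guarded partial tile). A serial reference of the intended semantics — illustration only, not library code; the device reduction reorders operands, so the operator must be associative:

#include <cstddef>

// acc = op(...op(op(init, f(x[0])), f(x[1]))..., f(x[n-1])), up to operand order.
template <class T, class AccumT, class ReduceOp, class TransformOp>
AccumT transform_reduce_reference(const T* in, std::size_t n, AccumT init, ReduceOp op, TransformOp f)
{
  AccumT acc = init;
  for (std::size_t i = 0; i < n; ++i)
  {
    acc = op(acc, f(in[i]));
  }
  return acc;
}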
16 changes: 16 additions & 0 deletions cub/cub/block/block_load.cuh
@@ -334,6 +334,22 @@ __device__ __forceinline__ void LoadDirectStriped(int linear_tid,
   }
 }
 
+namespace detail
+{
+
+template <int BLOCK_THREADS, typename InputT, int ITEMS_PER_THREAD, typename InputIteratorT, typename TransformOpT>
+__device__ __forceinline__ void load_transform_direct_striped(
+  int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], TransformOpT transform_op)
+{
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+  {
+    items[ITEM] = transform_op(block_itr[linear_tid + ITEM * BLOCK_THREADS]);
+  }
+}
+
+} // namespace detail
+
 /**
  * @brief Load a linear segment of items into a striped arrangement across the thread block, guarded
  * by range
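
For intuition about load_transform_direct_striped above: in a striped arrangement, thread t holds elements t, t + BLOCK_THREADS, t + 2 * BLOCK_THREADS, and so on, and here each element is transformed as it is read. A host-side illustration of the index pattern (not library code):

#include <cstdio>

int main()
{
  const int BLOCK_THREADS = 4, ITEMS_PER_THREAD = 2;
  for (int t = 0; t < BLOCK_THREADS; ++t)       // linear_tid
  {
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)  // ITEM
    {
      // Same index math as the device loop: linear_tid + ITEM * BLOCK_THREADS.
      std::printf("thread %d, item %d -> element %d\n", t, i, t + i * BLOCK_THREADS);
    }
  }
  return 0;
}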
