Tune RLE for SM90

gevtushenko · Jun 20, 2023 · b0ae400 · b0ae400
1 parent 81dd8c8
commit b0ae400
Show file tree

Hide file tree

Showing 4 changed files with 433 additions and 59 deletions.
diff --git a/benchmarks/bench/run_length_encode/encode.cu b/benchmarks/bench/run_length_encode/encode.cu
@@ -91,6 +91,7 @@ static void rle(nvbench::state &state, nvbench::type_list<T, OffsetT>)
                                               accum_t,
                                               device_reduce_by_key_policy_hub>;
   #else
+  using policy_t = cub::detail::device_run_length_encode_policy_hub<accum_t, T>;
   using dispatch_t = cub::DispatchReduceByKey<keys_input_it_t,
                                               unique_output_it_t,
                                               vals_input_it_t,
@@ -99,7 +100,8 @@ static void rle(nvbench::state &state, nvbench::type_list<T, OffsetT>)
                                               equality_op_t,
                                               reduction_op_t,
                                               offset_t,
-                                              accum_t>;
+                                              accum_t,
+                                              policy_t>;
   #endif
 
   const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));

diff --git a/cub/device/device_run_length_encode.cuh b/cub/device/device_run_length_encode.cuh
@@ -34,14 +34,16 @@
 
 #pragma once
 
-#include <iterator>
-#include <stdio.h>
-
 #include <cub/config.cuh>
 #include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
 #include <cub/device/dispatch/dispatch_rle.cuh>
+#include <cub/device/dispatch/tuning/tuning_run_length_encode.cuh>
 #include <cub/util_deprecated.cuh>
 
+#include <iterator>
+
+#include <stdio.h>
+
 CUB_NAMESPACE_BEGIN
 
 
@@ -204,36 +206,44 @@ struct DeviceRunLengthEncode
          int num_items,
          cudaStream_t stream = 0)
   {
-    using OffsetT      = int;        // Signed integer type for global offsets
-    using EqualityOp   = Equality;   // Default == operator
-    using ReductionOp  = cub::Sum;   // Value reduction operator
+    using offset_t      = int;        // Signed integer type for global offsets
+    using equality_op   = Equality;   // Default == operator
+    using reduction_op  = cub::Sum;   // Value reduction operator
 
     // The lengths output value type
-    using LengthT =
-      cub::detail::non_void_value_t<LengthsOutputIteratorT, OffsetT>;
+    using length_t =
+      cub::detail::non_void_value_t<LengthsOutputIteratorT, offset_t>;
 
     // Generator type for providing 1s values for run-length reduction
-    using LengthsInputIteratorT = ConstantInputIterator<LengthT, OffsetT>;
+    using lengths_input_iterator_t = ConstantInputIterator<length_t, offset_t>;
+
+    using accum_t = detail::accumulator_t<reduction_op, length_t, length_t>;
+
+    using key_t =
+      cub::detail::non_void_value_t<UniqueOutputIteratorT, cub::detail::value_t<InputIteratorT>>;
+
+    using policy_t = detail::device_run_length_encode_policy_hub<accum_t, key_t>;
 
     return DispatchReduceByKey<InputIteratorT,
                                UniqueOutputIteratorT,
-                               LengthsInputIteratorT,
+                               lengths_input_iterator_t,
                                LengthsOutputIteratorT,
                                NumRunsOutputIteratorT,
-                               EqualityOp,
-                               ReductionOp,
-                               OffsetT>::Dispatch(d_temp_storage,
-                                                  temp_storage_bytes,
-                                                  d_in,
-                                                  d_unique_out,
-                                                  LengthsInputIteratorT(
-                                                    (LengthT)1),
-                                                  d_counts_out,
-                                                  d_num_runs_out,
-                                                  EqualityOp(),
-                                                  ReductionOp(),
-                                                  num_items,
-                                                  stream);
+                               equality_op,
+                               reduction_op,
+                               offset_t,
+                               accum_t,
+                               policy_t>::Dispatch(d_temp_storage,
+                                                   temp_storage_bytes,
+                                                   d_in,
+                                                   d_unique_out,
+                                                   lengths_input_iterator_t((length_t)1),
+                                                   d_counts_out,
+                                                   d_num_runs_out,
+                                                   equality_op(),
+                                                   reduction_op(),
+                                                   num_items,
+                                                   stream);
   }
 
   template <typename InputIteratorT,

diff --git a/cub/device/dispatch/dispatch_rle.cuh b/cub/device/dispatch/dispatch_rle.cuh
@@ -37,6 +37,7 @@
 #include <cub/agent/agent_rle.cuh>
 #include <cub/config.cuh>
 #include <cub/device/dispatch/dispatch_scan.cuh>
+#include <cub/device/dispatch/tuning/tuning_run_length_encode.cuh>
 #include <cub/grid/grid_queue.cuh>
 #include <cub/thread/thread_operators.cuh>
 #include <cub/util_device.cuh>
@@ -150,38 +151,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREA
  * Dispatch
  ******************************************************************************/
 
-namespace detail
-{
-
-template <class T>
-struct device_rle_policy_hub
-{
-  /// SM35
-  struct Policy350 : ChainedPolicy<350, Policy350, Policy350>
-  {
-    enum
-    {
-      NOMINAL_4B_ITEMS_PER_THREAD = 15,
-
-      ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD,
-                                 CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-    };
-
-    using RleSweepPolicyT =
-      AgentRlePolicy<96,
-                     ITEMS_PER_THREAD,
-                     BLOCK_LOAD_DIRECT,
-                     LOAD_LDG,
-                     true,
-                     BLOCK_SCAN_WARP_SCANS,
-                     detail::default_reduce_by_key_delay_constructor_t<int, int>>;
-  };
-
-  using MaxPolicy = Policy350;
-};
-
-} // namespace detail
-
 /**
  * Utility class for dispatching the appropriately-tuned kernels for DeviceRle
  *
@@ -213,8 +182,9 @@ template <typename InputIteratorT,
           typename NumRunsOutputIteratorT,
           typename EqualityOpT,
           typename OffsetT,
-          typename SelectedPolicy =
-            detail::device_rle_policy_hub<cub::detail::value_t<InputIteratorT>>>
+          typename SelectedPolicy = detail::device_non_trivial_runs_policy_hub<
+            cub::detail::non_void_value_t<LengthsOutputIteratorT, OffsetT>,
+            cub::detail::value_t<InputIteratorT>>>
 struct DeviceRleDispatch
 {
   /******************************************************************************