CUB - Enable DPX Reduction #2286

Merged
merged 40 commits from fbusato:cub/dpx-reduction into NVIDIA:main on Sep 6, 2024

Changes from 7 commits

Commits (40)
5bff5ed
add segmented [radix] sort tests for 64-bit indices
fbusato Aug 15, 2024
62ac106
Add test for device segmented sort pairs with 64-bit indices
fbusato Aug 16, 2024
994da36
Merge branch 'main' into cub/test/64bit_segmented_sort
fbusato Aug 16, 2024
5c0a167
greatly simplify the code by exploiting automatic compiler vectorization
fbusato Aug 23, 2024
d2ef0c9
Merge branch 'NVIDIA:main' into cub/dpx-reduction
fbusato Aug 26, 2024
7c18ab6
Address compatibility issues and PR suggestions
fbusato Aug 26, 2024
2112f03
fix formatting issues
fbusato Aug 26, 2024
bb4cd47
Merge branch 'NVIDIA:main' into cub/dpx-reduction
fbusato Aug 27, 2024
df45811
Add DPX benchmarking file
fbusato Aug 27, 2024
c8c5233
Apply bit_cast and minor fixes
fbusato Aug 27, 2024
718f263
Slightly improve heuristics to enable DPX
fbusato Aug 27, 2024
17e29f0
Merge remote-tracking branch 'upstream/main' into cub/dpx-reduction
fbusato Aug 28, 2024
285885a
merge cuda::std::__accumulator_t
fbusato Aug 28, 2024
4638ac7
Add Sum operator for DPX
fbusato Aug 29, 2024
32aaf6e
move enable_if_t to template type as workaround for MSVC
fbusato Aug 29, 2024
da64d59
Fix formatting issues
fbusato Aug 30, 2024
91febd6
Merge branch 'main' into cub/dpx-reduction
fbusato Aug 30, 2024
8e20651
Merge branch 'main' into cub/dpx-reduction
fbusato Aug 30, 2024
488f590
Fix documentation issues
fbusato Aug 30, 2024
923201e
Merge branch 'cub/dpx-reduction' of github.com:fbusato/cccl into cub/…
fbusato Aug 30, 2024
1a74d2b
Fix formatting issues
fbusato Aug 30, 2024
d90d444
replace SFINAE with if constexpr where possible
fbusato Aug 31, 2024
879083c
Merge branch 'main' into cub/dpx-reduction
fbusato Sep 3, 2024
5c9a3af
rewrote `thread_reduce.cuh`
fbusato Sep 4, 2024
56aa97f
Merge branch 'cub/dpx-reduction' of github.com:fbusato/cccl into cub/…
fbusato Sep 4, 2024
a9aeeb9
Merge branch 'main' into cub/dpx-reduction
fbusato Sep 4, 2024
623849e
Address most of the suggestions
fbusato Sep 4, 2024
c9b231b
Merge branch 'cub/dpx-reduction' of github.com:fbusato/cccl into cub/…
fbusato Sep 4, 2024
59b0e6e
Merge branch 'main' into cub/dpx-reduction
fbusato Sep 4, 2024
fe89cf5
Merge branch 'main' into cub/dpx-reduction
fbusato Sep 4, 2024
3faf4ba
Add deprecated remark
fbusato Sep 4, 2024
84fbc14
Merge branch 'cub/dpx-reduction' of github.com:fbusato/cccl into cub/…
fbusato Sep 4, 2024
b0c4388
doxygen fixes
fbusato Sep 4, 2024
b8a7033
try to fix MSVC 14.16
fbusato Sep 4, 2024
3bbe5f4
Add pointer interface remark
fbusato Sep 4, 2024
629381d
fix undefined behavior
fbusato Sep 5, 2024
0fbf1d0
Merge branch 'main' into cub/dpx-reduction
fbusato Sep 5, 2024
0dfbabf
removed std::plus
fbusato Sep 5, 2024
89921b2
Merge branch 'cub/dpx-reduction' of github.com:fbusato/cccl into cub/…
fbusato Sep 5, 2024
99f0548
Fix formatting issues
fbusato Sep 5, 2024
23 changes: 23 additions & 0 deletions cub/cub/detail/type_traits.cuh
@@ -45,6 +45,7 @@
#include <cub/util_cpp_dialect.cuh>
#include <cub/util_namespace.cuh>

#include <cuda/std/__cccl/dialect.h>
_CCCL_SUPPRESS_DEPRECATED_PUSH
#include <cuda/std/functional>
_CCCL_SUPPRESS_DEPRECATED_POP
@@ -66,5 +67,27 @@ using invoke_result_t =
template <typename Invokable, typename InitT, typename InputT>
using accumulator_t = typename ::cuda::std::decay<invoke_result_t<Invokable, InitT, InputT>>::type;

/**********************************************************************************************************************
* Additional type traits
**********************************************************************************************************************/

template <typename T, typename... TArgs>
_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool are_same()
{
return ::cuda::std::conjunction<::cuda::std::is_same<T, TArgs>...>::value;
}

template <typename T, typename... TArgs>
_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool is_one_of()
{
return ::cuda::std::disjunction<::cuda::std::is_same<T, TArgs>...>::value;
}

template <typename...>
_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool always_false()
{
return false;
}

} // namespace detail
CUB_NAMESPACE_END
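
Reviewer context, not part of the diff: the three helpers above are constexpr wrappers over
cuda::std::conjunction/disjunction, so they can gate templates at compile time the same way the DPX
paths below do. A minimal sketch, assuming the header is on the include path and the translation
unit is compiled with a CUDA compiler; is_dpx_candidate is a hypothetical name for illustration:

#include <cub/detail/type_traits.cuh>
#include <cstdint>

// A type qualifies for the DPX paths only if it is one of the two 16-bit integer types,
// mirroring how is_one_of() is used in thread_reduce.cuh.
template <typename T>
constexpr bool is_dpx_candidate()
{
  return cub::detail::is_one_of<T, ::std::int16_t, ::std::uint16_t>();
}

static_assert(is_dpx_candidate<::std::int16_t>(), "int16_t is a DPX candidate");
static_assert(!is_dpx_candidate<float>(), "float is not a DPX candidate");

// always_false<T>() plays the complementary role: a static_assert(always_false<T>(), ...) in an
// unspecialized template fires only when that template is actually instantiated.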
61 changes: 59 additions & 2 deletions cub/cub/thread/thread_operators.cuh
@@ -39,6 +39,8 @@

#include <cub/config.cuh>

#include <cub/detail/type_traits.cuh> // always_false

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
@@ -50,9 +52,7 @@
#include <cub/util_cpp_dialect.cuh>
#include <cub/util_type.cuh>

_CCCL_SUPPRESS_DEPRECATED_PUSH
#include <cuda/std/functional>
_CCCL_SUPPRESS_DEPRECATED_POP
#include <cuda/std/type_traits>
#include <cuda/std/utility>

@@ -413,4 +413,61 @@ _CCCL_HOST_DEVICE BinaryFlip<BinaryOpT> MakeBinaryFlip(BinaryOpT binary_op)
return BinaryFlip<BinaryOpT>(binary_op);
}

namespace internal
{

// TODO: Remove the DPX specialization when nvbug 4823237 is fixed

template <typename T>
struct DpxMin
{
static_assert(detail::always_false<T>(), "DpxMin is not supported for this type");
};

template <>
struct DpxMin<int16_t>
{
_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
{
return __vmins2(a, b);
}
};

template <>
struct DpxMin<uint16_t>
{
_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
{
return __vminu2(a, b);
}
};

//----------------------------------------------------------------------------------------------------------------------

template <typename T>
struct DpxMax
{
static_assert(detail::always_false<T>(), "DpxMax is not supported for this type");
};

template <>
struct DpxMax<int16_t>
{
_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
{
return __vmaxs2(a, b);
}
};

template <>
struct DpxMax<uint16_t>
{
_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
{
return __vmaxu2(a, b);
}
};

} // namespace internal

CUB_NAMESPACE_END
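
The functors above take and return unsigned because a single 32-bit register packs two 16-bit
lanes. As a hedged illustration of the semantics only (the real __vmins2 is a single hardware
instruction on Hopper+ and is emulated on older architectures), the per-halfword signed minimum
can be mirrored on the host like this; emulate_vmins2 is an illustrative name:

#include <cstdint>
#include <cstring>

// Host-side emulation of the per-halfword signed minimum computed by __vmins2.
inline unsigned emulate_vmins2(unsigned a, unsigned b)
{
  ::std::int16_t lanes_a[2], lanes_b[2], lanes_r[2];
  ::std::memcpy(lanes_a, &a, sizeof(a)); // unpack the two 16-bit lanes
  ::std::memcpy(lanes_b, &b, sizeof(b));
  lanes_r[0] = lanes_a[0] < lanes_b[0] ? lanes_a[0] : lanes_b[0];
  lanes_r[1] = lanes_a[1] < lanes_b[1] ? lanes_a[1] : lanes_b[1];
  unsigned result;
  ::std::memcpy(&result, lanes_r, sizeof(result)); // repack the lanes
  return result;
}

__vminu2, __vmaxs2, and __vmaxu2 follow the same pattern with unsigned and/or maximum lane
operations.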
134 changes: 116 additions & 18 deletions cub/cub/thread/thread_reduce.cuh
@@ -35,6 +35,12 @@

#include <cub/config.cuh>

#include <cuda/cmath> // ceil_div
#include <cuda/std/__cccl/attributes.h> // _CCCL_NODISCARD
#include <cuda/std/cstdint> // uint16_t
#include <cuda/std/limits> // numeric_limits
#include <cuda/std/type_traits> // __enable_if_t

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
@@ -43,15 +49,67 @@
# pragma system_header
#endif // no system header

#include <cub/detail/type_traits.cuh>
#include <cub/thread/thread_operators.cuh>
#include <cub/detail/type_traits.cuh> // are_same()
#include <cub/thread/thread_operators.cuh> // DpxMin
#include <cub/util_namespace.cuh>
#include <cub/util_type.cuh>

CUB_NAMESPACE_BEGIN

/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations)
namespace internal
{

/// DPX instructions compute min and max for up to three 16- and 32-bit signed or unsigned integer parameters.
/// See the DPX documentation: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#dpx
/// NOTE: The compiler is able to automatically vectorize all cases with 3 operands.
/// However, all other cases with per-halfword comparison need to be explicitly vectorized.
/// TODO: Remove the DPX specialization when nvbug 4823237 is fixed.
///
/// DPX reduction is enabled when all of the following conditions are met:
/// - Hopper+ architecture. DPX instructions are emulated before Hopper.
/// - The number of elements must be large enough to pay off (see the instruction counts below).
/// - All types must be the same.
/// - Only integral types of 2 bytes are supported.
/// - Only the Min and Max operations are supported, as DPX provides SIMD min/max instructions.
/// If the instruction count is the same, we favor the compiler's automatic vectorization.

template <int LENGTH, typename T, typename ReductionOp, typename PrefixT = T, typename AccumT = T>
_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // clang-format off
constexpr bool enable_dpx_reduction()
{
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(return (LENGTH == 6 || LENGTH == 8 || LENGTH >= 10) && detail::are_same<T, PrefixT, AccumT>()
&& detail::is_one_of<T, int16_t, uint16_t>() && detail::is_one_of<ReductionOp, cub::Min, cub::Max>();),
(return false;));
}
// clang-format on

// Considering compiler vectorization with 3-way reduction, the number of SASS instructions is
// standard: ceil((L - 3) / 2) + 1
// replacing L with L/2 for SIMD
// DPX: ceil((L/2 - 3) / 2) + 1 + 1 [for halfword comparison] + L % 2 [for last element]
//
// LENGTH | Standard | DPX
// 2 | 1 | NA
// 3 | 1 | NA
// 4 | 2 | 3
// 5 | 2 | 4
// 6 | 3 | 2 // *** (3-way comparison for DPX)
// 7 | 3 | 3
// 8 | 4 | 3 // ***
// 9 | 4 | 4
// 10 | 5 | 3 // ***
// 11 | 5 | 4 // ***
// 12 | 6 | 4 // ***
// 13 | 6 | 5 // ***
// 14 | 7 | 4 // ***
// 15 | 7 | 5 // ***
// 16 | 8 | 5 // ***

//----------------------------------------------------------------------------------------------------------------------

/**
* @brief Sequential reduction over statically-sized array types
*
@@ -69,23 +127,32 @@ template <int LENGTH,
typename ReductionOp,
typename PrefixT,
typename AccumT = detail::accumulator_t<ReductionOp, PrefixT, T>>
_CCCL_DEVICE _CCCL_FORCEINLINE AccumT
_CCCL_DEVICE
_CCCL_FORCEINLINE ::cuda::std::__enable_if_t<!enable_dpx_reduction<LENGTH, T, ReductionOp, PrefixT, AccumT>(), AccumT>
ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix, Int2Type<LENGTH> /*length*/)
{
AccumT retval = prefix;

#pragma unroll
for (int i = 0; i < LENGTH; ++i)
{
retval = reduction_op(retval, input[i]);
}

return retval;
}

//----------------------------------------------------------------------------------------------------------------------

/// Specialization for single-element arrays
template <int LENGTH, typename T, typename ReductionOp>
_CCCL_DEVICE _CCCL_FORCEINLINE ::cuda::std::__enable_if_t<LENGTH == 1, T>
ThreadReduce(T* input, ReductionOp reduction_op)
{
return input[0];
}

/**
* @brief Perform a sequential reduction over @p LENGTH elements of the @p input array,
* seeded with the specified @p prefix. The aggregate is returned.
* @brief Perform a sequential reduction over @p LENGTH elements of the @p input array.
* The aggregate is returned.
*
* @tparam LENGTH
* LengthT of input array
@@ -102,23 +169,48 @@ ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix, Int2Type<LENGTH
*
* @param[in] reduction_op
* Binary reduction operator
*
* @param[in] prefix
* Prefix to seed reduction with
*/
template <int LENGTH, typename T, typename ReductionOp>
_CCCL_DEVICE
_CCCL_FORCEINLINE ::cuda::std::__enable_if_t<(!enable_dpx_reduction<LENGTH, T, ReductionOp>() && LENGTH > 1), T>
ThreadReduce(T* input, ReductionOp reduction_op)
{
T prefix = input[0];
return ThreadReduce(input + 1, reduction_op, prefix, Int2Type<LENGTH - 1>{});
}

/// Specialization for DPX reduction
template <int LENGTH, typename T, typename ReductionOp>
_CCCL_NODISCARD _CCCL_DEVICE
_CCCL_FORCEINLINE ::cuda::std::__enable_if_t<enable_dpx_reduction<LENGTH, T, ReductionOp>(), T>
ThreadReduce(T* input, ReductionOp reduction_op)
{
constexpr auto IS_MIN = ::cuda::std::is_same<ReductionOp, cub::Min>::value;
using DpxReduceOp = ::cuda::std::_If<IS_MIN, DpxMin<T>, DpxMax<T>>;
auto unsigned_input = reinterpret_cast<unsigned*>(input);
auto simd_reduction = ThreadReduce<LENGTH / 2>(unsigned_input, DpxReduceOp{});
T simd_values[2]; // TODO (fbusato): use bit_cast
::memcpy(simd_values, &simd_reduction, sizeof(simd_values));
auto ret_value = reduction_op(simd_values[0], simd_values[1]);
return (LENGTH % 2 == 0) ? ret_value : reduction_op(ret_value, input[LENGTH - 1]);
}

/// Specialization for DPX reduction with prefix
template <int LENGTH,
typename T,
typename ReductionOp,
typename PrefixT,
typename AccumT = detail::accumulator_t<ReductionOp, PrefixT, T>>
_CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix)
_CCCL_NODISCARD _CCCL_DEVICE
_CCCL_FORCEINLINE ::cuda::std::__enable_if_t<enable_dpx_reduction<LENGTH, T, ReductionOp, PrefixT, AccumT>(), T>
ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix, Int2Type<LENGTH>)
{
return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>());
return reduction_op(ThreadReduce<LENGTH>(input, reduction_op), prefix);
}

/**
* @brief Perform a sequential reduction over @p LENGTH elements of the @p input array.
* The aggregate is returned.
* @brief Perform a sequential reduction over @p LENGTH elements of the @p input array,
* seeded with the specified @p prefix. The aggregate is returned.
*
* @tparam LENGTH
* LengthT of input array
Expand All @@ -135,12 +227,18 @@ _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T* input, ReductionOp reducti
*
* @param[in] reduction_op
* Binary reduction operator
*
* @param[in] prefix
* Prefix to seed reduction with
*/
template <int LENGTH, typename T, typename ReductionOp>
_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadReduce(T* input, ReductionOp reduction_op)
template <int LENGTH,
typename T,
typename ReductionOp,
typename PrefixT,
typename AccumT = detail::accumulator_t<ReductionOp, PrefixT, T>>
_CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix)
{
T prefix = input[0];
return ThreadReduce<LENGTH - 1>(input + 1, reduction_op, prefix);
return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>());
}

/**
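To make the decomposition in the DPX specialization concrete, here is a hedged host-side sketch of
the same algorithm for Min over int16_t (plain C++ standing in for the device code; simd_min_reduce
is an illustrative name, and emulate_vmins2 is the emulation sketched after thread_operators.cuh):

#include <cstdint>
#include <cstring>

unsigned emulate_vmins2(unsigned a, unsigned b); // per-halfword min, from the earlier sketch

// View the 16-bit array as packed 32-bit words, reduce per-halfword, then combine the two
// surviving lanes (and the odd trailing element, if any) with plain scalar comparisons.
::std::int16_t simd_min_reduce(const ::std::int16_t* input, int length)
{
  unsigned packed[8]; // this sketch assumes 2 <= length <= 16
  ::std::memcpy(packed, input, (length / 2) * sizeof(unsigned));
  unsigned acc = packed[0];
  for (int i = 1; i < length / 2; ++i)
  {
    acc = emulate_vmins2(acc, packed[i]); // each SIMD step reduces two lanes at once
  }
  ::std::int16_t lanes[2];
  ::std::memcpy(lanes, &acc, sizeof(lanes)); // the memcpy/bit_cast step from the diff
  ::std::int16_t result = lanes[0] < lanes[1] ? lanes[0] : lanes[1];
  if (length % 2 != 0) // odd length: the last element was never packed
  {
    result = input[length - 1] < result ? input[length - 1] : result;
  }
  return result;
}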
2 changes: 1 addition & 1 deletion cub/cub/warp/specializations/warp_reduce_shfl.cuh
@@ -127,7 +127,7 @@ struct WarpReduceShfl
{
enum
{
/// Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per
/// Whether the data type is a small (32b or less) integer for which we can use a single SHFL instruction per
/// exchange
IS_SMALL_UNSIGNED = (Traits<S>::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int))
};
15 changes: 11 additions & 4 deletions cub/test/catch2_test_device_reduce.cu
@@ -24,7 +24,6 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/

#include "insert_nested_NVTX_range_guard.h"
// above header needs to be included first

@@ -48,7 +47,7 @@ DECLARE_LAUNCH_WRAPPER(cub::DeviceReduce::Max, device_max);
DECLARE_LAUNCH_WRAPPER(cub::DeviceReduce::ArgMax, device_arg_max);

// %PARAM% TEST_LAUNCH lid 0:1:2
// %PARAM% TEST_TYPES types 0:1:2:3
// %PARAM% TEST_TYPES types 0:1:2:3:4

// List of types to test
using custom_t =
@@ -72,9 +71,13 @@ type_pair<custom_t>
#endif
#if TEST_BF_T
, type_pair<bfloat16_t> // testing bf16
#endif

>;
#endif
// clang-format on
#elif TEST_TYPES == 4
// DPX SIMD instructions
using full_type_list = c2h::type_list<type_pair<std::uint16_t>, type_pair<std::int16_t>>;
#endif

/**
@@ -124,6 +127,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f
}
auto d_in_it = thrust::raw_pointer_cast(in_items.data());

#if TEST_TYPES != 4
SECTION("reduce")
{
using op_t = cub::Sum;
@@ -145,10 +149,11 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f
// Verify result
REQUIRE(expected_result == out_result[0]);
}
#endif // TEST_TYPES != 4

// Skip DeviceReduce::Sum tests for extended floating-point types because of unbounded epsilon due
// to pseudo associativity of the addition operation over floating point numbers
#if TEST_TYPES != 3
#if TEST_TYPES != 3 && TEST_TYPES != 4
SECTION("sum")
{
using op_t = cub::Sum;
@@ -197,6 +202,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f
REQUIRE(expected_result == out_result[0]);
}

#if TEST_TYPES != 4
SECTION("argmax")
{
// Prepare verification data
@@ -233,4 +239,5 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f
REQUIRE(expected_result[0] == gpu_value);
REQUIRE((expected_result - host_items.cbegin()) == gpu_result.key);
}
#endif
}
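
For reviewers who want to exercise the new code path outside the Catch2 harness: the DPX thread
reduction is reached whenever a CUB reduction runs over int16_t or uint16_t with Min or Max on
Hopper+. A hedged sketch using the standard two-phase DeviceReduce call pattern; min_reduce_i16 is
an illustrative name:

#include <cub/device/device_reduce.cuh>
#include <cuda_runtime.h>
#include <cstddef>
#include <cstdint>

void min_reduce_i16(const ::std::int16_t* d_in, ::std::int16_t* d_out, int num_items)
{
  void* d_temp_storage             = nullptr;
  ::std::size_t temp_storage_bytes = 0;
  // First call queries the required temporary storage size.
  cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  // Second call performs the reduction.
  cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
  cudaFree(d_temp_storage);
}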