diff --git a/cub/benchmarks/bench/reduce/min.cu b/cub/benchmarks/bench/reduce/min.cu
new file mode 100644
index 0000000000..a6c149ffdd
--- /dev/null
+++ b/cub/benchmarks/bench/reduce/min.cu
@@ -0,0 +1,37 @@
+/******************************************************************************
+ * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+// NOTE: this benchmark is intended to cover DPX instructions on Hopper+ architectures.
+// It specifically uses cub::Min instead of a user-defined operator.
+#define TUNE_T int16_t
+#include <cub/device/device_reduce.cuh>
+
+// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
+// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
+// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1
+
+using op_t = cub::Min;
+#include "base.cuh"
diff --git a/cub/cub/detail/type_traits.cuh b/cub/cub/detail/type_traits.cuh
index ed505bb1fc..12dce69c13 100644
--- a/cub/cub/detail/type_traits.cuh
+++ b/cub/cub/detail/type_traits.cuh
@@ -50,6 +50,8 @@
 _CCCL_SUPPRESS_DEPRECATED_PUSH
 #include <cuda/std/functional>
 _CCCL_SUPPRESS_DEPRECATED_POP
 #include <cuda/std/type_traits>
 
+#define _CUB_TEMPLATE_REQUIRES(...) ::cuda::std::__enable_if_t<(__VA_ARGS__)>* = nullptr
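+// Usage sketch (illustrative only; `f` and `N` below are hypothetical, not part
+// of this header): the macro expands to an anonymous SFINAE template parameter,
+// so a constrained overload drops out of overload resolution when the condition
+// is false:
+//
+//   template <int N, typename T, _CUB_TEMPLATE_REQUIRES(N > 0)>
+//   void f(T (&)[N]); // viable only when N > 0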
+
 CUB_NAMESPACE_BEGIN
 namespace detail
 {
@@ -62,5 +64,101 @@ using invoke_result_t = ::cuda::std::invoke_result_t<F, Args...>;
 #endif
 
+template <typename T, typename... TArgs>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool are_same()
+{
+  return ::cuda::std::conjunction<::cuda::std::is_same<T, TArgs>...>::value;
+}
+
+template <typename T, typename... TArgs>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool is_one_of()
+{
+  return ::cuda::std::disjunction<::cuda::std::is_same<T, TArgs>...>::value;
+}
+
+template <typename...>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool always_false()
+{
+  return false;
+}
+
+template <typename T, typename V, typename = void>
+struct has_binary_call_operator : ::cuda::std::false_type
+{};
+
+template <typename T, typename V>
+struct has_binary_call_operator<
+  T,
+  V,
+  ::cuda::std::void_t<decltype(::cuda::std::declval<T>()(::cuda::std::declval<V>(), ::cuda::std::declval<V>()))>>
+    : ::cuda::std::true_type
+{};
+
+/***********************************************************************************************************************
+ * Array-like type traits
+ **********************************************************************************************************************/
+
+template <typename T, typename = void>
+struct has_subscript : ::cuda::std::false_type
+{};
+
+template <typename T>
+struct has_subscript<T, ::cuda::std::void_t<decltype(::cuda::std::declval<T>()[0])>> : ::cuda::std::true_type
+{};
+
+template <typename T>
+using has_subscript_t = typename has_subscript<T>::type;
+
+template <typename T, typename = void>
+struct has_size : ::cuda::std::false_type
+{};
+
+// TODO: use ::cuda::std::size(::cuda::std::declval<T>()) once std::size becomes available in libcu++
+template <typename T>
+struct has_size<T, ::cuda::std::void_t<decltype(::cuda::std::declval<T>().size())>> : ::cuda::std::true_type
+{};
+
+template <typename T, ::cuda::std::size_t N>
+struct has_size<T[N], void> : ::cuda::std::true_type
+{};
+
+template <typename T>
+using has_size_t = typename has_size<T>::type;
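+
+// Sanity-check sketch (illustrative only; these static_asserts are not part of
+// the header, they just show what the traits above evaluate to):
+//
+//   static_assert(are_same<int, int, int>(), "");
+//   static_assert(is_one_of<int, char, int, long>(), "");
+//   static_assert(has_subscript<int[4]>::value, "");
+//   static_assert(has_size<int[4]>::value, ""); // via the T[N] specialization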
+
+/***********************************************************************************************************************
+ * StaticSize: a type trait that returns the number of elements in an Array-like type
+ **********************************************************************************************************************/
+
+// StaticSize is useful where size(obj) cannot be checked at compile time, e.g.
+//
+//   using Array = NonTriviallyConstructible[8];
+//   std::size(Array{})     // compile error
+//   static_size<Array>()   // ok
+
+template <typename T, typename = void>
+struct StaticSize
+{
+  static_assert(detail::always_false<T>(), "StaticSize not supported for this type");
+};
+
+template <typename T>
+struct StaticSize<T, ::cuda::std::void_t<decltype(::cuda::std::integral_constant<int, T{}.size()>{})>>
+{
+  static_assert(::cuda::std::is_trivially_constructible<T>::value, "T must be trivially constructible");
+  static constexpr auto value = T{}.size();
+};
+
+template <typename T, ::cuda::std::size_t N>
+struct StaticSize<T[N], void>
+{
+  static constexpr auto value = N;
+};
+
+template <typename T>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::size_t static_size()
+{
+  return StaticSize<T>::value;
+}
+
 } // namespace detail
 
 CUB_NAMESPACE_END
diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh
index 21ed8592d6..4df4b49ac0 100644
--- a/cub/cub/thread/thread_operators.cuh
+++ b/cub/cub/thread/thread_operators.cuh
@@ -47,14 +47,15 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cub/detail/type_traits.cuh> // always_false
 #include <cub/config.cuh>
 #include <cub/util_type.cuh>
 
-_CCCL_SUPPRESS_DEPRECATED_PUSH
-#include <cuda/std/functional>
-_CCCL_SUPPRESS_DEPRECATED_POP
-#include <cuda/std/utility>
-#include <functional>
+#include <cuda/std/functional> // cuda::std::plus
+#include <cuda/std/type_traits> // cuda::std::common_type
+#include <cuda/std/utility> // cuda::std::forward
+
+// #include <functional> // std::plus
 
 CUB_NAMESPACE_BEGIN
 
@@ -413,4 +414,121 @@ _CCCL_HOST_DEVICE BinaryFlip<BinaryOpT> MakeBinaryFlip(BinaryOpT binary_op)
   return BinaryFlip<BinaryOpT>(binary_op);
 }
 
+namespace internal
+{
+// TODO: Remove the DPX specialization when nvbug 4823237 is fixed
+
+template <typename T>
+struct DpxMin
+{
+  static_assert(detail::always_false<T>(), "DpxMin is not supported for this type");
+};
+
+template <>
+struct DpxMin<::cuda::std::int16_t>
+{
+  _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+  {
+    return __vmins2(a, b);
+  }
+};
+
+template <>
+struct DpxMin<::cuda::std::uint16_t>
+{
+  _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+  {
+    return __vminu2(a, b);
+  }
+};
+
+//----------------------------------------------------------------------------------------------------------------------
+
+template <typename T>
+struct DpxMax
+{
+  static_assert(detail::always_false<T>(), "DpxMax is not supported for this type");
+};
+
+template <>
+struct DpxMax<::cuda::std::int16_t>
+{
+  _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+  {
+    return __vmaxs2(a, b);
+  }
+};
+
+template <>
+struct DpxMax<::cuda::std::uint16_t>
+{
+  _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+  {
+    return __vmaxu2(a, b);
+  }
+};
+
+//----------------------------------------------------------------------------------------------------------------------
+
+template <typename T>
+struct DpxSum
+{
+  static_assert(detail::always_false<T>(), "DpxSum is not supported for this type");
+};
+
+template <>
+struct DpxSum<::cuda::std::int16_t>
+{
+  _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+  {
+    return __vadd2(a, b);
+  }
+};
+
+template <>
+struct DpxSum<::cuda::std::uint16_t>
+{
+  _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+  {
+    return __vadd2(a, b);
+  }
+};
+
+//----------------------------------------------------------------------------------------------------------------------
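+
+// Lane-level sketch (illustrative, hypothetical values): each functor treats its
+// 32-bit operands as two independent 16-bit lanes. For DpxMin<int16_t>, with
+// a = {1, -4} and b = {-2, 3} packed as halfwords, __vmins2(a, b) yields
+// {-2, -4}: a per-lane minimum computed in a single instruction.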
+
+template <typename Operator, typename T>
+struct CubOperatorToDpx
+{
+  static_assert(detail::always_false<T>(), "DPX is not supported for this operator");
+};
+
+template <typename T>
+struct CubOperatorToDpx<cub::Min, T>
+{
+  using type = DpxMin<T>;
+};
+
+template <typename T>
+struct CubOperatorToDpx<cub::Max, T>
+{
+  using type = DpxMax<T>;
+};
+
+template <typename T>
+struct CubOperatorToDpx<cub::Sum, T>
+{
+  using type = DpxSum<T>;
+};
+
+// template <typename T>
+// struct CubOperatorToDpx<::cuda::std::plus<T>, T>
+// {
+//   using type = DpxSum<T>;
+// };
+
+template <typename Operator, typename T>
+using cub_operator_to_dpx_t = typename CubOperatorToDpx<Operator, T>::type;
+
+} // namespace internal
+
 CUB_NAMESPACE_END
diff --git a/cub/cub/thread/thread_reduce.cuh b/cub/cub/thread/thread_reduce.cuh
index 7ac9836925..a956321f78 100644
--- a/cub/cub/thread/thread_reduce.cuh
+++ b/cub/cub/thread/thread_reduce.cuh
@@ -28,7 +28,7 @@
 
 /**
  * @file
- * Thread utilities for sequential reduction over statically-sized array types
+ * Thread reduction over statically-sized array-like types
  */
 
 #pragma once
@@ -43,8 +43,17 @@
 #  pragma system_header
 #endif // no system header
 
-#include <cub/detail/type_traits.cuh>
-#include <cub/thread/thread_operators.cuh>
+#include <cub/detail/type_traits.cuh> // are_same()
+#include <cub/thread/thread_operators.cuh> // cub_operator_to_dpx_t
+#include <cub/config.cuh>
+#include <nv/target> // NV_IF_TARGET
+
+#include <cuda/std/bit> // bit_cast
+#include <cuda/std/cstdint> // uint16_t
+#include <cuda/std/functional> // cuda::std::plus
+#include <cuda/std/utility> // pair
+
+// #include <functional> // std::plus
 
 CUB_NAMESPACE_BEGIN
 
@@ -52,51 +61,143 @@ CUB_NAMESPACE_BEGIN
 namespace internal
 {
 
-/**
- * @brief Sequential reduction over statically-sized array types
- *
- * @param[in] input
- *   Input array
- *
- * @param[in] reduction_op
- *   Binary reduction operator
- *
- * @param[in] prefix
- *   Prefix to seed reduction with
- */
-template <int LENGTH,
-          typename T,
-          typename ReductionOp,
-          typename PrefixT,
-          typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, T, PrefixT>>
-_CCCL_DEVICE _CCCL_FORCEINLINE AccumT
-ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix, Int2Type<LENGTH> /*length*/)
-{
-  AccumT retval = prefix;
-
-#pragma unroll
-  for (int i = 0; i < LENGTH; ++i)
-  {
-    retval = reduction_op(retval, input[i]);
-  }
-  return retval;
-}
-
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
+/// DPX instructions compute min, max, and sum for up to three 16- and 32-bit signed or unsigned integer parameters;
+/// see the DPX documentation: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#dpx
+/// NOTE: The compiler is able to automatically vectorize all cases with 3 operands.
+/// However, all other cases with per-halfword comparison need to be explicitly vectorized.
+/// TODO: Remove the DPX specialization when nvbug 4823237 is fixed.
+///
+/// DPX reduction is enabled if the following conditions are met:
+/// - Hopper+ architectures. DPX instructions are emulated before Hopper.
+/// - The number of elements must be large enough for it to pay off (see below).
+/// - All types must be the same.
+/// - Only works with integral types of 2 bytes.
+/// - DPX instructions provide Min, Max, and Sum SIMD operations.
+/// If the number of instructions is the same, we favor the compiler.
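+///
+/// For example (illustrative): 16 int16_t elements reduced with cub::Min take the
+/// DPX path on SM90+, while 4 elements, 32-bit element types, or a user-defined
+/// operator all fall back to the sequential reduction below.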
+
+// clang-format off
+template <typename Input, typename ReductionOp, typename AccumT>
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE
+constexpr bool enable_dpx_reduction()
+{
+  using T = ::cuda::std::__remove_cvref_t<decltype(::cuda::std::declval<Input>()[0])>;
+  // TODO: use a constexpr variable once C++14 is the minimum supported standard
+  using Length = ::cuda::std::integral_constant<int, detail::static_size<Input>()>;
+  return ((Length{} >= 9 && detail::are_same<ReductionOp, cub::Sum /*, ::cuda::std::plus<T>*/>()) || Length{} >= 10)
+          && detail::are_same<T, AccumT>()
+          && detail::is_one_of<T, ::cuda::std::int16_t, ::cuda::std::uint16_t>()
+          && detail::is_one_of<ReductionOp, cub::Min, cub::Max, cub::Sum /*, ::cuda::std::plus<T>*/>();
+}
+// clang-format on
+
+// Considering compiler vectorization with the 3-way comparison, the number of SASS instructions is
+// Standard: ceil((L - 3) / 2) + 1
+// replacing L with L/2 for SIMD:
+// DPX:      ceil((L/2 - 3) / 2) + 1 + 2 [for halfword comparison: PRMT, VIMNMX] + L % 2 [for the last element]
+// finally, the last two comparison operations are vectorized in a 3-way reduction:
+//           ceil((L/2 - 3) / 2) + 3
+//
+// length | Standard | DPX
+//  2     |    1     | NA
+//  3     |    1     | NA
+//  4     |    2     |  3
+//  5     |    2     |  3
+//  6     |    3     |  3
+//  7     |    3     |  3
+//  8     |    4     |  4
+//  9     |    4     |  4
+// 10     |    5     |  4 // ***
+// 11     |    5     |  4 // ***
+// 12     |    6     |  5 // ***
+// 13     |    6     |  5 // ***
+// 14     |    7     |  5 // ***
+// 15     |    7     |  5 // ***
+// 16     |    8     |  6 // ***
+
+template <typename AccumT, typename Input, typename ReductionOp>
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT
+ThreadReduceSequential(const Input& input, ReductionOp reduction_op)
+{
+  AccumT retval = input[0];
+#  pragma unroll
+  for (int i = 1; i < detail::static_size<Input>(); ++i)
+  {
+    retval = reduction_op(retval, input[i]);
+  }
+  return retval;
+}
+
+/// Specialization for DPX reduction
+template <typename Input, typename ReductionOp>
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE auto
+ThreadReduceDpx(const Input& input, ReductionOp reduction_op) -> ::cuda::std::__remove_cvref_t<decltype(input[0])>
+{
+  using T              = ::cuda::std::__remove_cvref_t<decltype(input[0])>;
+  constexpr int length = detail::static_size<Input>();
+  T array[length];
+#  pragma unroll
+  for (int i = 0; i < length; ++i)
+  {
+    array[i] = input[i];
+  }
+  using DpxReduceOp    = cub_operator_to_dpx_t<ReductionOp, T>;
+  using SimdType       = ::cuda::std::pair<T, T>;
+  auto& unsigned_input = reinterpret_cast<unsigned(&)[length / 2]>(array);
+  auto simd_reduction  = ThreadReduceSequential<unsigned>(unsigned_input, DpxReduceOp{});
+  auto simd_values     = ::cuda::std::bit_cast<SimdType>(simd_reduction);
+  auto ret_value       = reduction_op(simd_values.first, simd_values.second);
+  return (length % 2 == 0) ? ret_value : reduction_op(ret_value, input[length - 1]);
+}
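+
+// Walk-through sketch (illustrative, hypothetical values): for length == 8 and
+// cub::Min over int16_t, `array` is reinterpreted as unsigned[4], reduced to a
+// single 32-bit word via __vmins2 (per-halfword min), and the two 16-bit lanes
+// of that word are combined with the scalar reduction_op. An odd element count
+// folds input[length - 1] in as a final step.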
+
+// DPX/Sequential dispatch
+template <typename Input,
+          typename ReductionOp,
+          typename ValueT = ::cuda::std::__remove_cvref_t<decltype(::cuda::std::declval<Input>()[0])>,
+          typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, ValueT, ValueT>,
+          _CUB_TEMPLATE_REQUIRES(enable_dpx_reduction<Input, ReductionOp, AccumT>())>
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const Input& input, ReductionOp reduction_op)
+{
+  static_assert(detail::has_subscript<Input>::value, "Input must support the subscript operator[]");
+  static_assert(detail::has_size<Input>::value, "Input must have the size() method");
+  static_assert(detail::has_binary_call_operator<ReductionOp, ValueT>::value,
+                "ReductionOp must have the binary call operator: operator(ValueT, ValueT)");
+  NV_IF_TARGET(NV_PROVIDES_SM_90,
+               (return ThreadReduceDpx(input, reduction_op);),
+               (return ThreadReduceSequential<AccumT>(input, reduction_op);))
+}
+
+template <typename Input,
+          typename ReductionOp,
+          typename ValueT = ::cuda::std::__remove_cvref_t<decltype(::cuda::std::declval<Input>()[0])>,
+          typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, ValueT, ValueT>,
+          _CUB_TEMPLATE_REQUIRES(!enable_dpx_reduction<Input, ReductionOp, AccumT>())>
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const Input& input, ReductionOp reduction_op)
+{
+  static_assert(detail::has_subscript<Input>::value, "Input must support the subscript operator[]");
+  static_assert(detail::has_size<Input>::value, "Input must have the size() method");
+  static_assert(detail::has_binary_call_operator<ReductionOp, ValueT>::value,
+                "ReductionOp must have the binary call operator: operator(ValueT, ValueT)");
+  return ThreadReduceSequential<AccumT>(input, reduction_op);
+}
+
+#endif // !DOXYGEN_SHOULD_SKIP_THIS
+
@@ -105,101 +206,122 @@ ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix, Int2Type<LENGTH>
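+// Usage sketch (illustrative only; `thread_data` is a hypothetical per-thread array):
+//
+//   int16_t thread_data[16] = { /* ... */ };
+//   int16_t result = cub::internal::ThreadReduce(thread_data, cub::Min{});
+//   // DPX path on SM90+, sequential path otherwise
+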
 /**
- * @brief Perform a sequential reduction over @p LENGTH elements of the @p input array,
- *        seeded with the specified @p prefix. The aggregate is returned.
+ * @brief Reduction over statically-sized array-like types, seeded with the specified @p prefix.
  *
- * @tparam LENGTH
- *   LengthT of input array
- *
- * @tparam T
- *   [inferred] The data type to be reduced.
+ * @tparam Input
+ *   [inferred] The data type to be reduced, having member
+ *   operator[](int i); must be statically-sized (size() method or static array)
  *
  * @tparam ReductionOp
  *   [inferred] Binary reduction operator type having member
  *   T operator()(const T &a, const T &b)
  *
+ * @tparam PrefixT
+ *   [inferred] The prefix type
+ *
  * @param[in] input
  *   Input array
  *
  * @param[in] reduction_op
  *   Binary reduction operator
  *
  * @param[in] prefix
  *   Prefix to seed reduction with
+ *
+ * @return Aggregate of type cuda::std::__accumulator_t<ReductionOp, ValueT, PrefixT>
  */
-template <int LENGTH,
-          typename T,
+template <typename Input,
           typename ReductionOp,
           typename PrefixT,
-          typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, T, PrefixT>>
-_CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix)
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+          typename ValueT = ::cuda::std::__remove_cvref_t<decltype(::cuda::std::declval<Input>()[0])>,
+#endif // !DOXYGEN_SHOULD_SKIP_THIS
+          typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, ValueT, PrefixT>>
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT
+ThreadReduce(const Input& input, ReductionOp reduction_op, PrefixT prefix)
 {
-  return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>());
+  static_assert(detail::has_subscript<Input>::value, "Input must support the subscript operator[]");
+  static_assert(detail::has_size<Input>::value, "Input must have the size() method");
+  static_assert(detail::has_binary_call_operator<ReductionOp, ValueT>::value,
+                "ReductionOp must have the binary call operator: operator(ValueT, ValueT)");
+  constexpr int length = detail::static_size<Input>();
+  // copy the input into a temporary array of type AccumT
+  AccumT array[length + 1];
+  array[0] = prefix;
+#pragma unroll
+  for (int i = 0; i < length; ++i)
+  {
+    array[i + 1] = input[i];
+  }
+  return ThreadReduce(array, reduction_op);
 }
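+
+// For example (illustrative, hypothetical values): given int a[3] = {1, 2, 3},
+// ThreadReduce(a, cub::Sum{}, 10) reduces the temporary {10, 1, 2, 3} and returns 16.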
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
 /**
+ * @remark The pointer interface adds little value and requires Length to be explicit.
+ *         Prefer using the array-like interface.
+ *
- * @brief Perform a sequential reduction over @p LENGTH elements of the @p input array.
- *        The aggregate is returned.
+ * @brief Perform a sequential reduction over @p Length elements of the @p input pointer. The aggregate is returned.
  *
- * @tparam LENGTH
- *   LengthT of input array
+ * @tparam Length
+ *   Length of input pointer
  *
  * @tparam T
- *   [inferred] The data type to be reduced.
+ *   [inferred] The data type to be reduced
  *
  * @tparam ReductionOp
  *   [inferred] Binary reduction operator type having member
  *   T operator()(const T &a, const T &b)
  *
  * @param[in] input
- *   Input array
+ *   Input pointer
  *
  * @param[in] reduction_op
  *   Binary reduction operator
+ *
+ * @return Aggregate of type cuda::std::__accumulator_t<ReductionOp, T, T>
  */
-template <int LENGTH, typename T, typename ReductionOp>
-_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadReduce(T* input, ReductionOp reduction_op)
+template <int Length, typename T, typename ReductionOp, typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, T, T>>
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const T* input, ReductionOp reduction_op)
 {
-  T prefix = input[0];
-  return ThreadReduce<LENGTH - 1>(input + 1, reduction_op, prefix);
+  static_assert(Length > 0, "Length must be greater than 0");
+  static_assert(detail::has_binary_call_operator<ReductionOp, T>::value,
+                "ReductionOp must have the binary call operator: operator(V1, V2)");
+  using ArrayT = T[Length];
+  auto array   = reinterpret_cast<const ArrayT*>(input);
+  return ThreadReduce(*array, reduction_op);
 }
 
 /**
+ * @remark The pointer interface adds little value and requires Length to be explicit.
+ *         Prefer using the array-like interface.
+ *
- * @brief Perform a sequential reduction over the statically-sized @p input array,
- *        seeded with the specified @p prefix. The aggregate is returned.
+ * @brief Perform a sequential reduction over @p Length elements of the @p input pointer, seeded with the specified
+ *        @p prefix. The aggregate is returned.
  *
- * @tparam LENGTH
- *   [inferred] LengthT of @p input array
+ * @tparam Length
+ *   Length of input pointer
  *
  * @tparam T
- *   [inferred] The data type to be reduced.
+ *   [inferred] The data type to be reduced
  *
  * @tparam ReductionOp
  *   [inferred] Binary reduction operator type having member
  *   T operator()(const T &a, const T &b)
  *
+ * @tparam PrefixT
+ *   [inferred] The prefix type
+ *
  * @param[in] input
- *   Input array
+ *   Input pointer
  *
  * @param[in] reduction_op
  *   Binary reduction operator
  *
  * @param[in] prefix
  *   Prefix to seed reduction with
+ *
+ * @return Aggregate of type cuda::std::__accumulator_t<ReductionOp, T, PrefixT>
  */
-template <int LENGTH,
+template <int Length,
           typename T,
           typename ReductionOp,
           typename PrefixT,
-          typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, T, PrefixT>>
-_CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T (&input)[LENGTH], ReductionOp reduction_op, PrefixT prefix)
+          typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, T, PrefixT>,
+          _CUB_TEMPLATE_REQUIRES(Length > 0)>
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT
+ThreadReduce(const T* input, ReductionOp reduction_op, PrefixT prefix)
 {
-  return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>());
+  static_assert(detail::has_binary_call_operator<ReductionOp, T>::value,
+                "ReductionOp must have the binary call operator: operator(V1, V2)");
+  auto array = reinterpret_cast<const T(*)[Length]>(input);
+  return ThreadReduce(*array, reduction_op, prefix);
 }
 
-/**
- * @brief Serial reduction with the specified operator
- *
- * @tparam LENGTH
- *   [inferred] LengthT of @p input array
- *
- * @tparam T
- *   [inferred] The data type to be reduced.
- *
- * @tparam ReductionOp
- *   [inferred] Binary reduction operator type having member
- *   T operator()(const T &a, const T &b)
- *
- * @param[in] input
- *   Input array
- *
- * @param[in] reduction_op
- *   Binary reduction operator
- */
-template <int LENGTH, typename T, typename ReductionOp>
-_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadReduce(T (&input)[LENGTH], ReductionOp reduction_op)
+template <int Length, typename T, typename ReductionOp, typename PrefixT, _CUB_TEMPLATE_REQUIRES(Length == 0)>
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadReduce(const T*, ReductionOp, PrefixT prefix)
 {
-  return ThreadReduce((T*) input, reduction_op);
+  return prefix;
 }
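+
+// Usage sketch (illustrative only; `ptr` is a hypothetical device pointer): the
+// explicit-length pointer overloads simply forward to the array-like interface:
+//
+//   auto partial = cub::internal::ThreadReduce<8>(ptr, cub::Max{}); // reduces ptr[0..7]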
+
+#endif // !DOXYGEN_SHOULD_SKIP_THIS
+
 } // namespace internal
 
 CUB_NAMESPACE_END
diff --git a/cub/cub/warp/specializations/warp_reduce_shfl.cuh b/cub/cub/warp/specializations/warp_reduce_shfl.cuh
index 41b23e6dff..fdd4083c37 100644
--- a/cub/cub/warp/specializations/warp_reduce_shfl.cuh
+++ b/cub/cub/warp/specializations/warp_reduce_shfl.cuh
@@ -127,7 +127,7 @@ struct WarpReduceShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>
 {
   enum
   {
-    /// Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per
+    /// Whether the data type is a small (32b or less) integer for which we can use a single SHFL instruction per
     /// exchange
     IS_SMALL_UNSIGNED = (Traits<S>::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int))
   };
diff --git a/cub/test/catch2_test_device_reduce.cu b/cub/test/catch2_test_device_reduce.cu
index 290e8d8f6a..bfd7c3e8a2 100644
--- a/cub/test/catch2_test_device_reduce.cu
+++ b/cub/test/catch2_test_device_reduce.cu
@@ -24,7 +24,6 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  ******************************************************************************/
-
 #include "insert_nested_NVTX_range_guard.h"
 // above header needs to be included first
 
@@ -48,7 +47,7 @@
 DECLARE_LAUNCH_WRAPPER(cub::DeviceReduce::Max, device_max);
 DECLARE_LAUNCH_WRAPPER(cub::DeviceReduce::ArgMax, device_arg_max);
 
 // %PARAM% TEST_LAUNCH lid 0:1:2
-// %PARAM% TEST_TYPES types 0:1:2:3
+// %PARAM% TEST_TYPES types 0:1:2:3:4
 
 // List of types to test
 using custom_t =
@@ -72,9 +71,13 @@ type_pair<half_t, half_t> // testing half
 #endif
 #if TEST_BF_T
 , type_pair<bfloat16_t, bfloat16_t> // testing bf16
 #endif
   >;
 // clang-format on
+#elif TEST_TYPES == 4
+// DPX SIMD instructions
+using full_type_list = c2h::type_list<type_pair<std::int16_t, std::int16_t>, type_pair<std::uint16_t, std::uint16_t>>;
 #endif
 
 /**
@@ -124,6 +127,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", full_type_list)
   }
 
   auto d_in_it = thrust::raw_pointer_cast(in_items.data());
 
+#if TEST_TYPES != 4
   SECTION("reduce")
   {
     using op_t = cub::Sum;
@@ -145,6 +149,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", full_type_list)
     // Verify result
     REQUIRE(expected_result == out_result[0]);
   }
+#endif // TEST_TYPES != 4
 
   // Skip DeviceReduce::Sum tests for extended floating-point types because of unbounded epsilon due
   // to pseudo associativity of the addition operation over floating point numbers
@@ -197,6 +202,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", full_type_list)
     REQUIRE(expected_result == out_result[0]);
   }
 
+#if TEST_TYPES != 4
   SECTION("argmax")
   {
     // Prepare verification data
@@ -233,4 +239,5 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", full_type_list)
     REQUIRE(expected_result[0] == gpu_value);
     REQUIRE((expected_result - host_items.cbegin()) == gpu_result.key);
   }
+#endif
 }
diff --git a/docs/repo.toml b/docs/repo.toml
index 5ef5eed3b7..f4c7fa4d77 100644
--- a/docs/repo.toml
+++ b/docs/repo.toml
@@ -145,7 +145,9 @@ doxygen_predefined = [
     "_CCCL_DEVICE",
     "_CCCL_HOST_DEVICE",
     "_CCCL_FORCEINLINE",
+    "_CUB_TEMPLATE_REQUIRES(x)",
     "_CCCL_STD_VER",
+    "_CCCL_NODISCARD",
     "_CCCL_VISIBILITY_HIDDEN",
     "_CCCL_SUPPRESS_DEPRECATED_PUSH",
     "_CCCL_SUPPRESS_DEPRECATED_POP",