CUB - Enable DPX Reduction #2286

Merged
merged 40 commits from fbusato:cub/dpx-reduction into NVIDIA:main on Sep 6, 2024

Changes from 7 commits

Commits (40)
5bff5ed
add segmented [radix] sort tests for 64-bit indices
fbusato Aug 15, 2024
62ac106
Add test for device segmented sort pairs with 64-bit indices
fbusato Aug 16, 2024
994da36
Merge branch 'main' into cub/test/64bit_segmented_sort
fbusato Aug 16, 2024
5c0a167
greatly simplify the code by exploiting automatic compiler vectorization
fbusato Aug 23, 2024
d2ef0c9
Merge branch 'NVIDIA:main' into cub/dpx-reduction
fbusato Aug 26, 2024
7c18ab6
Address compatibility issues and PR suggestions
fbusato Aug 26, 2024
2112f03
fix formatting issues
fbusato Aug 26, 2024
bb4cd47
Merge branch 'NVIDIA:main' into cub/dpx-reduction
fbusato Aug 27, 2024
df45811
Add DPX benchmarking file
fbusato Aug 27, 2024
c8c5233
Apply bit_cast and minor fixes
fbusato Aug 27, 2024
718f263
Slightly improve heuristics to enable DPX
fbusato Aug 27, 2024
17e29f0
Merge remote-tracking branch 'upstream/main' into cub/dpx-reduction
fbusato Aug 28, 2024
285885a
merge cuda::std::__accumulator_t
fbusato Aug 28, 2024
4638ac7
Add Sum operator for DPX
fbusato Aug 29, 2024
32aaf6e
move enable_if_t to template type as workaround for MSVC
fbusato Aug 29, 2024
da64d59
Fix formatting issues
fbusato Aug 30, 2024
91febd6
Merge branch 'main' into cub/dpx-reduction
fbusato Aug 30, 2024
8e20651
Merge branch 'main' into cub/dpx-reduction
fbusato Aug 30, 2024
488f590
Fix documentation issues
fbusato Aug 30, 2024
923201e
Merge branch 'cub/dpx-reduction' of github.com:fbusato/cccl into cub/…
fbusato Aug 30, 2024
1a74d2b
Fix formatting issues
fbusato Aug 30, 2024
d90d444
replace SFINAE with if constexpr where possible
fbusato Aug 31, 2024
879083c
Merge branch 'main' into cub/dpx-reduction
fbusato Sep 3, 2024
5c9a3af
rewrote `thread_reduce.cuh`
fbusato Sep 4, 2024
56aa97f
Merge branch 'cub/dpx-reduction' of github.com:fbusato/cccl into cub/…
fbusato Sep 4, 2024
a9aeeb9
Merge branch 'main' into cub/dpx-reduction
fbusato Sep 4, 2024
623849e
Address most of the suggestions
fbusato Sep 4, 2024
c9b231b
Merge branch 'cub/dpx-reduction' of github.com:fbusato/cccl into cub/…
fbusato Sep 4, 2024
59b0e6e
Merge branch 'main' into cub/dpx-reduction
fbusato Sep 4, 2024
fe89cf5
Merge branch 'main' into cub/dpx-reduction
fbusato Sep 4, 2024
3faf4ba
Add deprecated remark
fbusato Sep 4, 2024
84fbc14
Merge branch 'cub/dpx-reduction' of github.com:fbusato/cccl into cub/…
fbusato Sep 4, 2024
b0c4388
doxygen fixes
fbusato Sep 4, 2024
b8a7033
try to fix MSVC 14.16
fbusato Sep 4, 2024
3bbe5f4
Add pointer interface remark
fbusato Sep 4, 2024
629381d
fix undefined behavior
fbusato Sep 5, 2024
0fbf1d0
Merge branch 'main' into cub/dpx-reduction
fbusato Sep 5, 2024
0dfbabf
removed std::plus
fbusato Sep 5, 2024
89921b2
Merge branch 'cub/dpx-reduction' of github.com:fbusato/cccl into cub/…
fbusato Sep 5, 2024
99f0548
Fix formatting issues
fbusato Sep 5, 2024
23 changes: 23 additions & 0 deletions cub/cub/detail/type_traits.cuh
@@ -45,6 +45,7 @@
#include <cub/util_cpp_dialect.cuh>
#include <cub/util_namespace.cuh>

#include <cuda/std/__cccl/dialect.h>
_CCCL_SUPPRESS_DEPRECATED_PUSH
#include <cuda/std/functional>
_CCCL_SUPPRESS_DEPRECATED_POP
@@ -66,5 +67,27 @@ using invoke_result_t =
template <typename Invokable, typename InitT, typename InputT>
using accumulator_t = typename ::cuda::std::decay<invoke_result_t<Invokable, InitT, InputT>>::type;

/**********************************************************************************************************************
* Additional type traits
**********************************************************************************************************************/

template <typename T, typename... TArgs>
_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool are_same()
{
return ::cuda::std::conjunction<::cuda::std::is_same<T, TArgs>...>::value;
}

template <typename T, typename... TArgs>
_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool is_one_of()
{
return ::cuda::std::disjunction<::cuda::std::is_same<T, TArgs>...>::value;
}

template <typename...>
_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool always_false()
{
return false;
}

} // namespace detail
CUB_NAMESPACE_END
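
Reviewer context, not part of the diff: the three helpers above are constexpr wrappers over
cuda::std::conjunction/disjunction, so they can gate templates at compile time the same way the DPX
paths below do. A minimal sketch, assuming the header is on the include path and the translation
unit is compiled with a CUDA compiler; is_dpx_candidate is a hypothetical name for illustration:

#include <cub/detail/type_traits.cuh>
#include <cstdint>

// A type qualifies for the DPX paths only if it is one of the two 16-bit integer types,
// mirroring how is_one_of() is used in thread_reduce.cuh.
template <typename T>
constexpr bool is_dpx_candidate()
{
  return cub::detail::is_one_of<T, ::std::int16_t, ::std::uint16_t>();
}

static_assert(is_dpx_candidate<::std::int16_t>(), "int16_t is a DPX candidate");
static_assert(!is_dpx_candidate<float>(), "float is not a DPX candidate");

// always_false<T>() plays the complementary role: a static_assert(always_false<T>(), ...) in an
// unspecialized template fires only when that template is actually instantiated.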
61 changes: 59 additions & 2 deletions cub/cub/thread/thread_operators.cuh
@@ -39,6 +39,8 @@

#include <cub/config.cuh>

#include <cub/detail/type_traits.cuh> // always_false

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
@@ -50,9 +52,7 @@
#include <cub/util_cpp_dialect.cuh>
#include <cub/util_type.cuh>

_CCCL_SUPPRESS_DEPRECATED_PUSH
#include <cuda/std/functional>
_CCCL_SUPPRESS_DEPRECATED_POP
#include <cuda/std/type_traits>
#include <cuda/std/utility>

@@ -413,4 +413,61 @@ _CCCL_HOST_DEVICE BinaryFlip<BinaryOpT> MakeBinaryFlip(BinaryOpT binary_op)
return BinaryFlip<BinaryOpT>(binary_op);
}

namespace internal
{

// TODO: Remove the DPX specialization when nvbug 4823237 is fixed

template <typename T>
struct DpxMin
{
static_assert(detail::always_false<T>(), "DpxMin is not supported for this type");
};

template <>
struct DpxMin<int16_t>
{
_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
{
return __vmins2(a, b);
}
};

template <>
struct DpxMin<uint16_t>
{
_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
{
return __vminu2(a, b);
}
};

//----------------------------------------------------------------------------------------------------------------------

template <typename T>
struct DpxMax
{
static_assert(detail::always_false<T>(), "DpxMax is not supported for this type");
};

template <>
struct DpxMax<int16_t>
{
_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
{
return __vmaxs2(a, b);
}
};

template <>
struct DpxMax<uint16_t>
{
_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
{
return __vmaxu2(a, b);
}
};

} // namespace internal

CUB_NAMESPACE_END
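
The functors above take and return unsigned because a single 32-bit register packs two 16-bit
lanes. As a hedged illustration of the semantics only (the real __vmins2 is a single hardware
instruction on Hopper+ and is emulated on older architectures), the per-halfword signed minimum
can be mirrored on the host like this; emulate_vmins2 is an illustrative name:

#include <cstdint>
#include <cstring>

// Host-side emulation of the per-halfword signed minimum computed by __vmins2.
inline unsigned emulate_vmins2(unsigned a, unsigned b)
{
  ::std::int16_t lanes_a[2], lanes_b[2], lanes_r[2];
  ::std::memcpy(lanes_a, &a, sizeof(a)); // unpack the two 16-bit lanes
  ::std::memcpy(lanes_b, &b, sizeof(b));
  lanes_r[0] = lanes_a[0] < lanes_b[0] ? lanes_a[0] : lanes_b[0];
  lanes_r[1] = lanes_a[1] < lanes_b[1] ? lanes_a[1] : lanes_b[1];
  unsigned result;
  ::std::memcpy(&result, lanes_r, sizeof(result)); // repack the lanes
  return result;
}

__vminu2, __vmaxs2, and __vmaxu2 follow the same pattern with unsigned and/or maximum lane
operations.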
134 changes: 116 additions & 18 deletions cub/cub/thread/thread_reduce.cuh
@@ -35,6 +35,12 @@

#include <cub/config.cuh>

#include <cuda/cmath> // ceil_div
#include <cuda/std/__cccl/attributes.h> // _CCCL_NODISCARD
#include <cuda/std/cstdint> // uint16_t
#include <cuda/std/limits> // numeric_limits
#include <cuda/std/type_traits> // __enable_if_t

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
@@ -43,15 +49,67 @@
# pragma system_header
#endif // no system header

#include <cub/detail/type_traits.cuh>
#include <cub/thread/thread_operators.cuh>
#include <cub/detail/type_traits.cuh> // are_same()
#include <cub/thread/thread_operators.cuh> // DpxMin
#include <cub/util_namespace.cuh>
#include <cub/util_type.cuh>

CUB_NAMESPACE_BEGIN

/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations)
namespace internal
{

/// DPX instructions compute min and max for up to three 16- and 32-bit signed or unsigned integer parameters.
/// See the DPX documentation: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#dpx
/// NOTE: The compiler is able to automatically vectorize all cases with 3 operands.
/// However, all other cases with per-halfword comparison need to be explicitly vectorized.
/// TODO: Remove the DPX specialization when nvbug 4823237 is fixed.
///
/// DPX reduction is enabled when all of the following conditions are met:
/// - Hopper+ architecture. DPX instructions are emulated before Hopper.
/// - The number of elements must be large enough to pay off (see the instruction counts below).
/// - All types must be the same.
/// - Only integral types of 2 bytes are supported.
/// - Only the Min and Max operations are supported, as DPX provides SIMD min/max instructions.
/// If the instruction count is the same, we favor the compiler's automatic vectorization.

template <int LENGTH, typename T, typename ReductionOp, typename PrefixT = T, typename AccumT = T>
_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // clang-format off
constexpr bool enable_dpx_reduction()
{
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(return (LENGTH == 6 || LENGTH == 8 || LENGTH >= 10) && detail::are_same<T, PrefixT, AccumT>()
&& detail::is_one_of<T, int16_t, uint16_t>() && detail::is_one_of<ReductionOp, cub::Min, cub::Max>();),
(return false;));
}
// clang-format on

// Considering compiler vectorization with 3-way reduction, the number of SASS instructions is
// standard: ceil((L - 3) / 2) + 1
// replacing L with L/2 for SIMD
// DPX: ceil((L/2 - 3) / 2) + 1 + 1 [for halfword comparison] + L % 2 [for last element]
//
// LENGTH | Standard | DPX
// 2 | 1 | NA
// 3 | 1 | NA
// 4 | 2 | 3
// 5 | 2 | 4
// 6 | 3 | 2 // *** (3-way comparison for DPX)
// 7 | 3 | 3
// 8 | 4 | 3 // ***
// 9 | 4 | 4
// 10 | 5 | 3 // ***
// 11 | 5 | 4 // ***
// 12 | 6 | 4 // ***
// 13 | 6 | 5 // ***
// 14 | 7 | 4 // ***
// 15 | 7 | 5 // ***
// 16 | 8 | 5 // ***

//----------------------------------------------------------------------------------------------------------------------

/**
* @brief Sequential reduction over statically-sized array types
*
@@ -69,23 +127,32 @@ template <int LENGTH,
typename ReductionOp,
typename PrefixT,
typename AccumT = detail::accumulator_t<ReductionOp, PrefixT, T>>
_CCCL_DEVICE _CCCL_FORCEINLINE AccumT
_CCCL_DEVICE
_CCCL_FORCEINLINE ::cuda::std::__enable_if_t<!enable_dpx_reduction<LENGTH, T, ReductionOp, PrefixT, AccumT>(), AccumT>
ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix, Int2Type<LENGTH> /*length*/)
{
AccumT retval = prefix;

#pragma unroll
for (int i = 0; i < LENGTH; ++i)
{
retval = reduction_op(retval, input[i]);
}

return retval;
}

//----------------------------------------------------------------------------------------------------------------------

/// Specialization for single-element arrays
template <int LENGTH, typename T, typename ReductionOp>
_CCCL_DEVICE _CCCL_FORCEINLINE ::cuda::std::__enable_if_t<LENGTH == 1, T>
ThreadReduce(T* input, ReductionOp reduction_op)
{
return input[0];
}

/**
* @brief Perform a sequential reduction over @p LENGTH elements of the @p input array,
* seeded with the specified @p prefix. The aggregate is returned.
* @brief Perform a sequential reduction over @p LENGTH elements of the @p input array.
* The aggregate is returned.
*
* @tparam LENGTH
* LengthT of input array
@@ -102,23 +169,48 @@ ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix, Int2Type<LENGTH
*
* @param[in] reduction_op
* Binary reduction operator
*
* @param[in] prefix
* Prefix to seed reduction with
*/
template <int LENGTH, typename T, typename ReductionOp>
_CCCL_DEVICE
_CCCL_FORCEINLINE ::cuda::std::__enable_if_t<(!enable_dpx_reduction<LENGTH, T, ReductionOp>() && LENGTH > 1), T>
ThreadReduce(T* input, ReductionOp reduction_op)
{
T prefix = input[0];
return ThreadReduce(input + 1, reduction_op, prefix, Int2Type<LENGTH - 1>{});
}

/// Specialization for DPX reduction
template <int LENGTH, typename T, typename ReductionOp>
_CCCL_NODISCARD _CCCL_DEVICE
_CCCL_FORCEINLINE ::cuda::std::__enable_if_t<enable_dpx_reduction<LENGTH, T, ReductionOp>(), T>
ThreadReduce(T* input, ReductionOp reduction_op)
{
constexpr auto IS_MIN = ::cuda::std::is_same<ReductionOp, cub::Min>::value;
using DpxReduceOp = ::cuda::std::_If<IS_MIN, DpxMin<T>, DpxMax<T>>;
auto unsigned_input = reinterpret_cast<unsigned*>(input);
auto simd_reduction = ThreadReduce<LENGTH / 2>(unsigned_input, DpxReduceOp{});
T simd_values[2]; // TODO (fbusato): use bit_cast
::memcpy(simd_values, &simd_reduction, sizeof(simd_values));
auto ret_value = reduction_op(simd_values[0], simd_values[1]);
return (LENGTH % 2 == 0) ? ret_value : reduction_op(ret_value, input[LENGTH - 1]);
}

/// Specialization for DPX reduction with prefix
template <int LENGTH,
typename T,
typename ReductionOp,
typename PrefixT,
typename AccumT = detail::accumulator_t<ReductionOp, PrefixT, T>>
_CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix)
_CCCL_NODISCARD _CCCL_DEVICE
_CCCL_FORCEINLINE ::cuda::std::__enable_if_t<enable_dpx_reduction<LENGTH, T, ReductionOp, PrefixT, AccumT>(), T>
ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix, Int2Type<LENGTH>)
{
return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>());
return reduction_op(ThreadReduce<LENGTH>(input, reduction_op), prefix);
}

/**
* @brief Perform a sequential reduction over @p LENGTH elements of the @p input array.
* The aggregate is returned.
* @brief Perform a sequential reduction over @p LENGTH elements of the @p input array,
* seeded with the specified @p prefix. The aggregate is returned.
*
* @tparam LENGTH
* LengthT of input array
Expand All @@ -135,12 +227,18 @@ _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T* input, ReductionOp reducti
*
* @param[in] reduction_op
* Binary reduction operator
*
* @param[in] prefix
* Prefix to seed reduction with
*/
template <int LENGTH, typename T, typename ReductionOp>
_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadReduce(T* input, ReductionOp reduction_op)
template <int LENGTH,
typename T,
typename ReductionOp,
typename PrefixT,
typename AccumT = detail::accumulator_t<ReductionOp, PrefixT, T>>
_CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix)
{
T prefix = input[0];
return ThreadReduce<LENGTH - 1>(input + 1, reduction_op, prefix);
return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>());
}

/**
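To make the decomposition in the DPX specialization concrete, here is a hedged host-side sketch of
the same algorithm for Min over int16_t (plain C++ standing in for the device code; simd_min_reduce
is an illustrative name, and emulate_vmins2 is the emulation sketched after thread_operators.cuh):

#include <cstdint>
#include <cstring>

unsigned emulate_vmins2(unsigned a, unsigned b); // per-halfword min, from the earlier sketch

// View the 16-bit array as packed 32-bit words, reduce per-halfword, then combine the two
// surviving lanes (and the odd trailing element, if any) with plain scalar comparisons.
::std::int16_t simd_min_reduce(const ::std::int16_t* input, int length)
{
  unsigned packed[8]; // this sketch assumes 2 <= length <= 16
  ::std::memcpy(packed, input, (length / 2) * sizeof(unsigned));
  unsigned acc = packed[0];
  for (int i = 1; i < length / 2; ++i)
  {
    acc = emulate_vmins2(acc, packed[i]); // each SIMD step reduces two lanes at once
  }
  ::std::int16_t lanes[2];
  ::std::memcpy(lanes, &acc, sizeof(lanes)); // the memcpy/bit_cast step from the diff
  ::std::int16_t result = lanes[0] < lanes[1] ? lanes[0] : lanes[1];
  if (length % 2 != 0) // odd length: the last element was never packed
  {
    result = input[length - 1] < result ? input[length - 1] : result;
  }
  return result;
}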
2 changes: 1 addition & 1 deletion cub/cub/warp/specializations/warp_reduce_shfl.cuh
@@ -127,7 +127,7 @@ struct WarpReduceShfl
{
enum
{
/// Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per
/// Whether the data type is a small (32b or less) integer for which we can use a single SHFL instruction per
/// exchange
IS_SMALL_UNSIGNED = (Traits<S>::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int))
};
15 changes: 11 additions & 4 deletions cub/test/catch2_test_device_reduce.cu
@@ -24,7 +24,6 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/

#include "insert_nested_NVTX_range_guard.h"
// above header needs to be included first

@@ -48,7 +47,7 @@ DECLARE_LAUNCH_WRAPPER(cub::DeviceReduce::Max, device_max);
DECLARE_LAUNCH_WRAPPER(cub::DeviceReduce::ArgMax, device_arg_max);

// %PARAM% TEST_LAUNCH lid 0:1:2
// %PARAM% TEST_TYPES types 0:1:2:3
// %PARAM% TEST_TYPES types 0:1:2:3:4

// List of types to test
using custom_t =
@@ -72,9 +71,13 @@ type_pair<custom_t>
#endif
#if TEST_BF_T
, type_pair<bfloat16_t> // testing bf16
#endif

>;
#endif
// clang-format on
#elif TEST_TYPES == 4
// DPX SIMD instructions
using full_type_list = c2h::type_list<type_pair<std::uint16_t>, type_pair<std::int16_t>>;
#endif

/**
@@ -124,6 +127,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f
}
auto d_in_it = thrust::raw_pointer_cast(in_items.data());

#if TEST_TYPES != 4
SECTION("reduce")
{
using op_t = cub::Sum;
@@ -145,10 +149,11 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f
// Verify result
REQUIRE(expected_result == out_result[0]);
}
#endif // TEST_TYPES != 4

// Skip DeviceReduce::Sum tests for extended floating-point types because of unbounded epsilon due
// to pseudo associativity of the addition operation over floating point numbers
#if TEST_TYPES != 3
#if TEST_TYPES != 3 && TEST_TYPES != 4
SECTION("sum")
{
using op_t = cub::Sum;
@@ -197,6 +202,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f
REQUIRE(expected_result == out_result[0]);
}

#if TEST_TYPES != 4
SECTION("argmax")
{
// Prepare verification data
@@ -233,4 +239,5 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f
REQUIRE(expected_result[0] == gpu_value);
REQUIRE((expected_result - host_items.cbegin()) == gpu_result.key);
}
#endif
}
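
For reviewers who want to exercise the new code path outside the Catch2 harness: the DPX thread
reduction is reached whenever a CUB reduction runs over int16_t or uint16_t with Min or Max on
Hopper+. A hedged sketch using the standard two-phase DeviceReduce call pattern; min_reduce_i16 is
an illustrative name:

#include <cub/device/device_reduce.cuh>
#include <cuda_runtime.h>
#include <cstddef>
#include <cstdint>

void min_reduce_i16(const ::std::int16_t* d_in, ::std::int16_t* d_out, int num_items)
{
  void* d_temp_storage             = nullptr;
  ::std::size_t temp_storage_bytes = 0;
  // First call queries the required temporary storage size.
  cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  // Second call performs the reduction.
  cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
  cudaFree(d_temp_storage);
}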