diff --git a/cub/benchmarks/bench/reduce/min.cu b/cub/benchmarks/bench/reduce/min.cu
new file mode 100644
index 0000000000..a6c149ffdd
--- /dev/null
+++ b/cub/benchmarks/bench/reduce/min.cu
@@ -0,0 +1,37 @@
+/******************************************************************************
+ * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+// NOTE: this benchmark is intended to cover DPX instructions on Hopper+ architectures.
+// It specifically uses cub::Min instead of a user-defined operator.
+#define TUNE_T int16_t
+#include <cub/device/device_reduce.cuh>
+
+// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
+// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
+// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1
+
+using op_t = cub::Min;
+#include "base.cuh"
diff --git a/cub/cub/detail/type_traits.cuh b/cub/cub/detail/type_traits.cuh
index ed505bb1fc..12dce69c13 100644
--- a/cub/cub/detail/type_traits.cuh
+++ b/cub/cub/detail/type_traits.cuh
@@ -50,6 +50,8 @@
 _CCCL_SUPPRESS_DEPRECATED_PUSH
 #include <cuda/std/functional>
 _CCCL_SUPPRESS_DEPRECATED_POP
 #include <cuda/std/type_traits>
 
+#define _CUB_TEMPLATE_REQUIRES(...) ::cuda::std::__enable_if_t<(__VA_ARGS__)>* = nullptr
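+// Usage sketch (illustrative only; `f` and `N` below are hypothetical, not part
+// of this header): the macro expands to an anonymous SFINAE template parameter,
+// so a constrained overload drops out of overload resolution when the condition
+// is false:
+//
+//   template <int N, typename T, _CUB_TEMPLATE_REQUIRES(N > 0)>
+//   void f(T (&)[N]); // viable only when N > 0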
+
 CUB_NAMESPACE_BEGIN
 namespace detail
 {
@@ -62,5 +64,101 @@ using invoke_result_t = ::cuda::std::invoke_result_t<F, Args...>;
 #endif
 
+template <typename T, typename... TArgs>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool are_same()
+{
+  return ::cuda::std::conjunction<::cuda::std::is_same<T, TArgs>...>::value;
+}
+
+template <typename T, typename... TArgs>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool is_one_of()
+{
+  return ::cuda::std::disjunction<::cuda::std::is_same<T, TArgs>...>::value;
+}
+
+template <typename...>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool always_false()
+{
+  return false;
+}
+
+template <typename T, typename V, typename = void>
+struct has_binary_call_operator : ::cuda::std::false_type
+{};
+
+template <typename T, typename V>
+struct has_binary_call_operator<
+  T,
+  V,
+  ::cuda::std::void_t<decltype(::cuda::std::declval<T>()(::cuda::std::declval<V>(), ::cuda::std::declval<V>()))>>
+    : ::cuda::std::true_type
+{};
+
+/***********************************************************************************************************************
+ * Array-like type traits
+ **********************************************************************************************************************/
+
+template <typename T, typename = void>
+struct has_subscript : ::cuda::std::false_type
+{};
+
+template <typename T>
+struct has_subscript<T, ::cuda::std::void_t<decltype(::cuda::std::declval<T>()[0])>> : ::cuda::std::true_type
+{};
+
+template <typename T>
+using has_subscript_t = typename has_subscript<T>::type;
+
+template <typename T, typename = void>
+struct has_size : ::cuda::std::false_type
+{};
+
+// TODO: use ::cuda::std::size(::cuda::std::declval<T>()) once std::size becomes available in libcu++
+template <typename T>
+struct has_size<T, ::cuda::std::void_t<decltype(::cuda::std::declval<T>().size())>> : ::cuda::std::true_type
+{};
+
+template <typename T, ::cuda::std::size_t N>
+struct has_size<T[N], void> : ::cuda::std::true_type
+{};
+
+template <typename T>
+using has_size_t = typename has_size<T>::type;
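+
+// Sanity-check sketch (illustrative only; these static_asserts are not part of
+// the header, they just show what the traits above evaluate to):
+//
+//   static_assert(are_same<int, int, int>(), "");
+//   static_assert(is_one_of<int, char, int, long>(), "");
+//   static_assert(has_subscript<int[4]>::value, "");
+//   static_assert(has_size<int[4]>::value, ""); // via the T[N] specialization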
+
+/***********************************************************************************************************************
+ * StaticSize: a type trait that returns the number of elements in an Array-like type
+ **********************************************************************************************************************/
+
+// StaticSize is useful where size(obj) cannot be checked at compile time, e.g.
+//
+//   using Array = NonTriviallyConstructible[8];
+//   std::size(Array{})     // compile error
+//   static_size<Array>()   // ok
+
+template <typename T, typename = void>
+struct StaticSize
+{
+  static_assert(detail::always_false<T>(), "StaticSize not supported for this type");
+};
+
+template <typename T>
+struct StaticSize<T, ::cuda::std::void_t<decltype(::cuda::std::integral_constant<int, T{}.size()>{})>>
+{
+  static_assert(::cuda::std::is_trivially_constructible<T>::value, "T must be trivially constructible");
+  static constexpr auto value = T{}.size();
+};
+
+template <typename T, ::cuda::std::size_t N>
+struct StaticSize<T[N], void>
+{
+  static constexpr auto value = N;
+};
+
+template <typename T>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::size_t static_size()
+{
+  return StaticSize<T>::value;
+}
+
 } // namespace detail
 
 CUB_NAMESPACE_END
diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh
index 21ed8592d6..4df4b49ac0 100644
--- a/cub/cub/thread/thread_operators.cuh
+++ b/cub/cub/thread/thread_operators.cuh
@@ -47,14 +47,15 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cub/detail/type_traits.cuh> // always_false
 #include <cub/config.cuh>
 #include <cub/util_type.cuh>
 
-_CCCL_SUPPRESS_DEPRECATED_PUSH
-#include <cuda/std/functional>
-_CCCL_SUPPRESS_DEPRECATED_POP
-#include <cuda/std/utility>
-#include <functional>
+#include <cuda/std/functional> // cuda::std::plus
+#include <cuda/std/type_traits> // cuda::std::common_type
+#include <cuda/std/utility> // cuda::std::forward
+
+// #include <functional> // std::plus
 
 CUB_NAMESPACE_BEGIN
 
@@ -413,4 +414,121 @@ _CCCL_HOST_DEVICE BinaryFlip<BinaryOpT> MakeBinaryFlip(BinaryOpT binary_op)
   return BinaryFlip<BinaryOpT>(binary_op);
 }
 
+namespace internal
+{
+// TODO: Remove the DPX specialization when nvbug 4823237 is fixed
+
+template <typename T>
+struct DpxMin
+{
+  static_assert(detail::always_false<T>(), "DpxMin is not supported for this type");
+};
+
+template <>
+struct DpxMin<::cuda::std::int16_t>
+{
+  _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+  {
+    return __vmins2(a, b);
+  }
+};
+
+template <>
+struct DpxMin<::cuda::std::uint16_t>
+{
+  _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+  {
+    return __vminu2(a, b);
+  }
+};
+
+//----------------------------------------------------------------------------------------------------------------------
+
+template <typename T>
+struct DpxMax
+{
+  static_assert(detail::always_false<T>(), "DpxMax is not supported for this type");
+};
+
+template <>
+struct DpxMax<::cuda::std::int16_t>
+{
+  _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+  {
+    return __vmaxs2(a, b);
+  }
+};
+
+template <>
+struct DpxMax<::cuda::std::uint16_t>
+{
+  _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+  {
+    return __vmaxu2(a, b);
+  }
+};
+
+//----------------------------------------------------------------------------------------------------------------------
+
+template <typename T>
+struct DpxSum
+{
+  static_assert(detail::always_false<T>(), "DpxSum is not supported for this type");
+};
+
+template <>
+struct DpxSum<::cuda::std::int16_t>
+{
+  _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+  {
+    return __vadd2(a, b);
+  }
+};
+
+template <>
+struct DpxSum<::cuda::std::uint16_t>
+{
+  _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+  {
+    return __vadd2(a, b);
+  }
+};
+
+//----------------------------------------------------------------------------------------------------------------------
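+
+// Lane-level sketch (illustrative, hypothetical values): each functor treats its
+// 32-bit operands as two independent 16-bit lanes. For DpxMin<int16_t>, with
+// a = {1, -4} and b = {-2, 3} packed as halfwords, __vmins2(a, b) yields
+// {-2, -4}: a per-lane minimum computed in a single instruction.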
+
+template <typename Operator, typename T>
+struct CubOperatorToDpx
+{
+  static_assert(detail::always_false<T>(), "DPX is not supported for this operator");
+};
+
+template <typename T>
+struct CubOperatorToDpx<cub::Min, T>
+{
+  using type = DpxMin<T>;
+};
+
+template <typename T>
+struct CubOperatorToDpx<cub::Max, T>
+{
+  using type = DpxMax<T>;
+};
+
+template <typename T>
+struct CubOperatorToDpx<cub::Sum, T>
+{
+  using type = DpxSum<T>;
+};
+
+// template <typename T>
+// struct CubOperatorToDpx<::cuda::std::plus<T>, T>
+// {
+//   using type = DpxSum<T>;
+// };
+
+template <typename Operator, typename T>
+using cub_operator_to_dpx_t = typename CubOperatorToDpx<Operator, T>::type;
+
+} // namespace internal
+
 CUB_NAMESPACE_END
diff --git a/cub/cub/thread/thread_reduce.cuh b/cub/cub/thread/thread_reduce.cuh
index 7ac9836925..a956321f78 100644
--- a/cub/cub/thread/thread_reduce.cuh
+++ b/cub/cub/thread/thread_reduce.cuh
@@ -28,7 +28,7 @@
 
 /**
  * @file
- * Thread utilities for sequential reduction over statically-sized array types
+ * Thread reduction over statically-sized array-like types
  */
 
 #pragma once
@@ -43,8 +43,17 @@
 #  pragma system_header
 #endif // no system header
 
-#include <cub/detail/type_traits.cuh>
-#include <cub/thread/thread_operators.cuh>
+#include <cub/detail/type_traits.cuh> // are_same()
+#include <cub/thread/thread_operators.cuh> // cub_operator_to_dpx_t
+#include <cub/config.cuh>
+#include <nv/target> // NV_IF_TARGET
+
+#include <cuda/std/bit> // bit_cast
+#include <cuda/std/cstdint> // uint16_t
+#include <cuda/std/functional> // cuda::std::plus
+#include <cuda/std/utility> // pair
+
+// #include <functional> // std::plus
 
 CUB_NAMESPACE_BEGIN
 
@@ -52,51 +61,143 @@ CUB_NAMESPACE_BEGIN
 namespace internal
 {
 
-/**
- * @brief Sequential reduction over statically-sized array types
- *
- * @param[in] input
- *   Input array
- *
- * @param[in] reduction_op
- *   Binary reduction operator
- *
- * @param[in] prefix
- *   Prefix to seed reduction with
- */
-template <int LENGTH,
-          typename T,
-          typename ReductionOp,
-          typename PrefixT,
-          typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, T, PrefixT>>
-_CCCL_DEVICE _CCCL_FORCEINLINE AccumT
-ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix, Int2Type<LENGTH> /*length*/)
-{
-  AccumT retval = prefix;
-
-#pragma unroll
-  for (int i = 0; i < LENGTH; ++i)
-  {
-    retval = reduction_op(retval, input[i]);
-  }
-  return retval;
-}
-
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
+/// DPX instructions compute min, max, and sum for up to three 16- and 32-bit signed or unsigned integer parameters;
+/// see the DPX documentation: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#dpx
+/// NOTE: The compiler is able to automatically vectorize all cases with 3 operands.
+/// However, all other cases with per-halfword comparison need to be explicitly vectorized.
+/// TODO: Remove the DPX specialization when nvbug 4823237 is fixed.
+///
+/// DPX reduction is enabled if the following conditions are met:
+/// - Hopper+ architectures. DPX instructions are emulated before Hopper.
+/// - The number of elements must be large enough for it to pay off (see below).
+/// - All types must be the same.
+/// - Only works with integral types of 2 bytes.
+/// - DPX instructions provide Min, Max, and Sum SIMD operations.
+/// If the number of instructions is the same, we favor the compiler.
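+///
+/// For example (illustrative): 16 int16_t elements reduced with cub::Min take the
+/// DPX path on SM90+, while 4 elements, 32-bit element types, or a user-defined
+/// operator all fall back to the sequential reduction below.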
+
+// clang-format off
+template <typename Input, typename ReductionOp, typename AccumT>
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE
+constexpr bool enable_dpx_reduction()
+{
+  using T = ::cuda::std::__remove_cvref_t<decltype(::cuda::std::declval<Input>()[0])>;
+  // TODO: use a constexpr variable once C++14 is the minimum supported standard
+  using Length = ::cuda::std::integral_constant<int, detail::static_size<Input>()>;
+  return ((Length{} >= 9 && detail::are_same<ReductionOp, cub::Sum /*, ::cuda::std::plus<T>*/>()) || Length{} >= 10)
+          && detail::are_same<T, AccumT>()
+          && detail::is_one_of<T, ::cuda::std::int16_t, ::cuda::std::uint16_t>()
+          && detail::is_one_of<ReductionOp, cub::Min, cub::Max, cub::Sum /*, ::cuda::std::plus<T>*/>();
+}
+// clang-format on
+
+// Considering compiler vectorization with the 3-way comparison, the number of SASS instructions is
+// Standard: ceil((L - 3) / 2) + 1
+// replacing L with L/2 for SIMD:
+// DPX:      ceil((L/2 - 3) / 2) + 1 + 2 [for halfword comparison: PRMT, VIMNMX] + L % 2 [for the last element]
+// finally, the last two comparison operations are vectorized in a 3-way reduction:
+//           ceil((L/2 - 3) / 2) + 3
+//
+// length | Standard | DPX
+//  2     |    1     | NA
+//  3     |    1     | NA
+//  4     |    2     |  3
+//  5     |    2     |  3
+//  6     |    3     |  3
+//  7     |    3     |  3
+//  8     |    4     |  4
+//  9     |    4     |  4
+// 10     |    5     |  4 // ***
+// 11     |    5     |  4 // ***
+// 12     |    6     |  5 // ***
+// 13     |    6     |  5 // ***
+// 14     |    7     |  5 // ***
+// 15     |    7     |  5 // ***
+// 16     |    8     |  6 // ***
+
+template <typename AccumT, typename Input, typename ReductionOp>
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT
+ThreadReduceSequential(const Input& input, ReductionOp reduction_op)
+{
+  AccumT retval = input[0];
+#  pragma unroll
+  for (int i = 1; i < detail::static_size<Input>(); ++i)
+  {
+    retval = reduction_op(retval, input[i]);
+  }
+  return retval;
+}
+
+/// Specialization for DPX reduction
+template <typename Input, typename ReductionOp>
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE auto
+ThreadReduceDpx(const Input& input, ReductionOp reduction_op) -> ::cuda::std::__remove_cvref_t<decltype(input[0])>
+{
+  using T              = ::cuda::std::__remove_cvref_t<decltype(input[0])>;
+  constexpr int length = detail::static_size<Input>();
+  T array[length];
+#  pragma unroll
+  for (int i = 0; i < length; ++i)
+  {
+    array[i] = input[i];
+  }
+  using DpxReduceOp    = cub_operator_to_dpx_t<ReductionOp, T>;
+  using SimdType       = ::cuda::std::pair<T, T>;
+  auto& unsigned_input = reinterpret_cast<unsigned(&)[length / 2]>(array);
+  auto simd_reduction  = ThreadReduceSequential<unsigned>(unsigned_input, DpxReduceOp{});
+  auto simd_values     = ::cuda::std::bit_cast<SimdType>(simd_reduction);
+  auto ret_value       = reduction_op(simd_values.first, simd_values.second);
+  return (length % 2 == 0) ? ret_value : reduction_op(ret_value, input[length - 1]);
+}
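+
+// Walk-through sketch (illustrative, hypothetical values): for length == 8 and
+// cub::Min over int16_t, `array` is reinterpreted as unsigned[4], reduced to a
+// single 32-bit word via __vmins2 (per-halfword min), and the two 16-bit lanes
+// of that word are combined with the scalar reduction_op. An odd element count
+// folds input[length - 1] in as a final step.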
+
+// DPX/Sequential dispatch
+template <typename Input,
+          typename ReductionOp,
+          typename ValueT = ::cuda::std::__remove_cvref_t<decltype(::cuda::std::declval<Input>()[0])>,
+          typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, ValueT, ValueT>,
+          _CUB_TEMPLATE_REQUIRES(enable_dpx_reduction<Input, ReductionOp, AccumT>())>
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const Input& input, ReductionOp reduction_op)
+{
+  static_assert(detail::has_subscript<Input>::value, "Input must support the subscript operator[]");
+  static_assert(detail::has_size<Input>::value, "Input must have the size() method");
+  static_assert(detail::has_binary_call_operator<ReductionOp, ValueT>::value,
+                "ReductionOp must have the binary call operator: operator(ValueT, ValueT)");
+  NV_IF_TARGET(NV_PROVIDES_SM_90,
+               (return ThreadReduceDpx(input, reduction_op);),
+               (return ThreadReduceSequential<AccumT>(input, reduction_op);))
+}
+
+template <typename Input,
+          typename ReductionOp,
+          typename ValueT = ::cuda::std::__remove_cvref_t<decltype(::cuda::std::declval<Input>()[0])>,
+          typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, ValueT, ValueT>,
+          _CUB_TEMPLATE_REQUIRES(!enable_dpx_reduction<Input, ReductionOp, AccumT>())>
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const Input& input, ReductionOp reduction_op)
+{
+  static_assert(detail::has_subscript<Input>::value, "Input must support the subscript operator[]");
+  static_assert(detail::has_size<Input>::value, "Input must have the size() method");
+  static_assert(detail::has_binary_call_operator<ReductionOp, ValueT>::value,
+                "ReductionOp must have the binary call operator: operator(ValueT, ValueT)");
+  return ThreadReduceSequential<AccumT>(input, reduction_op);
+}
+
+#endif // !DOXYGEN_SHOULD_SKIP_THIS
+
@@ -105,101 +206,122 @@ ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix, Int2Type<LENGTH>
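+// Usage sketch (illustrative only; `thread_data` is a hypothetical per-thread array):
+//
+//   int16_t thread_data[16] = { /* ... */ };
+//   int16_t result = cub::internal::ThreadReduce(thread_data, cub::Min{});
+//   // DPX path on SM90+, sequential path otherwise
+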
 /**
- * @brief Perform a sequential reduction over @p LENGTH elements of the @p input array,
- *        seeded with the specified @p prefix. The aggregate is returned.
+ * @brief Reduction over statically-sized array-like types, seeded with the specified @p prefix.
  *
- * @tparam LENGTH
- *   LengthT of input array
- *
- * @tparam T
- *   [inferred] The data type to be reduced.
+ * @tparam Input
+ *   [inferred] The data type to be reduced, having member
+ *   operator[](int i); must be statically-sized (size() method or static array)
  *
  * @tparam ReductionOp
  *   [inferred] Binary reduction operator type having member
  *   T operator()(const T &a, const T &b)
  *
+ * @tparam PrefixT
+ *   [inferred] The prefix type
+ *
  * @param[in] input
  *   Input array
  *
  * @param[in] reduction_op
  *   Binary reduction operator
  *
  * @param[in] prefix
  *   Prefix to seed reduction with
+ *
+ * @return Aggregate of type cuda::std::__accumulator_t<ReductionOp, ValueT, PrefixT>
  */
-template <int LENGTH,
-          typename T,
+template <typename Input,
           typename ReductionOp,
           typename PrefixT,
-          typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, T, PrefixT>>
-_CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix)
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+          typename ValueT = ::cuda::std::__remove_cvref_t<decltype(::cuda::std::declval<Input>()[0])>,
+#endif // !DOXYGEN_SHOULD_SKIP_THIS
+          typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, ValueT, PrefixT>>
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT
+ThreadReduce(const Input& input, ReductionOp reduction_op, PrefixT prefix)
 {
-  return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>());
+  static_assert(detail::has_subscript<Input>::value, "Input must support the subscript operator[]");
+  static_assert(detail::has_size<Input>::value, "Input must have the size() method");
+  static_assert(detail::has_binary_call_operator<ReductionOp, ValueT>::value,
+                "ReductionOp must have the binary call operator: operator(ValueT, ValueT)");
+  constexpr int length = detail::static_size<Input>();
+  // copy the input into a temporary array of type AccumT
+  AccumT array[length + 1];
+  array[0] = prefix;
+#pragma unroll
+  for (int i = 0; i < length; ++i)
+  {
+    array[i + 1] = input[i];
+  }
+  return ThreadReduce(array, reduction_op);
 }
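+
+// For example (illustrative, hypothetical values): given int a[3] = {1, 2, 3},
+// ThreadReduce(a, cub::Sum{}, 10) reduces the temporary {10, 1, 2, 3} and returns 16.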
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
 /**
+ * @remark The pointer interface adds little value and requires Length to be explicit.
+ *         Prefer using the array-like interface.
+ *
- * @brief Perform a sequential reduction over @p LENGTH elements of the @p input array.
- *        The aggregate is returned.
+ * @brief Perform a sequential reduction over @p Length elements of the @p input pointer. The aggregate is returned.
  *
- * @tparam LENGTH
- *   LengthT of input array
+ * @tparam Length
+ *   Length of input pointer
  *
  * @tparam T
- *   [inferred] The data type to be reduced.
+ *   [inferred] The data type to be reduced
  *
  * @tparam ReductionOp
  *   [inferred] Binary reduction operator type having member
  *   T operator()(const T &a, const T &b)
  *
  * @param[in] input
- *   Input array
+ *   Input pointer
  *
  * @param[in] reduction_op
  *   Binary reduction operator
+ *
+ * @return Aggregate of type cuda::std::__accumulator_t<ReductionOp, T, T>
  */
-template <int LENGTH, typename T, typename ReductionOp>
-_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadReduce(T* input, ReductionOp reduction_op)
+template <int Length, typename T, typename ReductionOp, typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, T, T>>
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const T* input, ReductionOp reduction_op)
 {
-  T prefix = input[0];
-  return ThreadReduce<LENGTH - 1>(input + 1, reduction_op, prefix);
+  static_assert(Length > 0, "Length must be greater than 0");
+  static_assert(detail::has_binary_call_operator<ReductionOp, T>::value,
+                "ReductionOp must have the binary call operator: operator(V1, V2)");
+  using ArrayT = T[Length];
+  auto array   = reinterpret_cast<const ArrayT*>(input);
+  return ThreadReduce(*array, reduction_op);
 }
 
 /**
+ * @remark The pointer interface adds little value and requires Length to be explicit.
+ *         Prefer using the array-like interface.
+ *
- * @brief Perform a sequential reduction over the statically-sized @p input array,
- *        seeded with the specified @p prefix. The aggregate is returned.
+ * @brief Perform a sequential reduction over @p Length elements of the @p input pointer, seeded with the specified
+ *        @p prefix. The aggregate is returned.
  *
- * @tparam LENGTH
- *   [inferred] LengthT of @p input array
+ * @tparam Length
+ *   Length of input pointer
  *
  * @tparam T
- *   [inferred] The data type to be reduced.
+ *   [inferred] The data type to be reduced
  *
  * @tparam ReductionOp
  *   [inferred] Binary reduction operator type having member
  *   T operator()(const T &a, const T &b)
  *
+ * @tparam PrefixT
+ *   [inferred] The prefix type
+ *
  * @param[in] input
- *   Input array
+ *   Input pointer
  *
  * @param[in] reduction_op
  *   Binary reduction operator
  *
  * @param[in] prefix
  *   Prefix to seed reduction with
+ *
+ * @return Aggregate of type cuda::std::__accumulator_t<ReductionOp, T, PrefixT>
  */
-template <int LENGTH,
+template <int Length,
           typename T,
           typename ReductionOp,
           typename PrefixT,
-          typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, T, PrefixT>>
-_CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T (&input)[LENGTH], ReductionOp reduction_op, PrefixT prefix)
+          typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, T, PrefixT>,
+          _CUB_TEMPLATE_REQUIRES(Length > 0)>
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT
+ThreadReduce(const T* input, ReductionOp reduction_op, PrefixT prefix)
 {
-  return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>());
+  static_assert(detail::has_binary_call_operator<ReductionOp, T>::value,
+                "ReductionOp must have the binary call operator: operator(V1, V2)");
+  auto array = reinterpret_cast<const T(*)[Length]>(input);
+  return ThreadReduce(*array, reduction_op, prefix);
 }
 
-/**
- * @brief Serial reduction with the specified operator
- *
- * @tparam LENGTH
- *   [inferred] LengthT of @p input array
- *
- * @tparam T
- *   [inferred] The data type to be reduced.
- *
- * @tparam ReductionOp
- *   [inferred] Binary reduction operator type having member
- *   T operator()(const T &a, const T &b)
- *
- * @param[in] input
- *   Input array
- *
- * @param[in] reduction_op
- *   Binary reduction operator
- */
-template <int LENGTH, typename T, typename ReductionOp>
-_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadReduce(T (&input)[LENGTH], ReductionOp reduction_op)
+template <int Length, typename T, typename ReductionOp, typename PrefixT, _CUB_TEMPLATE_REQUIRES(Length == 0)>
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadReduce(const T*, ReductionOp, PrefixT prefix)
 {
-  return ThreadReduce((T*) input, reduction_op);
+  return prefix;
 }
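+
+// Usage sketch (illustrative only; `ptr` is a hypothetical device pointer): the
+// explicit-length pointer overloads simply forward to the array-like interface:
+//
+//   auto partial = cub::internal::ThreadReduce<8>(ptr, cub::Max{}); // reduces ptr[0..7]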
+
+#endif // !DOXYGEN_SHOULD_SKIP_THIS
+
 } // namespace internal
 
 CUB_NAMESPACE_END
diff --git a/cub/cub/warp/specializations/warp_reduce_shfl.cuh b/cub/cub/warp/specializations/warp_reduce_shfl.cuh
index 41b23e6dff..fdd4083c37 100644
--- a/cub/cub/warp/specializations/warp_reduce_shfl.cuh
+++ b/cub/cub/warp/specializations/warp_reduce_shfl.cuh
@@ -127,7 +127,7 @@ struct WarpReduceShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>
 {
   enum
   {
-    /// Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per
+    /// Whether the data type is a small (32b or less) integer for which we can use a single SHFL instruction per
     /// exchange
     IS_SMALL_UNSIGNED = (Traits<S>::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int))
   };
diff --git a/cub/test/catch2_test_device_reduce.cu b/cub/test/catch2_test_device_reduce.cu
index 290e8d8f6a..bfd7c3e8a2 100644
--- a/cub/test/catch2_test_device_reduce.cu
+++ b/cub/test/catch2_test_device_reduce.cu
@@ -24,7 +24,6 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  ******************************************************************************/
-
 #include "insert_nested_NVTX_range_guard.h"
 // above header needs to be included first
 
@@ -48,7 +47,7 @@
 DECLARE_LAUNCH_WRAPPER(cub::DeviceReduce::Max, device_max);
 DECLARE_LAUNCH_WRAPPER(cub::DeviceReduce::ArgMax, device_arg_max);
 
 // %PARAM% TEST_LAUNCH lid 0:1:2
-// %PARAM% TEST_TYPES types 0:1:2:3
+// %PARAM% TEST_TYPES types 0:1:2:3:4
 
 // List of types to test
 using custom_t =
@@ -72,9 +71,13 @@ type_pair<half_t, half_t> // testing half
 #endif
 #if TEST_BF_T
 , type_pair<bfloat16_t, bfloat16_t> // testing bf16
 #endif
   >;
 // clang-format on
+#elif TEST_TYPES == 4
+// DPX SIMD instructions
+using full_type_list = c2h::type_list<type_pair<std::int16_t, std::int16_t>, type_pair<std::uint16_t, std::uint16_t>>;
 #endif
 
 /**
@@ -124,6 +127,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", full_type_list)
   }
 
   auto d_in_it = thrust::raw_pointer_cast(in_items.data());
 
+#if TEST_TYPES != 4
   SECTION("reduce")
   {
     using op_t = cub::Sum;
@@ -145,6 +149,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", full_type_list)
     // Verify result
     REQUIRE(expected_result == out_result[0]);
   }
+#endif // TEST_TYPES != 4
 
   // Skip DeviceReduce::Sum tests for extended floating-point types because of unbounded epsilon due
   // to pseudo associativity of the addition operation over floating point numbers
@@ -197,6 +202,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", full_type_list)
     REQUIRE(expected_result == out_result[0]);
   }
 
+#if TEST_TYPES != 4
   SECTION("argmax")
   {
     // Prepare verification data
@@ -233,4 +239,5 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", full_type_list)
     REQUIRE(expected_result[0] == gpu_value);
     REQUIRE((expected_result - host_items.cbegin()) == gpu_result.key);
   }
+#endif
 }
diff --git a/docs/repo.toml b/docs/repo.toml
index 5ef5eed3b7..f4c7fa4d77 100644
--- a/docs/repo.toml
+++ b/docs/repo.toml
@@ -145,7 +145,9 @@ doxygen_predefined = [
     "_CCCL_DEVICE",
     "_CCCL_HOST_DEVICE",
     "_CCCL_FORCEINLINE",
+    "_CUB_TEMPLATE_REQUIRES(x)",
     "_CCCL_STD_VER",
+    "_CCCL_NODISCARD",
     "_CCCL_VISIBILITY_HIDDEN",
     "_CCCL_SUPPRESS_DEPRECATED_PUSH",
     "_CCCL_SUPPRESS_DEPRECATED_POP",