diff --git a/cub/cub/agent/agent_unique_by_key.cuh b/cub/cub/agent/agent_unique_by_key.cuh index a9dbc9efc7..3194fe11e2 100644 --- a/cub/cub/agent/agent_unique_by_key.cuh +++ b/cub/cub/agent/agent_unique_by_key.cuh @@ -302,6 +302,9 @@ struct AgentUniqueByKey CTA_SYNC(); + // Preventing loop unrolling helps avoid perf degradation when switching from signed to unsigned 32-bit offset + // types + #pragma unroll(1) for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS) @@ -626,7 +629,9 @@ struct AgentUniqueByKey { // Blocks are launched in increasing order, so just assign one tile per block int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index - OffsetT tile_offset = tile_idx * ITEMS_PER_TILE; // Global offset for the current tile + + // Global offset for the current tile + OffsetT tile_offset = static_cast(tile_idx) * static_cast(ITEMS_PER_TILE); if (tile_idx < num_tiles - 1) { diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh index 36c0576707..1292f04ad1 100644 --- a/cub/cub/device/device_select.cuh +++ b/cub/cub/device/device_select.cuh @@ -42,13 +42,14 @@ # pragma system_header #endif // no system header -#include -#include - +#include #include #include #include +#include +#include + CUB_NAMESPACE_BEGIN @@ -837,6 +838,165 @@ struct DeviceSelect stream); } + //! @rst + //! Given an input sequence ``d_keys_in`` and ``d_values_in`` with runs of key-value pairs with consecutive + //! equal-valued keys, only the first key and its value from each run is selectively copied + //! to ``d_keys_out`` and ``d_values_out``. + //! The total number of items selected is written to ``d_num_selected_out``. + //! + //! - The user-provided equality operator, `equality_op`, is used to determine whether keys are equivalent + //! - Copies of the selected items are compacted into ``d_out`` and maintain + //! their original relative ordering. + //! - In-place operations are not supported. There must be no overlap between + //! any of the provided ranges: + //! + //! - ``[d_keys_in, d_keys_in + num_items)`` + //! - ``[d_keys_out, d_keys_out + *d_num_selected_out)`` + //! - ``[d_values_in, d_values_in + num_items)`` + //! - ``[d_values_out, d_values_out + *d_num_selected_out)`` + //! - ``[d_num_selected_out, d_num_selected_out + 1)`` + //! + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! int num_items; // e.g., 8 + //! int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + //! int *d_values_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + //! int *d_keys_out; // e.g., [ , , , , , , , ] + //! int *d_values_out; // e.g., [ , , , , , , , ] + //! int *d_num_selected_out; // e.g., [ ] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSelect::UniqueByKey( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_values_in, + //! d_keys_out, d_values_out, d_num_selected_out, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run selection + //! cub::DeviceSelect::UniqueByKey( + //! d_temp_storage, temp_storage_bytes, + //! 
d_keys_in, d_values_in, + //! d_keys_out, d_values_out, d_num_selected_out, num_items); + //! + //! // d_keys_out <-- [0, 2, 9, 5, 8] + //! // d_values_out <-- [1, 2, 4, 5, 8] + //! // d_num_selected_out <-- [5] + //! + //! @endrst + //! + //! @tparam KeyInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input keys @iterator + //! + //! @tparam ValueInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input values @iterator + //! + //! @tparam KeyOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing selected keys @iterator + //! + //! @tparam ValueOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing selected values @iterator + //! + //! @tparam NumSelectedIteratorT + //! **[inferred]** Output iterator type for recording the number of items selected @iterator + //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! + //! @tparam EqualityOpT + //! **[inferred]** Type of equality_op + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Pointer to the input sequence of keys + //! + //! @param[in] d_values_in + //! Pointer to the input sequence of values + //! + //! @param[out] d_keys_out + //! Pointer to the output sequence of selected keys + //! + //! @param[out] d_values_out + //! Pointer to the output sequence of selected values + //! + //! @param[out] d_num_selected_out + //! Pointer to the total number of items selected (i.e., length of `d_keys_out` or `d_values_out`) + //! + //! @param[in] num_items + //! Total number of input items (i.e., length of `d_keys_in` or `d_values_in`) + //! + //! @param[in] equality_op + //! Binary predicate to determine equality + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst + template + CUB_RUNTIME_FUNCTION __forceinline__ static // + typename ::cuda::std::enable_if< // + !::cuda::std::is_convertible::value, // + cudaError_t>::type + UniqueByKey( + void* d_temp_storage, + size_t& temp_storage_bytes, + KeyInputIteratorT d_keys_in, + ValueInputIteratorT d_values_in, + KeyOutputIteratorT d_keys_out, + ValueOutputIteratorT d_values_out, + NumSelectedIteratorT d_num_selected_out, + NumItemsT num_items, + EqualityOpT equality_op, + cudaStream_t stream = 0) + { + using OffsetT = typename detail::ChooseOffsetT::Type; + + return DispatchUniqueByKey< + KeyInputIteratorT, + ValueInputIteratorT, + KeyOutputIteratorT, + ValueOutputIteratorT, + NumSelectedIteratorT, + EqualityOpT, + OffsetT>::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_keys_out, + d_values_out, + d_num_selected_out, + equality_op, + static_cast(num_items), + stream); + } + //! @rst //! Given an input sequence ``d_keys_in`` and ``d_values_in`` with runs of key-value pairs with consecutive //! equal-valued keys, only the first key and its value from each run is selectively copied @@ -914,6 +1074,9 @@ struct DeviceSelect //! @tparam NumSelectedIteratorT //! **[inferred]** Output iterator type for recording the number of items selected @iterator //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! //! @param[in] d_temp_storage //! 
Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -947,56 +1110,49 @@ struct DeviceSelect typename ValueInputIteratorT, typename KeyOutputIteratorT, typename ValueOutputIteratorT, - typename NumSelectedIteratorT> - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t - UniqueByKey(void *d_temp_storage, - size_t &temp_storage_bytes, - KeyInputIteratorT d_keys_in, - ValueInputIteratorT d_values_in, - KeyOutputIteratorT d_keys_out, - ValueOutputIteratorT d_values_out, - NumSelectedIteratorT d_num_selected_out, - int num_items, - cudaStream_t stream = 0) + typename NumSelectedIteratorT, + typename NumItemsT> + CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t UniqueByKey( + void* d_temp_storage, + size_t& temp_storage_bytes, + KeyInputIteratorT d_keys_in, + ValueInputIteratorT d_values_in, + KeyOutputIteratorT d_keys_out, + ValueOutputIteratorT d_values_out, + NumSelectedIteratorT d_num_selected_out, + NumItemsT num_items, + cudaStream_t stream = 0) { - using OffsetT = int; - using EqualityOp = Equality; - - return DispatchUniqueByKey::Dispatch(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_keys_out, - d_values_out, - d_num_selected_out, - EqualityOp(), - num_items, - stream); + return UniqueByKey( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_keys_out, + d_values_out, + d_num_selected_out, + num_items, + Equality{}, + stream); } template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t - UniqueByKey(void *d_temp_storage, - size_t &temp_storage_bytes, - KeyInputIteratorT d_keys_in, - ValueInputIteratorT d_values_in, - KeyOutputIteratorT d_keys_out, - ValueOutputIteratorT d_values_out, - NumSelectedIteratorT d_num_selected_out, - int num_items, - cudaStream_t stream, - bool debug_synchronous) + typename NumSelectedIteratorT, + typename NumItemsT> + CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t UniqueByKey( + void* d_temp_storage, + size_t& temp_storage_bytes, + KeyInputIteratorT d_keys_in, + ValueInputIteratorT d_values_in, + KeyOutputIteratorT d_keys_out, + ValueOutputIteratorT d_values_out, + NumSelectedIteratorT d_num_selected_out, + NumItemsT num_items, + cudaStream_t stream, + bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG @@ -1004,15 +1160,17 @@ struct DeviceSelect ValueInputIteratorT, KeyOutputIteratorT, ValueOutputIteratorT, - NumSelectedIteratorT>(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_keys_out, - d_values_out, - d_num_selected_out, - num_items, - stream); + NumSelectedIteratorT, + NumItemsT>( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_keys_out, + d_values_out, + d_num_selected_out, + num_items, + stream); } }; diff --git a/cub/test/catch2_test_device_select_unique_by_key.cu b/cub/test/catch2_test_device_select_unique_by_key.cu index d829b9dc12..36dc0ca46f 100644 --- a/cub/test/catch2_test_device_select_unique_by_key.cu +++ b/cub/test/catch2_test_device_select_unique_by_key.cu @@ -81,6 +81,23 @@ struct index_to_huge_type_op_t } }; +template +struct index_to_value_t +{ + template + __host__ __device__ __forceinline__ ValueT operator()(IndexT index) + { + if (static_cast(index) == 4300000000ULL) + { + return static_cast(1); + } + else + { + return static_cast(0); + } + } +}; + 
DECLARE_LAUNCH_WRAPPER(cub::DeviceSelect::UniqueByKey, select_unique_by_key); // %PARAM% TEST_LAUNCH lid 0:1:2 @@ -215,15 +232,27 @@ CUB_TEST("DeviceSelect::UniqueByKey does not change input", "[device][select_uni REQUIRE(reference_vals == vals_in); } +template struct project_first { + EqualityOpT equality_op; template __host__ __device__ bool operator()(const Tuple& lhs, const Tuple& rhs) const { - return thrust::get<0>(lhs) == thrust::get<0>(rhs); + return equality_op(thrust::get<0>(lhs), thrust::get<0>(rhs)); } }; +template +struct custom_equality_op +{ + T div_val; + __host__ __device__ __forceinline__ bool operator()(const T& lhs, const T& rhs) const + { + return (lhs / div_val) == (rhs / div_val); + } +}; + CUB_TEST("DeviceSelect::UniqueByKey works with iterators", "[device][select_unique_by_key]", all_types) { using type = typename c2h::get<0, TestType>; @@ -253,7 +282,7 @@ CUB_TEST("DeviceSelect::UniqueByKey works with iterators", "[device][select_uniq thrust::host_vector reference_vals = vals_in; const auto zip_begin = thrust::make_zip_iterator(reference_keys.begin(), reference_vals.begin()); const auto zip_end = thrust::make_zip_iterator(reference_keys.end(), reference_vals.end()); - const auto boundary = std::unique(zip_begin, zip_end, project_first{}); + const auto boundary = std::unique(zip_begin, zip_end, project_first{cub::Equality{}}); REQUIRE((boundary - zip_begin) == num_selected_out[0]); keys_out.resize(num_selected_out[0]); @@ -293,7 +322,7 @@ CUB_TEST("DeviceSelect::UniqueByKey works with pointers", "[device][select_uniqu thrust::host_vector reference_vals = vals_in; const auto zip_begin = thrust::make_zip_iterator(reference_keys.begin(), reference_vals.begin()); const auto zip_end = thrust::make_zip_iterator(reference_keys.end(), reference_vals.end()); - const auto boundary = std::unique(zip_begin, zip_end, project_first{}); + const auto boundary = std::unique(zip_begin, zip_end, project_first{cub::Equality{}}); REQUIRE((boundary - zip_begin) == num_selected_out[0]); keys_out.resize(num_selected_out[0]); @@ -346,7 +375,7 @@ CUB_TEST("DeviceSelect::UniqueByKey works with a different output type", "[devic thrust::host_vector reference_vals = vals_in; const auto zip_begin = thrust::make_zip_iterator(reference_keys.begin(), reference_vals.begin()); const auto zip_end = thrust::make_zip_iterator(reference_keys.end(), reference_vals.end()); - const auto boundary = std::unique(zip_begin, zip_end, project_first{}); + const auto boundary = std::unique(zip_begin, zip_end, project_first{cub::Equality{}}); REQUIRE((boundary - zip_begin) == num_selected_out[0]); keys_out.resize(num_selected_out[0]); @@ -392,7 +421,7 @@ CUB_TEST("DeviceSelect::UniqueByKey works and uses vsmem for large types", const auto zip_begin = thrust::make_zip_iterator(reference_keys.begin(), reference_vals.begin()); const auto zip_end = thrust::make_zip_iterator(reference_keys.end(), reference_vals.end()); - const auto boundary = std::unique(zip_begin, zip_end, project_first{}); + const auto boundary = std::unique(zip_begin, zip_end, project_first{cub::Equality{}}); REQUIRE((boundary - zip_begin) == num_selected_out[0]); keys_out.resize(num_selected_out[0]); @@ -402,3 +431,105 @@ CUB_TEST("DeviceSelect::UniqueByKey works and uses vsmem for large types", REQUIRE(reference_keys == keys_out); REQUIRE(reference_vals == vals_out); } + +CUB_TEST("DeviceSelect::UniqueByKey works for very large input that need 64-bit offset types", + "[device][select_unique_by_key]") +{ + using type = std::int32_t; + using 
index_type = std::int64_t; + + const std::size_t num_items = 4400000000ULL; + thrust::host_vector reference_keys{static_cast(0), static_cast(1), static_cast(0)}; + thrust::host_vector reference_values{0, 4300000000ULL, 4300000001ULL}; + + auto keys_in = thrust::make_transform_iterator(thrust::make_counting_iterator(0ULL), index_to_value_t{}); + auto values_in = thrust::make_counting_iterator(0ULL); + thrust::device_vector keys_out(reference_keys.size()); + thrust::device_vector values_out(reference_values.size()); + + // Needs to be device accessible + thrust::device_vector num_selected_out(1, 0); + int* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); + + // Run test + select_unique_by_key(keys_in, values_in, keys_out.begin(), values_out.begin(), d_first_num_selected_out, num_items); + + // Ensure that we created the correct output + REQUIRE(reference_keys.size() == static_cast(num_selected_out[0])); + REQUIRE(reference_keys == keys_out); + REQUIRE(reference_values == values_out); +} + +CUB_TEST("DeviceSelect::UniqueByKey works for very large outputs that needs 64-bit offset types", + "[device][select_unique_by_key]") +{ + using type = std::int32_t; + using index_type = std::int64_t; + + constexpr std::size_t num_items = 4400000000ULL; + + auto keys_in = thrust::make_counting_iterator(0ULL); + auto values_in = thrust::make_counting_iterator(0ULL); + + // Needs to be device accessible + thrust::device_vector num_selected_out(1, 0); + index_type* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); + + // Run test + select_unique_by_key( + keys_in, + values_in, + thrust::make_discard_iterator(), + thrust::make_discard_iterator(), + d_first_num_selected_out, + num_items); + + // Ensure that we created the correct output + REQUIRE(num_items == static_cast(num_selected_out[0])); +} + +CUB_TEST("DeviceSelect::UniqueByKey works with a custom equality operator", + "[device][select_unique_by_key]") +{ + using type = std::int32_t; + using custom_op_t = custom_equality_op; + using val_type = std::uint64_t; + using index_type = std::int64_t; + + const int num_items = GENERATE_COPY(take(2, random(1, 1000000))); + auto keys_in = thrust::make_counting_iterator(static_cast(0)); + auto values_in = thrust::make_counting_iterator(0ULL); + thrust::device_vector keys_out(num_items); + thrust::device_vector vals_out(num_items); + + // Needs to be device accessible + thrust::device_vector num_selected_out(1, 0); + index_type* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); + + // Run test + select_unique_by_key( + keys_in, + values_in, + keys_out.begin(), + vals_out.begin(), + d_first_num_selected_out, + num_items, + custom_op_t{static_cast(8)}); + + // Ensure that we create the same output as std + thrust::host_vector reference_keys(num_items); + thrust::host_vector reference_vals(num_items); + thrust::copy(keys_in, keys_in + num_items, reference_keys.begin()); + thrust::copy(values_in, values_in + num_items, reference_vals.begin()); + const auto zip_begin = thrust::make_zip_iterator(reference_keys.begin(), reference_vals.begin()); + const auto zip_end = thrust::make_zip_iterator(reference_keys.end(), reference_vals.end()); + const auto boundary = std::unique(zip_begin, zip_end, project_first{custom_op_t{static_cast(8)}}); + REQUIRE((boundary - zip_begin) == static_cast(num_selected_out[0])); + + keys_out.resize(num_selected_out[0]); + vals_out.resize(num_selected_out[0]); + reference_keys.resize(num_selected_out[0]); + 
reference_vals.resize(num_selected_out[0]); + REQUIRE(reference_keys == keys_out); + REQUIRE(reference_vals == vals_out); +} \ No newline at end of file diff --git a/cub/test/test_util.h b/cub/test/test_util.h index 10fb460117..00a4412c86 100644 --- a/cub/test/test_util.h +++ b/cub/test/test_util.h @@ -1509,6 +1509,18 @@ struct HugeDataType } } + __device__ __host__ HugeDataType& operator=(const HugeDataType& rhs) + { + if (this != &rhs) + { + for (int i = 0; i < ELEMENTS_PER_OBJECT; i++) + { + data[i] = rhs.data[i]; + } + } + return *this; + } + int data[ELEMENTS_PER_OBJECT]; }; @@ -1555,3 +1567,22 @@ __device__ __host__ bool operator!=(const HugeDataType& lhs return false; } + + +template +std::ostream& +operator<<(std::ostream& os, +const HugeDataType& val) +{ + os << '('; + for (int i = 0; i < ELEMENTS_PER_OBJECT; i++) + { + os << CoutCast(val.data[i]); + if (i < ELEMENTS_PER_OBJECT - 1) + { + os << ','; + } + } + os << ')'; + return os; +} diff --git a/thrust/testing/unique_by_key.cu b/thrust/testing/unique_by_key.cu index 76073e0ca3..e6e0dd0af6 100644 --- a/thrust/testing/unique_by_key.cu +++ b/thrust/testing/unique_by_key.cu @@ -4,6 +4,22 @@ #include #include +template +struct index_to_value_t +{ + template + __host__ __device__ __forceinline__ ValueT operator()(IndexT index) + { + if (static_cast(index) == 4300000000ULL) + { + return static_cast(1); + } + else + { + return static_cast(0); + } + } +}; template @@ -337,6 +353,8 @@ struct TestUniqueCopyByKey }; VariableUnitTest TestUniqueCopyByKeyInstance; + + template struct TestUniqueCopyByKeyToDiscardIterator { @@ -436,3 +454,62 @@ struct TestUniqueCopyByKeyToDiscardIterator }; VariableUnitTest TestUniqueCopyByKeyToDiscardIteratorInstance; +template +struct TestUniqueCopyByKeyLargeInput +{ + void operator()() + { + using type = K; + using index_type = std::int64_t; + + const std::size_t num_items = 4400000000ULL; + thrust::host_vector reference_keys{static_cast(0), static_cast(1), static_cast(0)}; + thrust::host_vector reference_values{0, 4300000000ULL, 4300000001ULL}; + + auto keys_in = thrust::make_transform_iterator(thrust::make_counting_iterator(0ULL), index_to_value_t{}); + auto values_in = thrust::make_counting_iterator(0ULL); + thrust::device_vector keys_out(reference_keys.size()); + thrust::device_vector values_out(reference_values.size()); + + // Run test + const auto selected_aut_end = thrust::unique_by_key_copy( + keys_in, keys_in + num_items, values_in, keys_out.begin(), values_out.begin()); + + // Ensure that we created the correct output + auto const num_selected_out = thrust::distance(keys_out.begin(), selected_aut_end.first); + ASSERT_EQUAL(reference_keys.size(), static_cast(num_selected_out)); + ASSERT_EQUAL(num_selected_out, thrust::distance(values_out.begin(), selected_aut_end.second)); + keys_out.resize(num_selected_out); + values_out.resize(num_selected_out); + ASSERT_EQUAL(reference_keys, keys_out); + ASSERT_EQUAL(reference_values, values_out); + } +}; +SimpleUnitTest TestUniqueCopyByKeyLargeInputInstance; + +template +struct TestUniqueCopyByKeyLargeOutCount +{ + void operator()() + { + using type = std::int32_t; + using index_type = std::int64_t; + + constexpr std::size_t num_items = 4400000000ULL; + + auto keys_in = thrust::make_counting_iterator(0ULL); + auto values_in = thrust::make_counting_iterator(0ULL); + + // Run test + auto keys_out = thrust::make_discard_iterator(); + auto values_out = thrust::make_discard_iterator(); + const auto selected_aut_end = thrust::unique_by_key_copy(thrust::device, + 
keys_in, keys_in + num_items, values_in, keys_out, values_out); + + // Ensure that we created the correct output + auto const num_selected_out = thrust::distance(keys_out, selected_aut_end.first); + ASSERT_EQUAL(num_items, static_cast(num_selected_out)); + ASSERT_EQUAL(num_selected_out, thrust::distance(values_out, selected_aut_end.second)); + } +}; +SimpleUnitTest TestUniqueCopyByKeyLargeOutCountInstance; diff --git a/thrust/thrust/system/cuda/detail/unique_by_key.h b/thrust/thrust/system/cuda/detail/unique_by_key.h index 570d0d7139..30a1855c11 100644 --- a/thrust/thrust/system/cuda/detail/unique_by_key.h +++ b/thrust/thrust/system/cuda/detail/unique_by_key.h @@ -38,890 +38,272 @@ #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include +# include +# include + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include THRUST_NAMESPACE_BEGIN -template -_CCCL_HOST_DEVICE thrust::pair -unique_by_key( - const thrust::detail::execution_policy_base &exec, - ForwardIterator1 keys_first, - ForwardIterator1 keys_last, - ForwardIterator2 values_first); +template +_CCCL_HOST_DEVICE thrust::pair unique_by_key( + const thrust::detail::execution_policy_base& exec, + ForwardIterator1 keys_first, + ForwardIterator1 keys_last, + ForwardIterator2 values_first); template -_CCCL_HOST_DEVICE thrust::pair -unique_by_key_copy( - const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_result, - OutputIterator2 values_result); - - -namespace cuda_cub { - -// XXX it should be possible to unify unique & unique_by_key into a single -// agent with various specializations, similar to what is done -// with partition -namespace __unique_by_key { - - template - struct PtxPolicy - { - enum - { - BLOCK_THREADS = _BLOCK_THREADS, - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, - ITEMS_PER_TILE = _BLOCK_THREADS * _ITEMS_PER_THREAD, - }; - static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; - static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static const cub::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; - }; // struct PtxPolicy - - template - struct Tuning; - - namespace mpl = thrust::detail::mpl::math; - - template - struct items_per_thread - { - enum - { - value = mpl::min< - int, - static_cast(NOMINAL_4B_ITEMS_PER_THREAD), - mpl::max(NOMINAL_4B_ITEMS_PER_THREAD * 4 / - sizeof(T))>::value>::value - }; - }; - - - template - struct Tuning - { - const static int INPUT_SIZE = sizeof(T); - enum - { - NOMINAL_4B_ITEMS_PER_THREAD = 11, - // - ITEMS_PER_THREAD = items_per_thread::value - }; - - typedef PtxPolicy<64, - ITEMS_PER_THREAD, - cub::BLOCK_LOAD_WARP_TRANSPOSE, - cub::LOAD_LDG, - cub::BLOCK_SCAN_WARP_SCANS> - type; - }; // Tuning for sm52 - - template - struct Tuning - { - const static int INPUT_SIZE = sizeof(T); - enum - { - NOMINAL_4B_ITEMS_PER_THREAD = 9, - // - ITEMS_PER_THREAD = items_per_thread::value - }; - - typedef PtxPolicy<128, - ITEMS_PER_THREAD, - cub::BLOCK_LOAD_WARP_TRANSPOSE, - cub::LOAD_LDG, - cub::BLOCK_SCAN_WARP_SCANS> - type; - }; // Tuning for sm35 - - template - struct Tuning - { - const static int INPUT_SIZE = sizeof(T); - enum - { - NOMINAL_4B_ITEMS_PER_THREAD = 7, - // - ITEMS_PER_THREAD = 
items_per_thread::value - }; - - typedef PtxPolicy<128, - ITEMS_PER_THREAD, - cub::BLOCK_LOAD_WARP_TRANSPOSE, - cub::LOAD_DEFAULT, - cub::BLOCK_SCAN_WARP_SCANS> - type; - }; // Tuning for sm30 - - template - struct UniqueByKeyAgent - { - typedef typename iterator_traits::value_type key_type; - typedef typename iterator_traits::value_type value_type; - - typedef cub::ScanTileState ScanTileState; - - template - struct PtxPlan : Tuning::type - { - typedef Tuning tuning; - - typedef typename core::LoadIterator::type KeyLoadIt; - typedef typename core::LoadIterator::type ValLoadIt; - - typedef typename core::BlockLoad::type BlockLoadKeys; - typedef typename core::BlockLoad::type BlockLoadValues; - - typedef cub::BlockDiscontinuity - BlockDiscontinuityKeys; - - typedef cub::TilePrefixCallbackOp - TilePrefixCallback; - typedef cub::BlockScan - BlockScan; - - typedef core::uninitialized_array - shared_keys_t; - typedef core::uninitialized_array - shared_values_t; - - union TempStorage - { - struct ScanStorage - { - typename BlockScan::TempStorage scan; - typename TilePrefixCallback::TempStorage prefix; - typename BlockDiscontinuityKeys::TempStorage discontinuity; - } scan_storage; - - typename BlockLoadKeys::TempStorage load_keys; - typename BlockLoadValues::TempStorage load_values; - - shared_keys_t shared_keys; - shared_values_t shared_values; - }; // union TempStorage - }; // struct PtxPlan - - typedef typename core::specialize_plan_msvc10_war::type::type ptx_plan; - - typedef typename ptx_plan::KeyLoadIt KeyLoadIt; - typedef typename ptx_plan::ValLoadIt ValLoadIt; - typedef typename ptx_plan::BlockLoadKeys BlockLoadKeys; - typedef typename ptx_plan::BlockLoadValues BlockLoadValues; - typedef typename ptx_plan::BlockDiscontinuityKeys BlockDiscontinuityKeys; - typedef typename ptx_plan::TilePrefixCallback TilePrefixCallback; - typedef typename ptx_plan::BlockScan BlockScan; - typedef typename ptx_plan::TempStorage TempStorage; - typedef typename ptx_plan::shared_keys_t shared_keys_t; - typedef typename ptx_plan::shared_values_t shared_values_t; - - enum - { - BLOCK_THREADS = ptx_plan::BLOCK_THREADS, - ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD, - ITEMS_PER_TILE = ptx_plan::ITEMS_PER_TILE - }; - - struct impl - { - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - TempStorage & temp_storage; - ScanTileState & tile_state; - KeyLoadIt keys_in; - ValLoadIt values_in; - KeyOutputIt keys_out; - ValOutputIt values_out; - cub::InequalityWrapper predicate; - Size num_items; - - //--------------------------------------------------------------------- - // Utility functions - //--------------------------------------------------------------------- - - struct key_tag {}; - struct value_tag {}; - - THRUST_DEVICE_FUNCTION - shared_keys_t &get_shared(key_tag) - { - return temp_storage.shared_keys; - } - THRUST_DEVICE_FUNCTION - shared_values_t &get_shared(value_tag) - { - return temp_storage.shared_values; - } - - - template - void THRUST_DEVICE_FUNCTION - scatter(Tag tag, - OutputIt items_out, - T (&items)[ITEMS_PER_THREAD], - Size (&selection_flags)[ITEMS_PER_THREAD], - Size (&selection_indices)[ITEMS_PER_THREAD], - int /*num_tile_items*/, - int num_tile_selections, - Size num_selections_prefix, - Size /*num_selections*/) - { - using core::sync_threadblock; - -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int local_scatter_offset = selection_indices[ITEM] - 
- num_selections_prefix; - if (selection_flags[ITEM]) - { - get_shared(tag)[local_scatter_offset] = items[ITEM]; - } - } - - sync_threadblock(); - - for (int item = threadIdx.x; - item < num_tile_selections; - item += BLOCK_THREADS) - { - items_out[num_selections_prefix + item] = get_shared(tag)[item]; - } - - sync_threadblock(); - } - - //--------------------------------------------------------------------- - // Tile processing - //--------------------------------------------------------------------- - - template - Size THRUST_DEVICE_FUNCTION - consume_tile_impl(int num_tile_items, - int tile_idx, - Size tile_base) - { - using core::sync_threadblock; - - key_type keys[ITEMS_PER_THREAD]; - Size selection_flags[ITEMS_PER_THREAD]; - Size selection_idx[ITEMS_PER_THREAD]; - - if (IS_LAST_TILE) - { - // Fill last elements with the first element - // because collectives are not suffix guarded - BlockLoadKeys(temp_storage.load_keys) - .Load(keys_in + tile_base, - keys, - num_tile_items, - *(keys_in + tile_base)); - } - else - { - BlockLoadKeys(temp_storage.load_keys).Load(keys_in + tile_base, keys); - } - - - sync_threadblock(); - - value_type values[ITEMS_PER_THREAD]; - if (IS_LAST_TILE) - { - // Fill last elements with the first element - // because collectives are not suffix guarded - BlockLoadValues(temp_storage.load_values) - .Load(values_in + tile_base, - values, - num_tile_items, - *(values_in + tile_base)); - } - else - { - BlockLoadValues(temp_storage.load_values) - .Load(values_in + tile_base, values); - } - - sync_threadblock(); - - if (IS_FIRST_TILE) - { - BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity) - .FlagHeads(selection_flags, keys, predicate); - } - else - { - key_type tile_predecessor = keys_in[tile_base - 1]; - BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity) - .FlagHeads(selection_flags, keys, predicate, tile_predecessor); - } -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - // Set selection_flags for out-of-bounds items - if ((IS_LAST_TILE) && (Size(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items)) - selection_flags[ITEM] = 1; - } - - sync_threadblock(); - - - Size num_tile_selections = 0; - Size num_selections = 0; - Size num_selections_prefix = 0; - if (IS_FIRST_TILE) - { - BlockScan(temp_storage.scan_storage.scan) - .ExclusiveSum(selection_flags, - selection_idx, - num_tile_selections); - - if (threadIdx.x == 0) - { - // Update tile status if this is not the last tile - if (!IS_LAST_TILE) - tile_state.SetInclusive(0, num_tile_selections); - } - - // Do not count any out-of-bounds selections - if (IS_LAST_TILE) - { - int num_discount = ITEMS_PER_TILE - num_tile_items; - num_tile_selections -= num_discount; - } - num_selections = num_tile_selections; - } - else - { - TilePrefixCallback prefix_cb(tile_state, - temp_storage.scan_storage.prefix, - cub::Sum(), - tile_idx); - BlockScan(temp_storage.scan_storage.scan) - .ExclusiveSum(selection_flags, - selection_idx, - prefix_cb); - - num_selections = prefix_cb.GetInclusivePrefix(); - num_tile_selections = prefix_cb.GetBlockAggregate(); - num_selections_prefix = prefix_cb.GetExclusivePrefix(); - - if (IS_LAST_TILE) - { - int num_discount = ITEMS_PER_TILE - num_tile_items; - num_tile_selections -= num_discount; - num_selections -= num_discount; - } - } - - sync_threadblock(); - - scatter(key_tag(), - keys_out, - keys, - selection_flags, - selection_idx, - num_tile_items, - num_tile_selections, - num_selections_prefix, - num_selections); - - sync_threadblock(); - - 
scatter(value_tag(), - values_out, - values, - selection_flags, - selection_idx, - num_tile_items, - num_tile_selections, - num_selections_prefix, - num_selections); - - return num_selections; - } - - - template - Size THRUST_DEVICE_FUNCTION - consume_tile(int num_tile_items, - int tile_idx, - Size tile_base) - { - if (tile_idx == 0) - { - return consume_tile_impl(num_tile_items, - tile_idx, - tile_base); - } - else - { - return consume_tile_impl(num_tile_items, - tile_idx, - tile_base); - } - } - - //--------------------------------------------------------------------- - // Constructor - //--------------------------------------------------------------------- - - THRUST_DEVICE_FUNCTION - impl(TempStorage & temp_storage_, - ScanTileState & tile_state_, - KeyLoadIt keys_in_, - ValLoadIt values_in_, - KeyOutputIt keys_out_, - ValOutputIt values_out_, - BinaryPred binary_pred_, - Size num_items_, - int num_tiles, - NumSelectedOutIt num_selected_out) - // filed ctors - : temp_storage(temp_storage_), - tile_state(tile_state_), - keys_in(keys_in_), - values_in(values_in_), - keys_out(keys_out_), - values_out(values_out_), - predicate(binary_pred_), - num_items(num_items_) - { - int tile_idx = blockIdx.x; - Size tile_base = tile_idx * ITEMS_PER_TILE; - - if (tile_idx < num_tiles - 1) - { - consume_tile(ITEMS_PER_TILE, - tile_idx, - tile_base); - } - else - { - int num_remaining = static_cast(num_items - tile_base); - Size num_selections = consume_tile(num_remaining, - tile_idx, - tile_base); - if (threadIdx.x == 0) - { - *num_selected_out = num_selections; - } - } - } - }; // struct impl - - //--------------------------------------------------------------------- - // Agent entry point - //--------------------------------------------------------------------- - - THRUST_AGENT_ENTRY(KeyInputIt keys_in, - ValInputIt values_in, - KeyOutputIt keys_out, - ValOutputIt values_out, - BinaryPred binary_pred, - NumSelectedOutIt num_selected_out, - Size num_items, - ScanTileState tile_state, - int num_tiles, - char * shmem) - { - TempStorage &storage = *reinterpret_cast(shmem); - - impl(storage, - tile_state, - core::make_load_iterator(ptx_plan(), keys_in), - core::make_load_iterator(ptx_plan(), values_in), - keys_out, - values_out, - binary_pred, - num_items, - num_tiles, - num_selected_out); - } - }; // struct UniqueByKeyAgent +_CCCL_HOST_DEVICE thrust::pair unique_by_key_copy( + const thrust::detail::execution_policy_base& exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_result, + OutputIterator2 values_result); + +namespace cuda_cub +{ +namespace detail +{ - template - struct InitAgent +template +struct DispatchUniqueByKey +{ + static cudaError_t THRUST_RUNTIME_FUNCTION dispatch( + execution_policy& policy, + void* d_temp_storage, + size_t& temp_storage_bytes, + KeyInputIt keys_in, + ValInputIt values_in, + KeyOutputIt keys_out, + ValOutputIt values_out, + OffsetT num_items, + BinaryPred binary_pred, + pair& result_end) { - template - struct PtxPlan : PtxPolicy<128> {}; - - typedef core::specialize_plan ptx_plan; + cudaError_t status = cudaSuccess; + cudaStream_t stream = cuda_cub::stream(policy); + size_t allocation_sizes[2] = {0, sizeof(OffsetT)}; + void* allocations[2] = {nullptr, nullptr}; + + // Query algorithm memory requirements + status = cub::DeviceSelect::UniqueByKey( + nullptr, + allocation_sizes[0], + keys_in, + values_in, + keys_out, + values_out, + static_cast(nullptr), + num_items, + stream); + CUDA_CUB_RET_IF_FAIL(status); 
- //--------------------------------------------------------------------- - // Agent entry point - //--------------------------------------------------------------------- + status = cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); + CUDA_CUB_RET_IF_FAIL(status); - THRUST_AGENT_ENTRY(ScanTileState tile_state, - Size num_tiles, - NumSelectedIt num_selected_out, - char * /*shmem*/) + // Return if we're only querying temporary storage requirements + if (d_temp_storage == nullptr) { - tile_state.InitializeStatus(num_tiles); - if (blockIdx.x == 0 && threadIdx.x == 0) - *num_selected_out = 0; + return status; } - }; // struct InitAgent - - - template - static cudaError_t THRUST_RUNTIME_FUNCTION - doit_step(void * d_temp_storage, - size_t & temp_storage_bytes, - KeyInputIt keys_in, - ValInputIt values_in, - KeyOutputIt keys_out, - ValOutputIt values_out, - BinaryPred binary_pred, - NumSelectedOutIt num_selected_out, - Size num_items, - cudaStream_t stream) - { - using core::AgentLauncher; - using core::AgentPlan; - using core::get_agent_plan; - - typedef AgentLauncher< - UniqueByKeyAgent > - unique_agent; - - typedef typename unique_agent::ScanTileState ScanTileState; - - typedef AgentLauncher< - InitAgent > - init_agent; - - using core::get_plan; - typename get_plan::type init_plan = init_agent::get_plan(); - typename get_plan::type unique_plan = unique_agent::get_plan(stream); - - - int tile_size = unique_plan.items_per_tile; - size_t num_tiles = cub::DivideAndRoundUp(num_items, tile_size); - - size_t vshmem_size = core::vshmem_size(unique_plan.shared_memory_size, - num_tiles); - - cudaError_t status = cudaSuccess; - size_t allocation_sizes[2] = {0, vshmem_size}; - status = ScanTileState::AllocationSize(static_cast(num_tiles), allocation_sizes[0]); - CUDA_CUB_RET_IF_FAIL(status); - - void *allocations[2] = {NULL, NULL}; - // - status = cub::AliasTemporaries(d_temp_storage, - temp_storage_bytes, - allocations, - allocation_sizes); - CUDA_CUB_RET_IF_FAIL(status); - - if (d_temp_storage == NULL) + // Return for empty problems + if (num_items == 0) { + result_end = thrust::make_pair(keys_out, values_out); return status; } - ScanTileState tile_status; - status = tile_status.Init(static_cast(num_tiles), allocations[0], allocation_sizes[0]); + // Memory allocation for the number of selected output items + OffsetT* d_num_selected_out = thrust::detail::aligned_reinterpret_cast(allocations[1]); + + // Run algorithm + status = cub::DeviceSelect::UniqueByKey( + allocations[0], + allocation_sizes[0], + keys_in, + values_in, + keys_out, + values_out, + d_num_selected_out, + num_items, + binary_pred, + stream); CUDA_CUB_RET_IF_FAIL(status); - num_tiles = max(1,num_tiles); - init_agent ia(init_plan, num_tiles, stream, "unique_by_key::init_agent"); - ia.launch(tile_status, num_tiles, num_selected_out); - CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError()); - - if (num_items == 0) { return status; } - - char *vshmem_ptr = vshmem_size > 0 ? 
(char *)allocations[1] : NULL; - - unique_agent ua(unique_plan, num_items, stream, vshmem_ptr, "unique_by_key::unique_agent"); - ua.launch(keys_in, - values_in, - keys_out, - values_out, - binary_pred, - num_selected_out, - num_items, - tile_status, - num_tiles); - CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError()); - return status; - } - - template - THRUST_RUNTIME_FUNCTION - pair - unique_by_key(execution_policy& policy, - KeyInputIt keys_first, - KeyInputIt keys_last, - ValInputIt values_first, - KeyOutputIt keys_result, - ValOutputIt values_result, - BinaryPred binary_pred) - { - - typedef int size_type; - - size_type num_items - = static_cast(thrust::distance(keys_first, keys_last)); - - size_t temp_storage_bytes = 0; - cudaStream_t stream = cuda_cub::stream(policy); - - cudaError_t status; - status = __unique_by_key::doit_step(NULL, - temp_storage_bytes, - keys_first, - values_first, - keys_result, - values_result, - binary_pred, - reinterpret_cast(NULL), - num_items, - stream); - cuda_cub::throw_on_error(status, "unique_by_key: failed on 1st step"); - - size_t allocation_sizes[2] = {sizeof(size_type), temp_storage_bytes}; - void * allocations[2] = {NULL, NULL}; - - size_t storage_size = 0; - status = core::alias_storage(NULL, - storage_size, - allocations, - allocation_sizes); - cuda_cub::throw_on_error(status, "unique_by_key failed on 1st alias_storage"); - - // Allocate temporary storage. - thrust::detail::temporary_array - tmp(policy, storage_size); - void *ptr = static_cast(tmp.data().get()); - - status = core::alias_storage(ptr, - storage_size, - allocations, - allocation_sizes); - cuda_cub::throw_on_error(status, "unique_by_key failed on 2nd alias_storage"); - - size_type* d_num_selected_out - = thrust::detail::aligned_reinterpret_cast(allocations[0]); - - status = __unique_by_key::doit_step(allocations[1], - temp_storage_bytes, - keys_first, - values_first, - keys_result, - values_result, - binary_pred, - d_num_selected_out, - num_items, - stream); - cuda_cub::throw_on_error(status, "unique_by_key: failed on 2nd step"); - + // Get number of selected items status = cuda_cub::synchronize(policy); - cuda_cub::throw_on_error(status, "unique_by_key: failed to synchronize"); - - size_type num_selected = get_value(policy, d_num_selected_out); + CUDA_CUB_RET_IF_FAIL(status); + OffsetT num_selected = get_value(policy, d_num_selected_out); - return thrust::make_pair( - keys_result + num_selected, - values_result + num_selected - ); + result_end = thrust::make_pair(keys_out + num_selected, values_out + num_selected); + return status; } +}; + +template +THRUST_RUNTIME_FUNCTION pair unique_by_key( + execution_policy& policy, + KeyInputIt keys_first, + KeyInputIt keys_last, + ValInputIt values_first, + KeyOutputIt keys_result, + ValOutputIt values_result, + BinaryPred binary_pred) +{ + using size_type = typename iterator_traits::difference_type; + + size_type num_items = static_cast(thrust::distance(keys_first, keys_last)); + pair result_end{}; + cudaError_t status = cudaSuccess; + size_t temp_storage_bytes = 0; + + // 32-bit offset-type dispatch + using dispatch32_t = + DispatchUniqueByKey; + + // 64-bit offset-type dispatch + using dispatch64_t = + DispatchUniqueByKey; + + // Query temporary storage requirements + THRUST_INDEX_TYPE_DISPATCH2( + status, + dispatch32_t::dispatch, + dispatch64_t::dispatch, + num_items, + (policy, + nullptr, + temp_storage_bytes, + keys_first, + values_first, + keys_result, + values_result, + num_items_fixed, + binary_pred, + result_end)); + 
cuda_cub::throw_on_error(status, "unique_by_key: failed on 1st step"); + + // Allocate temporary storage. + thrust::detail::temporary_array tmp(policy, temp_storage_bytes); + void* temp_storage = static_cast(tmp.data().get()); + + // Run algorithm + THRUST_INDEX_TYPE_DISPATCH2( + status, + dispatch32_t::dispatch, + dispatch64_t::dispatch, + num_items, + (policy, + temp_storage, + temp_storage_bytes, + keys_first, + values_first, + keys_result, + values_result, + num_items_fixed, + binary_pred, + result_end)); + cuda_cub::throw_on_error(status, "unique_by_key: failed on 2nd step"); + + return result_end; +} -} // namespace __unique_by_key - +} // namespace detail //------------------------- // Thrust API entry points //------------------------- - - _CCCL_EXEC_CHECK_DISABLE -template -pair _CCCL_HOST_DEVICE -unique_by_key_copy(execution_policy &policy, - KeyInputIt keys_first, - KeyInputIt keys_last, - ValInputIt values_first, - KeyOutputIt keys_result, - ValOutputIt values_result, - BinaryPred binary_pred) +template +pair _CCCL_HOST_DEVICE unique_by_key_copy( + execution_policy& policy, + KeyInputIt keys_first, + KeyInputIt keys_last, + ValInputIt values_first, + KeyOutputIt keys_result, + ValOutputIt values_result, + BinaryPred binary_pred) { auto ret = thrust::make_pair(keys_result, values_result); THRUST_CDP_DISPATCH( - (ret = __unique_by_key::unique_by_key(policy, - keys_first, - keys_last, - values_first, - keys_result, - values_result, - binary_pred);), - (ret = thrust::unique_by_key_copy(cvt_to_seq(derived_cast(policy)), - keys_first, - keys_last, - values_first, - keys_result, - values_result, - binary_pred);)); + (ret = detail::unique_by_key(policy, keys_first, keys_last, values_first, keys_result, values_result, binary_pred);), + (ret = thrust::unique_by_key_copy( + cvt_to_seq(derived_cast(policy)), keys_first, keys_last, values_first, keys_result, values_result, binary_pred);)); return ret; } -template -pair _CCCL_HOST_DEVICE -unique_by_key_copy(execution_policy &policy, - KeyInputIt keys_first, - KeyInputIt keys_last, - ValInputIt values_first, - KeyOutputIt keys_result, - ValOutputIt values_result) +template +pair _CCCL_HOST_DEVICE unique_by_key_copy( + execution_policy& policy, + KeyInputIt keys_first, + KeyInputIt keys_last, + ValInputIt values_first, + KeyOutputIt keys_result, + ValOutputIt values_result) { typedef typename iterator_traits::value_type key_type; - return cuda_cub::unique_by_key_copy(policy, - keys_first, - keys_last, - values_first, - keys_result, - values_result, - equal_to()); + return cuda_cub::unique_by_key_copy( + policy, keys_first, keys_last, values_first, keys_result, values_result, equal_to()); } -template -pair _CCCL_HOST_DEVICE -unique_by_key(execution_policy &policy, - KeyInputIt keys_first, - KeyInputIt keys_last, - ValInputIt values_first, - BinaryPred binary_pred) +template +pair _CCCL_HOST_DEVICE unique_by_key( + execution_policy& policy, + KeyInputIt keys_first, + KeyInputIt keys_last, + ValInputIt values_first, + BinaryPred binary_pred) { auto ret = thrust::make_pair(keys_first, values_first); THRUST_CDP_DISPATCH( - (ret = cuda_cub::unique_by_key_copy(policy, - keys_first, - keys_last, - values_first, - keys_first, - values_first, - binary_pred);), - (ret = thrust::unique_by_key(cvt_to_seq(derived_cast(policy)), - keys_first, - keys_last, - values_first, - binary_pred);)); + (ret = cuda_cub::unique_by_key_copy( + policy, keys_first, keys_last, values_first, keys_first, values_first, binary_pred);), + (ret = 
thrust::unique_by_key(cvt_to_seq(derived_cast(policy)), keys_first, keys_last, values_first, binary_pred);));
   return ret;
 }
 
-template <class Derived,
-          class KeyInputIt,
-          class ValInputIt>
+template <class Derived, class KeyInputIt, class ValInputIt>
 pair<KeyInputIt, ValInputIt> _CCCL_HOST_DEVICE
-unique_by_key(execution_policy<Derived> &policy,
-              KeyInputIt keys_first,
-              KeyInputIt keys_last,
-              ValInputIt values_first)
+unique_by_key(execution_policy<Derived>& policy, KeyInputIt keys_first, KeyInputIt keys_last, ValInputIt values_first)
 {
   typedef typename iterator_traits<KeyInputIt>::value_type key_type;
-  return cuda_cub::unique_by_key(policy,
-                                 keys_first,
-                                 keys_last,
-                                 values_first,
-                                 equal_to<key_type>());
+  return cuda_cub::unique_by_key(policy, keys_first, keys_last, values_first, equal_to<key_type>());
 }
-
-
-}    // namespace cuda_cub
+} // namespace cuda_cub
 
 THRUST_NAMESPACE_END
 
-#include <thrust/memory.h>
-#include <thrust/unique.h>
+# include <thrust/memory.h>
+# include <thrust/unique.h>
 
 #endif
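
For reference, below is a minimal usage sketch (not part of the patch) of the overload this change introduces: cub::DeviceSelect::UniqueByKey called with a user-provided equality operator and a 64-bit item count, which routes dispatch through the 64-bit offset type. The modular_equality_op functor, the pointer names, and the run_unique_by_key() wrapper are illustrative assumptions only; error handling is reduced to early returns.

#include <cstddef>

#include <cuda_runtime.h>

#include <cub/device/device_select.cuh>

// Keys whose integer division by `mod` matches are treated as equal,
// so only the first key/value pair of each "bucket" run survives.
struct modular_equality_op
{
  int mod;
  __host__ __device__ bool operator()(int lhs, int rhs) const
  {
    return (lhs / mod) == (rhs / mod);
  }
};

cudaError_t run_unique_by_key(
  int* d_keys_in,
  long long* d_values_in,
  int* d_keys_out,
  long long* d_values_out,
  long long* d_num_selected_out,
  std::size_t num_items, // 64-bit count selects the 64-bit offset path
  cudaStream_t stream)
{
  void* d_temp_storage          = nullptr;
  std::size_t temp_storage_bytes = 0;

  // First call: query temporary storage requirements.
  cudaError_t status = cub::DeviceSelect::UniqueByKey(
    d_temp_storage, temp_storage_bytes,
    d_keys_in, d_values_in, d_keys_out, d_values_out,
    d_num_selected_out, num_items, modular_equality_op{8}, stream);
  if (status != cudaSuccess) { return status; }

  status = cudaMalloc(&d_temp_storage, temp_storage_bytes);
  if (status != cudaSuccess) { return status; }

  // Second call: run the selection with the custom equality operator.
  status = cub::DeviceSelect::UniqueByKey(
    d_temp_storage, temp_storage_bytes,
    d_keys_in, d_values_in, d_keys_out, d_values_out,
    d_num_selected_out, num_items, modular_equality_op{8}, stream);

  cudaFree(d_temp_storage);
  return status;
}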