diff --git a/cub/test/catch2_test_device_scan_large_offsets.cu b/cub/test/catch2_test_device_scan_large_offsets.cu new file mode 100644 index 0000000000..9d00d89e14 --- /dev/null +++ b/cub/test/catch2_test_device_scan_large_offsets.cu @@ -0,0 +1,135 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + +#include + +#include + +#include "catch2_test_helper.h" +#include "catch2_test_launch_helper.h" + +// TODO(elstehle) replace with DeviceScan interface once https://github.com/NVIDIA/cccl/issues/50 is addressed +// Temporary wrapper that allows specializing the DeviceScan algorithm for different offset types +template +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_scan_wrapper( + void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + ScanOpT scan_op, + InitValueT init_value, + OffsetT num_items, + cudaStream_t stream = 0) +{ + using init_value_t = cub::detail::InputValue; + init_value_t init_value_wrapper{init_value}; + + return cub::DispatchScan::Dispatch( + d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value_wrapper, num_items, stream); +} + +DECLARE_LAUNCH_WRAPPER(dispatch_scan_wrapper, dispatch_exclusive_scan); + +// %PARAM% TEST_LAUNCH lid 0:1:2 + +// TODO(elstehle) replace with actual offset types, once https://github.com/NVIDIA/cccl/issues/50 is addresed +// List of offset types to be used for testing large number of items +using offset_types = c2h::type_list; + +template +struct expected_sum_op +{ + uint64_t segment_size; + + __host__ __device__ __forceinline__ ItemT operator()(const uint64_t index) const + { + uint64_t sum_per_full_segment = (segment_size * (segment_size - 1)) / 2; + uint64_t full_segments = index / segment_size; + uint64_t index_within_segment = index % segment_size; + + uint64_t sum_within_partial_segment = (index_within_segment * (index_within_segment - 1)) / 2; + uint64_t sum_over_full_segments = full_segments * sum_per_full_segment; + return static_cast(sum_within_partial_segment + sum_over_full_segments); + } +}; + +template +struct mod_op +{ + uint64_t segment_size; + + __host__ __device__ __forceinline__ uint64_t operator()(const uint64_t index) const + { + return static_cast(index % segment_size); + } +}; + +CUB_TEST("DeviceScan works for very large number of items", "[scan][device]", offset_types) +try +{ + using op_t = cub::Sum; + using item_t = std::uint32_t; + using index_t = std::uint64_t; + using offset_t = typename c2h::get<0, TestType>; + + // Clamp 64-bit offset type problem sizes to just slightly larger than 2^32 items + auto num_items_max_ull = + std::min(static_cast(::cuda::std::numeric_limits::max()), + ::cuda::std::numeric_limits::max() + static_cast(2000000ULL)); + offset_t num_items_max = static_cast(num_items_max_ull); + offset_t num_items_min = + num_items_max_ull > 10000 ? static_cast(num_items_max_ull - 10000ULL) : offset_t{0}; + // TODO(elstehle) remove single-item size, once https://github.com/NVIDIA/cccl/issues/50 is addresed + offset_t num_items = + GENERATE_COPY(values({num_items_max, static_cast(num_items_max - 1), static_cast(1)}), + take(2, random(num_items_min, num_items_max))); + + // Prepare input + constexpr index_t segment_size = 1000; + auto index_it = thrust::make_counting_iterator(index_t{}); + auto items_it = thrust::make_transform_iterator(index_it, mod_op{segment_size}); + + // Output memory allocation + c2h::device_vector d_items_out(num_items); + auto d_items_out_it = thrust::raw_pointer_cast(d_items_out.data()); + + // Run test + dispatch_exclusive_scan(items_it, d_items_out_it, op_t{}, item_t{}, num_items); + + // Ensure that we created the correct output + auto expected_out_it = + thrust::make_transform_iterator(index_it, expected_sum_op{static_cast(segment_size)}); + bool all_results_correct = thrust::equal(d_items_out.cbegin(), d_items_out.cend(), expected_out_it); + REQUIRE(all_results_correct == true); +} +catch (std::bad_alloc&) +{ + // Exceeding memory is not a failure. +}