From c97f2e3588403308ce0e6dd1ed7dbb863eeaa025 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 6 Nov 2024 13:47:15 +0100 Subject: [PATCH] Make `thrust::transform` use `cub::DeviceTransform` (#2389) * Add transform benchmark requiring a stable address * Make thrust::transform use cub::DeviceTransform * Introduces address stability detection and opt-in in libcu++ * Mark lambdas in Thrust BabelStream benchmark address oblivious * Optimize prefetch cub::DeviceTransform for small problems Fixes: #2263 --- .../device/dispatch/dispatch_transform.cuh | 7 +- docs/libcudacxx/extended_api/functional.rst | 4 + .../cuda/__functional/address_stability.h | 65 +++++++++++ libcudacxx/include/cuda/functional | 1 + thrust/benchmarks/bench/transform/basic.cu | 72 +++++++++--- thrust/testing/address_stability.cu | 24 ++++ thrust/testing/cuda/transform.cu | 77 +++++++++++++ .../detail/internal/copy_device_to_device.h | 4 + thrust/thrust/system/cuda/detail/transform.h | 104 +++++++++++++++--- 9 files changed, 326 insertions(+), 32 deletions(-) create mode 100644 libcudacxx/include/cuda/__functional/address_stability.h create mode 100644 thrust/testing/address_stability.cu diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh index 4094a265880..126fa75ee43 100644 --- a/cub/cub/device/dispatch/dispatch_transform.cuh +++ b/cub/cub/device/dispatch/dispatch_transform.cuh @@ -950,14 +950,15 @@ struct dispatch_tmax_occupancy * block_dim * loaded_bytes_per_iter); - // Generate at least one block per SM. This improves tiny problem sizes (e.g. 2^16 elements). - const int items_per_thread_evenly_spread = - static_cast(::cuda::std::min(Offset{items_per_thread}, num_items / (config->sm_count * block_dim))); + // but also generate enough blocks for full occupancy to optimize small problem sizes, e.g., 2^16 or 2^20 elements + const int items_per_thread_evenly_spread = static_cast( + ::cuda::std::min(Offset{items_per_thread}, num_items / (config->sm_count * block_dim * config->max_occupancy))); const int items_per_thread_clamped = ::cuda::std::clamp( items_per_thread_evenly_spread, +policy_t::min_items_per_thread, +policy_t::max_items_per_thread); diff --git a/docs/libcudacxx/extended_api/functional.rst b/docs/libcudacxx/extended_api/functional.rst index bbdcc3f4280..16a15907c62 100644 --- a/docs/libcudacxx/extended_api/functional.rst +++ b/docs/libcudacxx/extended_api/functional.rst @@ -18,6 +18,10 @@ Function wrapper - Creates a forwarding call wrapper that proclaims return type - libcu++ 1.9.0 / CCCL 2.0.0 / CUDA 11.8 + * - ``cuda::proclaim_copyable_arguments`` + - Creates a forwarding call wrapper that proclaims that arguments can be freely copied before an invocation of the wrapped callable + - CCCL 2.8.0 + * - :ref:`cuda::get_device_address ` - Returns a valid address to a device object - CCCL 2.8.0 diff --git a/libcudacxx/include/cuda/__functional/address_stability.h b/libcudacxx/include/cuda/__functional/address_stability.h new file mode 100644 index 00000000000..719a061242b --- /dev/null +++ b/libcudacxx/include/cuda/__functional/address_stability.h @@ -0,0 +1,65 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___FUNCTIONAL_ADDRESS_STABILITY_H +#define _CUDA___FUNCTIONAL_ADDRESS_STABILITY_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +//! Trait telling whether a function object type F does not rely on the memory addresses of its arguments. The nested +//! value is true when the addresses of the arguments do not matter and arguments can be provided from arbitrary copies +//! of the respective sources. This trait can be specialized for custom function objects types. +//! @see proclaim_copyable_arguments +template +struct proclaims_copyable_arguments : _CUDA_VSTD::false_type +{}; + +#if !defined(_CCCL_NO_VARIABLE_TEMPLATES) +template +_CCCL_INLINE_VAR constexpr bool proclaims_copyable_arguments_v = proclaims_copyable_arguments::value; +#endif // !_CCCL_NO_VARIABLE_TEMPLATES + +// Wrapper for a callable to mark it as permitting copied arguments +template +struct __callable_permitting_copied_arguments : F +{ + using F::operator(); +}; + +template +struct proclaims_copyable_arguments<__callable_permitting_copied_arguments> : _CUDA_VSTD::true_type +{}; + +//! Creates a new function object from an existing one, which is marked as permitting its arguments to be copies of +//! whatever source they come from. This implies that the addresses of the arguments are irrelevant to the function +//! object. +//! @see proclaims_copyable_arguments +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto +proclaim_copyable_arguments(F f) -> __callable_permitting_copied_arguments +{ + return __callable_permitting_copied_arguments{_CUDA_VSTD::move(f)}; +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___FUNCTIONAL_ADDRESS_STABILITY_H diff --git a/libcudacxx/include/cuda/functional b/libcudacxx/include/cuda/functional index 96b356f6efe..2bc74fb5a98 100644 --- a/libcudacxx/include/cuda/functional +++ b/libcudacxx/include/cuda/functional @@ -21,6 +21,7 @@ # pragma system_header #endif // no system header +#include #include #include #include diff --git a/thrust/benchmarks/bench/transform/basic.cu b/thrust/benchmarks/bench/transform/basic.cu index e2014e50808..e0b097b2f14 100644 --- a/thrust/benchmarks/bench/transform/basic.cu +++ b/thrust/benchmarks/bench/transform/basic.cu @@ -33,6 +33,8 @@ #include #include +#include + #include template @@ -106,7 +108,7 @@ constexpr auto startC = 3; // BabelStream: 0.1 constexpr auto startScalar = 4; // BabelStream: 0.4 using element_types = nvbench::type_list; -auto array_size_powers = std::vector{25}; +auto array_size_powers = std::vector{25}; // BabelStream uses 2^25, H200 can fit 2^31 template static void mul(nvbench::state& state, nvbench::type_list) @@ -121,9 +123,10 @@ static void mul(nvbench::state& state, nvbench::type_list) state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) { const T scalar = startScalar; - thrust::transform(c.begin(), c.end(), b.begin(), [=] __device__ __host__(const T& ci) { - return ci * scalar; - }); + thrust::transform( + c.begin(), c.end(), b.begin(), cuda::proclaim_copyable_arguments([=] __device__ __host__(const T& ci) { + return ci * scalar; + })); }); } @@ -145,9 +148,14 @@ static void add(nvbench::state& state, nvbench::type_list) state.add_global_memory_writes(n); state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) { - thrust::transform(a.begin(), a.end(), b.begin(), c.begin(), [] __device__ __host__(const T& ai, const T& bi) { - return ai + bi; - }); + thrust::transform( + a.begin(), + a.end(), + b.begin(), + c.begin(), + cuda::proclaim_copyable_arguments([] _CCCL_DEVICE(const T& ai, const T& bi) -> T { + return ai + bi; + })); }); } @@ -170,9 +178,14 @@ static void triad(nvbench::state& state, nvbench::type_list) state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) { const T scalar = startScalar; - thrust::transform(b.begin(), b.end(), c.begin(), a.begin(), [=] __device__ __host__(const T& bi, const T& ci) { - return bi + scalar * ci; - }); + thrust::transform( + b.begin(), + b.end(), + c.begin(), + a.begin(), + cuda::proclaim_copyable_arguments([=] _CCCL_DEVICE(const T& bi, const T& ci) { + return bi + scalar * ci; + })); }); } @@ -199,9 +212,11 @@ static void nstream(nvbench::state& state, nvbench::type_list) thrust::make_zip_iterator(a.begin(), b.begin(), c.begin()), thrust::make_zip_iterator(a.end(), b.end(), c.end()), a.begin(), - thrust::make_zip_function([=] __device__ __host__(const T& ai, const T& bi, const T& ci) { - return ai + bi + scalar * ci; - })); + + thrust::make_zip_function( + cuda::proclaim_copyable_arguments([=] _CCCL_DEVICE(const T& ai, const T& bi, const T& ci) { + return ai + bi + scalar * ci; + }))); }); } @@ -209,4 +224,35 @@ NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(element_types)) .set_name("nstream") .set_type_axes_names({"T{ct}"}) .add_int64_power_of_two_axis("Elements", array_size_powers); + +// variation of nstream requiring a stable parameter address because it recovers the element index +template +static void nstream_stable(nvbench::state& state, nvbench::type_list) +{ + const auto n = static_cast(state.get_int64("Elements")); + thrust::device_vector a(n, startA); + thrust::device_vector b(n, startB); + thrust::device_vector c(n, startC); + + const T* a_start = thrust::raw_pointer_cast(a.data()); + const T* b_start = thrust::raw_pointer_cast(b.data()); + const T* c_start = thrust::raw_pointer_cast(c.data()); + + state.add_element_count(n); + state.add_global_memory_reads(3 * n); + state.add_global_memory_writes(n); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) { + const T scalar = startScalar; + thrust::transform(a.begin(), a.end(), a.begin(), [=] _CCCL_DEVICE(const T& ai) { + const auto i = &ai - a_start; + return ai + b_start[i] + scalar * c_start[i]; + }); + }); +} + +NVBENCH_BENCH_TYPES(nstream_stable, NVBENCH_TYPE_AXES(element_types)) + .set_name("nstream_stable") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements", array_size_powers); } // namespace babelstream diff --git a/thrust/testing/address_stability.cu b/thrust/testing/address_stability.cu new file mode 100644 index 00000000000..b9a4fc5f0ba --- /dev/null +++ b/thrust/testing/address_stability.cu @@ -0,0 +1,24 @@ +#include + +#include + +struct my_plus +{ + _CCCL_HOST_DEVICE auto operator()(int a, int b) const -> int + { + return a + b; + } +}; + +void TestAddressStability() +{ + using ::cuda::proclaim_copyable_arguments; + using ::cuda::proclaims_copyable_arguments; + + static_assert(!proclaims_copyable_arguments>::value, ""); + static_assert(proclaims_copyable_arguments{}))>::value, ""); + + static_assert(!proclaims_copyable_arguments::value, ""); + static_assert(proclaims_copyable_arguments::value, ""); +} +DECLARE_UNITTEST(TestAddressStability); diff --git a/thrust/testing/cuda/transform.cu b/thrust/testing/cuda/transform.cu index bd9f8006db1..888264ffce2 100644 --- a/thrust/testing/cuda/transform.cu +++ b/thrust/testing/cuda/transform.cu @@ -344,3 +344,80 @@ void TestTransformBinaryCudaStreams() cudaStreamDestroy(s); } DECLARE_UNITTEST(TestTransformBinaryCudaStreams); + +struct sum_five +{ + _CCCL_HOST_DEVICE auto + operator()(std::int8_t a, std::int16_t b, std::int32_t c, std::int64_t d, float e) const -> double + { + return a + b + c + d + e; + } +}; + +// The following test cannot be compiled because of a bug in the conversion of thrust::tuple on MSVC 2017 +#ifndef _CCCL_COMPILER_MSVC_2017 +// we specialize zip_function for sum_five, but do nothing in the call operator so the test below would fail if the +// zip_function is actually called (and not unwrapped) +THRUST_NAMESPACE_BEGIN +template <> +class zip_function +{ +public: + _CCCL_HOST_DEVICE zip_function(sum_five func) + : func(func) + {} + + _CCCL_HOST_DEVICE sum_five& underlying_function() const + { + return func; + } + + template + _CCCL_HOST_DEVICE auto + operator()(Tuple&& t) const -> decltype(detail::zip_detail::apply(std::declval(), THRUST_FWD(t))) + { + // not calling func, so we would get a wrong result if we were called + return {}; + } + +private: + mutable sum_five func; +}; +THRUST_NAMESPACE_END + +// test that the cuda_cub backend of Thrust unwraps zip_iterators/zip_functions into their input streams +void TestTransformZipIteratorUnwrapping() +{ + constexpr int num_items = 100; + thrust::device_vector a(num_items, 1); + thrust::device_vector b(num_items, 2); + thrust::device_vector c(num_items, 3); + thrust::device_vector d(num_items, 4); + thrust::device_vector e(num_items, 5); + + thrust::device_vector result(num_items); + // SECTION("once") // TODO(bgruber): enable sections when we migrate to Catch2 + { + const auto z = thrust::make_zip_iterator(a.begin(), b.begin(), c.begin(), d.begin(), e.begin()); + thrust::transform(z, z + num_items, result.begin(), thrust::make_zip_function(sum_five{})); + + // compute reference and verify + thrust::device_vector reference(num_items, 1 + 2 + 3 + 4 + 5); + ASSERT_EQUAL(reference, result); + } + // SECTION("trice") + { + const auto z = thrust::make_zip_iterator( + thrust::make_zip_iterator(thrust::make_zip_iterator(a.begin(), b.begin(), c.begin(), d.begin(), e.begin()))); + thrust::transform(z, + z + num_items, + result.begin(), + thrust::make_zip_function(thrust::make_zip_function(thrust::make_zip_function(sum_five{})))); + + // compute reference and verify + thrust::device_vector reference(num_items, 1 + 2 + 3 + 4 + 5); + ASSERT_EQUAL(reference, result); + } +} +DECLARE_UNITTEST(TestTransformZipIteratorUnwrapping); +#endif // !_CCCL_COMPILER_MSVC_2017 diff --git a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h index 3f4f148f5d4..75e04d58306 100644 --- a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h +++ b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h @@ -50,6 +50,10 @@ THRUST_NAMESPACE_BEGIN namespace cuda_cub { +// Need a forward declaration here to work around a cyclic include, since "cuda/detail/transform.h" includes this header +template +OutputIt THRUST_FUNCTION +transform(execution_policy& policy, InputIt first, InputIt last, OutputIt result, TransformOp transform_op); namespace __copy { diff --git a/thrust/thrust/system/cuda/detail/transform.h b/thrust/thrust/system/cuda/detail/transform.h index 1926fb62473..d1f84241205 100644 --- a/thrust/thrust/system/cuda/detail/transform.h +++ b/thrust/thrust/system/cuda/detail/transform.h @@ -39,26 +39,32 @@ #ifdef _CCCL_CUDA_COMPILER # include -# include +# include + # include +# include +# include # include # include +# include + +# include + +# include THRUST_NAMESPACE_BEGIN namespace cuda_cub { - namespace __transform { - struct no_stencil_tag {}; struct always_true_predicate { template - bool THRUST_DEVICE_FUNCTION operator()(T const&) const + constexpr bool THRUST_DEVICE_FUNCTION operator()(T const&) const { return true; } @@ -235,6 +241,63 @@ OutputIt _CCCL_HOST_DEVICE inline binary( return result + num_items; } +_CCCL_EXEC_CHECK_DISABLE +template +OutputIt THRUST_FUNCTION cub_transform_many( + execution_policy& policy, + ::cuda::std::tuple firsts, + OutputIt result, + Offset num_items, + TransformOp transform_op) +{ + if (num_items == 0) + { + return result; + } + + constexpr auto requires_stable_address = !::cuda::proclaims_copyable_arguments::value; + + cudaError_t status; + THRUST_INDEX_TYPE_DISPATCH( + status, + (cub::detail::transform::dispatch_t, + OutputIt, + TransformOp>::dispatch), + num_items, + (firsts, result, num_items_fixed, transform_op, cuda_cub::stream(policy))); + throw_on_error(status, "transform: failed inside CUB"); + + status = cuda_cub::synchronize_optional(policy); + throw_on_error(status, "transform: failed to synchronize"); + + return result + num_items; +} + +template +THRUST_FUNCTION auto +convert_to_std_tuple(tuple t, ::cuda::std::index_sequence) -> ::cuda::std::tuple +{ + return ::cuda::std::tuple{get(t)...}; +} + +// unwrap zip_iterator and zip_function into their underlying iterators so cub::DeviceTransform can optimize them +template +OutputIt THRUST_FUNCTION cub_transform_many( + execution_policy& policy, + ::cuda::std::tuple>> firsts, + OutputIt result, + Offset num_items, + zip_function transform_op) +{ + return cub_transform_many( + policy, + convert_to_std_tuple(get<0>(firsts).get_iterator_tuple(), ::cuda::std::index_sequence_for{}), + result, + num_items, + transform_op.underlying_function()); +} } // namespace __transform //------------------------- @@ -276,7 +339,15 @@ template OutputIt THRUST_FUNCTION transform(execution_policy& policy, InputIt first, InputIt last, OutputIt result, TransformOp transform_op) { - return cuda_cub::transform_if(policy, first, last, result, transform_op, __transform::always_true_predicate()); + THRUST_CDP_DISPATCH( + (using size_type = typename iterator_traits::difference_type; + const auto num_items = static_cast(thrust::distance(first, last)); + return __transform::cub_transform_many(policy, ::cuda::std::make_tuple(first), result, num_items, transform_op);), + (while (first != last) { + *result = transform_op(raw_reference_cast(*first)); + ++first; + ++result; + } return result;)); } // func transform //------------------------- @@ -314,17 +385,18 @@ OutputIt THRUST_FUNCTION transform( OutputIt result, TransformOp transform_op) { - return cuda_cub::transform_if( - policy, - first1, - last1, - first2, - __transform::no_stencil_tag(), - result, - transform_op, - __transform::always_true_predicate()); -} // func transform - + THRUST_CDP_DISPATCH( + (using size_type = typename iterator_traits::difference_type; + const auto num_items = static_cast(thrust::distance(first1, last1)); + return __transform::cub_transform_many( + policy, ::cuda::std::make_tuple(first1, first2), result, num_items, transform_op);), + (while (first1 != last1) { + *result = transform_op(raw_reference_cast(*first1), raw_reference_cast(*first2)); + ++first1; + ++first2; + ++result; + } return result;)); +} } // namespace cuda_cub THRUST_NAMESPACE_END