Make thrust::transform use cub::DeviceTransform (#2389)
* Add transform benchmark requiring a stable address
* Make thrust::transform use cub::DeviceTransform
* Introduce address stability detection and opt-in in libcu++
* Mark lambdas in the Thrust BabelStream benchmark as address-oblivious
* Optimize cub::DeviceTransform's prefetch variant for small problems

Fixes: #2263
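
For quick orientation, the new opt-in looks roughly like this at a call site — a minimal sketch (the function and variable names are hypothetical; it mirrors the benchmark changes below and needs nvcc's extended-lambda mode, which the benchmarks already use):

#include <thrust/device_vector.h>
#include <thrust/transform.h>

#include <cuda/functional> // cuda::proclaim_copyable_arguments

template <typename T>
void scale(thrust::device_vector<T>& in, thrust::device_vector<T>& out, T s)
{
  // Wrapping the lambda proclaims that its argument may be a copy, so the
  // CUB backend is free to choose a cub::DeviceTransform kernel that does
  // not preserve the original element addresses.
  thrust::transform(in.begin(), in.end(), out.begin(),
                    cuda::proclaim_copyable_arguments([=] __device__ __host__(const T& x) {
                      return x * s;
                    }));
}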
bernhardmgruber authored Nov 6, 2024
1 parent c358bde commit c97f2e3
Showing 9 changed files with 326 additions and 32 deletions.
7 changes: 4 additions & 3 deletions cub/cub/device/dispatch/dispatch_transform.cuh
@@ -950,14 +950,15 @@ struct dispatch_t<RequiresStableAddress,
return config.error;
}

// choose items per thread to reach minimum bytes in flight
const int items_per_thread =
loaded_bytes_per_iter == 0
? +policy_t::items_per_thread_no_input
: ::cuda::ceil_div(ActivePolicy::min_bif, config->max_occupancy * block_dim * loaded_bytes_per_iter);

-// Generate at least one block per SM. This improves tiny problem sizes (e.g. 2^16 elements).
-const int items_per_thread_evenly_spread =
-static_cast<int>(::cuda::std::min(Offset{items_per_thread}, num_items / (config->sm_count * block_dim)));
+// but also generate enough blocks for full occupancy to optimize small problem sizes, e.g., 2^16 or 2^20 elements
+const int items_per_thread_evenly_spread = static_cast<int>(
+::cuda::std::min(Offset{items_per_thread}, num_items / (config->sm_count * block_dim * config->max_occupancy)));

const int items_per_thread_clamped = ::cuda::std::clamp(
items_per_thread_evenly_spread, +policy_t::min_items_per_thread, +policy_t::max_items_per_thread);
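For intuition, a hedged walk-through of the tuning math above with illustrative numbers (min_bif, block_dim, loaded_bytes_per_iter, sm_count, and max_occupancy below are assumptions, not CUB's tuned constants):

// Assume: min_bif = 65536 bytes in flight, block_dim = 256,
//         loaded_bytes_per_iter = 8, sm_count = 108, max_occupancy = 8
//
// items per thread to reach the minimum bytes in flight:
//   ceil_div(65536, 8 * 256 * 8) = ceil_div(65536, 16384) = 4
//
// evenly spreading a small problem (num_items = 2^16) over all
// concurrently resident blocks:
//   old: 65536 / (108 * 256)     = 2
//   new: 65536 / (108 * 256 * 8) = 0  -> clamped up to min_items_per_thread
//
// The extra division by max_occupancy launches enough blocks to fill every
// SM to its occupancy limit, which is what speeds up small problem sizes.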
4 changes: 4 additions & 0 deletions docs/libcudacxx/extended_api/functional.rst
@@ -18,6 +18,10 @@ Function wrapper
- Creates a forwarding call wrapper that proclaims return type
- libcu++ 1.9.0 / CCCL 2.0.0 / CUDA 11.8

* - ``cuda::proclaim_copyable_arguments``
- Creates a forwarding call wrapper that proclaims that arguments can be freely copied before an invocation of the wrapped callable
- CCCL 2.8.0

* - :ref:`cuda::get_device_address <libcudacxx-extended-api-functional-get-device-address>`
- Returns a valid address to a device object
- CCCL 2.8.0
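
A hedged sketch of the ``cuda::proclaim_copyable_arguments`` entry above in use (my_plus is a stand-in functor; the static_asserts mirror the new Thrust unit test further down):

#include <cuda/functional>

struct my_plus
{
  _CCCL_HOST_DEVICE auto operator()(int a, int b) const -> int
  {
    return a + b;
  }
};

// A plain functor proclaims nothing about its arguments...
static_assert(!cuda::proclaims_copyable_arguments<my_plus>::value, "");
// ...until it is wrapped:
static_assert(
  cuda::proclaims_copyable_arguments<decltype(cuda::proclaim_copyable_arguments(my_plus{}))>::value, "");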
65 changes: 65 additions & 0 deletions libcudacxx/include/cuda/__functional/address_stability.h
@@ -0,0 +1,65 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA___FUNCTIONAL_ADDRESS_STABILITY_H
#define _CUDA___FUNCTIONAL_ADDRESS_STABILITY_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/std/__type_traits/integral_constant.h>
#include <cuda/std/__utility/move.h>

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

//! Trait telling whether a function object type F does not rely on the memory addresses of its arguments. The nested
//! value is true when the addresses of the arguments do not matter and arguments can be provided from arbitrary copies
//! of the respective sources. This trait can be specialized for custom function object types.
//! @see proclaim_copyable_arguments
template <typename F, typename SFINAE = void>
struct proclaims_copyable_arguments : _CUDA_VSTD::false_type
{};

#if !defined(_CCCL_NO_VARIABLE_TEMPLATES)
template <typename F, typename... Args>
_CCCL_INLINE_VAR constexpr bool proclaims_copyable_arguments_v = proclaims_copyable_arguments<F, Args...>::value;
#endif // !_CCCL_NO_VARIABLE_TEMPLATES

// Wrapper for a callable to mark it as permitting copied arguments
template <typename F>
struct __callable_permitting_copied_arguments : F
{
using F::operator();
};

template <typename F>
struct proclaims_copyable_arguments<__callable_permitting_copied_arguments<F>> : _CUDA_VSTD::true_type
{};

//! Creates a new function object from an existing one, which is marked as permitting its arguments to be copies of
//! whatever source they come from. This implies that the addresses of the arguments are irrelevant to the function
//! object.
//! @see proclaims_copyable_arguments
template <typename F>
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto
proclaim_copyable_arguments(F f) -> __callable_permitting_copied_arguments<F>
{
return __callable_permitting_copied_arguments<F>{_CUDA_VSTD::move(f)};
}

_LIBCUDACXX_END_NAMESPACE_CUDA

#endif // _CUDA___FUNCTIONAL_ADDRESS_STABILITY_H
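
The doc comment above says the trait can be specialized for custom function object types; a minimal user-side sketch of that opt-in (my_scale is a hypothetical functor):

#include <cuda/__functional/address_stability.h>

struct my_scale
{
  float factor;
  __host__ __device__ float operator()(float x) const
  {
    return x * factor; // never inspects &x, so a copy of x is fine
  }
};

// Opt in once via the trait instead of wrapping every call site:
namespace cuda
{
template <>
struct proclaims_copyable_arguments<my_scale> : ::cuda::std::true_type
{};
} // namespace cuda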
1 change: 1 addition & 0 deletions libcudacxx/include/cuda/functional
@@ -21,6 +21,7 @@
# pragma system_header
#endif // no system header

#include <cuda/__functional/address_stability.h>
#include <cuda/__functional/get_device_address.h>
#include <cuda/__functional/maximum.h>
#include <cuda/__functional/minimum.h>
72 changes: 59 additions & 13 deletions thrust/benchmarks/bench/transform/basic.cu
@@ -33,6 +33,8 @@
#include <thrust/transform.h>
#include <thrust/zip_function.h>

#include <cuda/__functional/address_stability.h>

#include <nvbench_helper.cuh>

template <class InT, class OutT>
@@ -106,7 +108,7 @@ constexpr auto startC = 3; // BabelStream: 0.1
constexpr auto startScalar = 4; // BabelStream: 0.4

using element_types = nvbench::type_list<std::int8_t, std::int16_t, float, double, __int128>;
-auto array_size_powers = std::vector<std::int64_t>{25};
+auto array_size_powers = std::vector<std::int64_t>{25}; // BabelStream uses 2^25, H200 can fit 2^31

template <typename T>
static void mul(nvbench::state& state, nvbench::type_list<T>)
@@ -121,9 +123,10 @@ static void mul(nvbench::state& state, nvbench::type_list<T>)

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) {
const T scalar = startScalar;
-thrust::transform(c.begin(), c.end(), b.begin(), [=] __device__ __host__(const T& ci) {
-return ci * scalar;
-});
+thrust::transform(
+c.begin(), c.end(), b.begin(), cuda::proclaim_copyable_arguments([=] __device__ __host__(const T& ci) {
+return ci * scalar;
+}));
});
}

@@ -145,9 +148,14 @@ static void add(nvbench::state& state, nvbench::type_list<T>)
state.add_global_memory_writes<T>(n);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) {
-thrust::transform(a.begin(), a.end(), b.begin(), c.begin(), [] __device__ __host__(const T& ai, const T& bi) {
-return ai + bi;
-});
+thrust::transform(
+a.begin(),
+a.end(),
+b.begin(),
+c.begin(),
+cuda::proclaim_copyable_arguments([] _CCCL_DEVICE(const T& ai, const T& bi) -> T {
+return ai + bi;
+}));
});
}

@@ -170,9 +178,14 @@ static void triad(nvbench::state& state, nvbench::type_list<T>)

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) {
const T scalar = startScalar;
-thrust::transform(b.begin(), b.end(), c.begin(), a.begin(), [=] __device__ __host__(const T& bi, const T& ci) {
-return bi + scalar * ci;
-});
+thrust::transform(
+b.begin(),
+b.end(),
+c.begin(),
+a.begin(),
+cuda::proclaim_copyable_arguments([=] _CCCL_DEVICE(const T& bi, const T& ci) {
+return bi + scalar * ci;
+}));
});
}

@@ -199,14 +212,47 @@ static void nstream(nvbench::state& state, nvbench::type_list<T>)
thrust::make_zip_iterator(a.begin(), b.begin(), c.begin()),
thrust::make_zip_iterator(a.end(), b.end(), c.end()),
a.begin(),
-thrust::make_zip_function([=] __device__ __host__(const T& ai, const T& bi, const T& ci) {
-return ai + bi + scalar * ci;
-}));
+thrust::make_zip_function(
+cuda::proclaim_copyable_arguments([=] _CCCL_DEVICE(const T& ai, const T& bi, const T& ci) {
+return ai + bi + scalar * ci;
+})));
});
}

NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(element_types))
.set_name("nstream")
.set_type_axes_names({"T{ct}"})
.add_int64_power_of_two_axis("Elements", array_size_powers);

// variation of nstream requiring a stable parameter address because it recovers the element index
template <typename T>
static void nstream_stable(nvbench::state& state, nvbench::type_list<T>)
{
const auto n = static_cast<std::size_t>(state.get_int64("Elements"));
thrust::device_vector<T> a(n, startA);
thrust::device_vector<T> b(n, startB);
thrust::device_vector<T> c(n, startC);

const T* a_start = thrust::raw_pointer_cast(a.data());
const T* b_start = thrust::raw_pointer_cast(b.data());
const T* c_start = thrust::raw_pointer_cast(c.data());

state.add_element_count(n);
state.add_global_memory_reads<T>(3 * n);
state.add_global_memory_writes<T>(n);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) {
const T scalar = startScalar;
thrust::transform(a.begin(), a.end(), a.begin(), [=] _CCCL_DEVICE(const T& ai) {
const auto i = &ai - a_start;
return ai + b_start[i] + scalar * c_start[i];
});
});
}

NVBENCH_BENCH_TYPES(nstream_stable, NVBENCH_TYPE_AXES(element_types))
.set_name("nstream_stable")
.set_type_axes_names({"T{ct}"})
.add_int64_power_of_two_axis("Elements", array_size_powers);
} // namespace babelstream
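
Why nstream_stable must stay unwrapped — a hedged sketch of what a copying code path would break:

// The lambda recovers the element index from the argument's address:
//   const auto i = &ai - a_start;
// If the functor were wrapped in proclaim_copyable_arguments, the backend
// could pass a copy staged in registers or shared memory; &ai would then
// point at that copy, the subtraction would yield a meaningless index, and
// b_start[i] / c_start[i] would read out of bounds. Leaving the lambda
// unmarked keeps cub::DeviceTransform on its stable-address code path.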
24 changes: 24 additions & 0 deletions thrust/testing/address_stability.cu
@@ -0,0 +1,24 @@
#include <cuda/__functional/address_stability.h>

#include <unittest/unittest.h>

struct my_plus
{
_CCCL_HOST_DEVICE auto operator()(int a, int b) const -> int
{
return a + b;
}
};

void TestAddressStability()
{
using ::cuda::proclaim_copyable_arguments;
using ::cuda::proclaims_copyable_arguments;

static_assert(!proclaims_copyable_arguments<thrust::plus<int>>::value, "");
static_assert(proclaims_copyable_arguments<decltype(proclaim_copyable_arguments(thrust::plus<int>{}))>::value, "");

static_assert(!proclaims_copyable_arguments<my_plus>::value, "");
static_assert(proclaims_copyable_arguments<decltype(proclaim_copyable_arguments(my_plus{}))>::value, "");
}
DECLARE_UNITTEST(TestAddressStability);
77 changes: 77 additions & 0 deletions thrust/testing/cuda/transform.cu
@@ -344,3 +344,80 @@ void TestTransformBinaryCudaStreams()
cudaStreamDestroy(s);
}
DECLARE_UNITTEST(TestTransformBinaryCudaStreams);

struct sum_five
{
_CCCL_HOST_DEVICE auto
operator()(std::int8_t a, std::int16_t b, std::int32_t c, std::int64_t d, float e) const -> double
{
return a + b + c + d + e;
}
};

// The following test cannot be compiled because of a bug in the conversion of thrust::tuple on MSVC 2017
#ifndef _CCCL_COMPILER_MSVC_2017
// we specialize zip_function for sum_five, but do nothing in the call operator so the test below would fail if the
// zip_function is actually called (and not unwrapped)
THRUST_NAMESPACE_BEGIN
template <>
class zip_function<sum_five>
{
public:
_CCCL_HOST_DEVICE zip_function(sum_five func)
: func(func)
{}

_CCCL_HOST_DEVICE sum_five& underlying_function() const
{
return func;
}

template <typename Tuple>
_CCCL_HOST_DEVICE auto
operator()(Tuple&& t) const -> decltype(detail::zip_detail::apply(std::declval<sum_five>(), THRUST_FWD(t)))
{
// not calling func, so we would get a wrong result if we were called
return {};
}

private:
mutable sum_five func;
};
THRUST_NAMESPACE_END

// test that the cuda_cub backend of Thrust unwraps zip_iterators/zip_functions into their input streams
void TestTransformZipIteratorUnwrapping()
{
constexpr int num_items = 100;
thrust::device_vector<std::int8_t> a(num_items, 1);
thrust::device_vector<std::int16_t> b(num_items, 2);
thrust::device_vector<std::int32_t> c(num_items, 3);
thrust::device_vector<std::int64_t> d(num_items, 4);
thrust::device_vector<float> e(num_items, 5);

thrust::device_vector<double> result(num_items);
// SECTION("once") // TODO(bgruber): enable sections when we migrate to Catch2
{
const auto z = thrust::make_zip_iterator(a.begin(), b.begin(), c.begin(), d.begin(), e.begin());
thrust::transform(z, z + num_items, result.begin(), thrust::make_zip_function(sum_five{}));

// compute reference and verify
thrust::device_vector<double> reference(num_items, 1 + 2 + 3 + 4 + 5);
ASSERT_EQUAL(reference, result);
}
// SECTION("trice")
{
const auto z = thrust::make_zip_iterator(
thrust::make_zip_iterator(thrust::make_zip_iterator(a.begin(), b.begin(), c.begin(), d.begin(), e.begin())));
thrust::transform(z,
z + num_items,
result.begin(),
thrust::make_zip_function(thrust::make_zip_function(thrust::make_zip_function(sum_five{}))));

// compute reference and verify
thrust::device_vector<double> reference(num_items, 1 + 2 + 3 + 4 + 5);
ASSERT_EQUAL(reference, result);
}
}
DECLARE_UNITTEST(TestTransformZipIteratorUnwrapping);
#endif // !_CCCL_COMPILER_MSVC_2017
@@ -50,6 +50,10 @@
THRUST_NAMESPACE_BEGIN
namespace cuda_cub
{
// Need a forward declaration here to work around a cyclic include, since "cuda/detail/transform.h" includes this header
template <class Derived, class InputIt, class OutputIt, class TransformOp>
OutputIt THRUST_FUNCTION
transform(execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, TransformOp transform_op);

namespace __copy
{