From c97f2e3588403308ce0e6dd1ed7dbb863eeaa025 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Wed, 6 Nov 2024 13:47:15 +0100
Subject: [PATCH] Make `thrust::transform` use `cub::DeviceTransform` (#2389)

* Add transform benchmark requiring a stable address
* Make thrust::transform use cub::DeviceTransform
* Introduces address stability detection and opt-in in libcu++
* Mark lambdas in Thrust BabelStream benchmark address oblivious
* Optimize prefetch cub::DeviceTransform for small problems

Fixes: #2263
---
 .../device/dispatch/dispatch_transform.cuh    |   7 +-
 docs/libcudacxx/extended_api/functional.rst   |   4 +
 .../cuda/__functional/address_stability.h     |  65 +++++++++++
 libcudacxx/include/cuda/functional            |   1 +
 thrust/benchmarks/bench/transform/basic.cu    |  72 +++++++++---
 thrust/testing/address_stability.cu           |  24 ++++
 thrust/testing/cuda/transform.cu              |  77 +++++++++++++
 .../detail/internal/copy_device_to_device.h   |   4 +
 thrust/thrust/system/cuda/detail/transform.h  | 104 +++++++++++++++---
 9 files changed, 326 insertions(+), 32 deletions(-)
 create mode 100644 libcudacxx/include/cuda/__functional/address_stability.h
 create mode 100644 thrust/testing/address_stability.cu
diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh
index 4094a265880..126fa75ee43 100644
--- a/cub/cub/device/dispatch/dispatch_transform.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform.cuh
@@ -950,14 +950,15 @@ struct dispatch_t<RequiresStableAddress,
       return config.error;
     }
 
+    // choose items per thread to reach minimum bytes in flight
     const int items_per_thread =
       loaded_bytes_per_iter == 0
         ? +policy_t::items_per_thread_no_input
         : ::cuda::ceil_div(ActivePolicy::min_bif, config->max_occupancy * block_dim * loaded_bytes_per_iter);
 
-    // Generate at least one block per SM. This improves tiny problem sizes (e.g. 2^16 elements).
-    const int items_per_thread_evenly_spread =
-      static_cast<int>(::cuda::std::min(Offset{items_per_thread}, num_items / (config->sm_count * block_dim)));
+    // but also generate enough blocks for full occupancy to optimize small problem sizes, e.g., 2^16 or 2^20 elements
+    const int items_per_thread_evenly_spread = static_cast<int>(
+      ::cuda::std::min(Offset{items_per_thread}, num_items / (config->sm_count * block_dim * config->max_occupancy)));
 
     const int items_per_thread_clamped = ::cuda::std::clamp(
       items_per_thread_evenly_spread, +policy_t::min_items_per_thread, +policy_t::max_items_per_thread);
diff --git a/docs/libcudacxx/extended_api/functional.rst b/docs/libcudacxx/extended_api/functional.rst
index bbdcc3f4280..16a15907c62 100644
--- a/docs/libcudacxx/extended_api/functional.rst
+++ b/docs/libcudacxx/extended_api/functional.rst
@@ -18,6 +18,10 @@ Function wrapper
      - Creates a forwarding call wrapper that proclaims return type
      - libcu++ 1.9.0 / CCCL 2.0.0 / CUDA 11.8
 
+   * - ``cuda::proclaim_copyable_arguments``
+     - Creates a forwarding call wrapper that proclaims that arguments can be freely copied before an invocation of the wrapped callable
+     - CCCL 2.8.0
+
    * - :ref:`cuda::get_device_address <libcudacxx-extended-api-functional-get-device-address>`
      - Returns a valid address to a device object
      - CCCL 2.8.0
diff --git a/libcudacxx/include/cuda/__functional/address_stability.h b/libcudacxx/include/cuda/__functional/address_stability.h
new file mode 100644
index 00000000000..719a061242b
--- /dev/null
+++ b/libcudacxx/include/cuda/__functional/address_stability.h
@@ -0,0 +1,65 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___FUNCTIONAL_ADDRESS_STABILITY_H
+#define _CUDA___FUNCTIONAL_ADDRESS_STABILITY_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__type_traits/integral_constant.h>
+#include <cuda/std/__utility/move.h>
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
+
+//! Trait telling whether a function object type F does not rely on the memory addresses of its arguments. The nested
+//! value is true when the addresses of the arguments do not matter and arguments can be provided from arbitrary copies
+//! of the respective sources. This trait can be specialized for custom function objects types.
+//! @see proclaim_copyable_arguments
+template <typename F, typename SFINAE = void>
+struct proclaims_copyable_arguments : _CUDA_VSTD::false_type
+{};
+
+#if !defined(_CCCL_NO_VARIABLE_TEMPLATES)
+template <typename F, typename... Args>
+_CCCL_INLINE_VAR constexpr bool proclaims_copyable_arguments_v = proclaims_copyable_arguments<F, Args...>::value;
+#endif // !_CCCL_NO_VARIABLE_TEMPLATES
+
+// Wrapper for a callable to mark it as permitting copied arguments
+template <typename F>
+struct __callable_permitting_copied_arguments : F
+{
+  using F::operator();
+};
+
+template <typename F>
+struct proclaims_copyable_arguments<__callable_permitting_copied_arguments<F>> : _CUDA_VSTD::true_type
+{};
+
+//! Creates a new function object from an existing one, which is marked as permitting its arguments to be copies of
+//! whatever source they come from. This implies that the addresses of the arguments are irrelevant to the function
+//! object.
+//! @see proclaims_copyable_arguments
+template <typename F>
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto
+proclaim_copyable_arguments(F f) -> __callable_permitting_copied_arguments<F>
+{
+  return __callable_permitting_copied_arguments<F>{_CUDA_VSTD::move(f)};
+}
+
+_LIBCUDACXX_END_NAMESPACE_CUDA
+
+#endif // _CUDA___FUNCTIONAL_ADDRESS_STABILITY_H
diff --git a/libcudacxx/include/cuda/functional b/libcudacxx/include/cuda/functional
index 96b356f6efe..2bc74fb5a98 100644
--- a/libcudacxx/include/cuda/functional
+++ b/libcudacxx/include/cuda/functional
@@ -21,6 +21,7 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/__functional/address_stability.h>
 #include <cuda/__functional/get_device_address.h>
 #include <cuda/__functional/maximum.h>
 #include <cuda/__functional/minimum.h>
diff --git a/thrust/benchmarks/bench/transform/basic.cu b/thrust/benchmarks/bench/transform/basic.cu
index e2014e50808..e0b097b2f14 100644
--- a/thrust/benchmarks/bench/transform/basic.cu
+++ b/thrust/benchmarks/bench/transform/basic.cu
@@ -33,6 +33,8 @@
 #include <thrust/transform.h>
 #include <thrust/zip_function.h>
 
+#include <cuda/__functional/address_stability.h>
+
 #include <nvbench_helper.cuh>
 
 template <class InT, class OutT>
@@ -106,7 +108,7 @@ constexpr auto startC      = 3; // BabelStream: 0.1
 constexpr auto startScalar = 4; // BabelStream: 0.4
 
 using element_types    = nvbench::type_list<std::int8_t, std::int16_t, float, double, __int128>;
-auto array_size_powers = std::vector<std::int64_t>{25};
+auto array_size_powers = std::vector<std::int64_t>{25}; // BabelStream uses 2^25, H200 can fit 2^31
 
 template <typename T>
 static void mul(nvbench::state& state, nvbench::type_list<T>)
@@ -121,9 +123,10 @@ static void mul(nvbench::state& state, nvbench::type_list<T>)
 
   state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) {
     const T scalar = startScalar;
-    thrust::transform(c.begin(), c.end(), b.begin(), [=] __device__ __host__(const T& ci) {
-      return ci * scalar;
-    });
+    thrust::transform(
+      c.begin(), c.end(), b.begin(), cuda::proclaim_copyable_arguments([=] __device__ __host__(const T& ci) {
+        return ci * scalar;
+      }));
   });
 }
 
@@ -145,9 +148,14 @@ static void add(nvbench::state& state, nvbench::type_list<T>)
   state.add_global_memory_writes<T>(n);
 
   state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) {
-    thrust::transform(a.begin(), a.end(), b.begin(), c.begin(), [] __device__ __host__(const T& ai, const T& bi) {
-      return ai + bi;
-    });
+    thrust::transform(
+      a.begin(),
+      a.end(),
+      b.begin(),
+      c.begin(),
+      cuda::proclaim_copyable_arguments([] _CCCL_DEVICE(const T& ai, const T& bi) -> T {
+        return ai + bi;
+      }));
   });
 }
 
@@ -170,9 +178,14 @@ static void triad(nvbench::state& state, nvbench::type_list<T>)
 
   state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) {
     const T scalar = startScalar;
-    thrust::transform(b.begin(), b.end(), c.begin(), a.begin(), [=] __device__ __host__(const T& bi, const T& ci) {
-      return bi + scalar * ci;
-    });
+    thrust::transform(
+      b.begin(),
+      b.end(),
+      c.begin(),
+      a.begin(),
+      cuda::proclaim_copyable_arguments([=] _CCCL_DEVICE(const T& bi, const T& ci) {
+        return bi + scalar * ci;
+      }));
   });
 }
 
@@ -199,9 +212,11 @@ static void nstream(nvbench::state& state, nvbench::type_list<T>)
       thrust::make_zip_iterator(a.begin(), b.begin(), c.begin()),
       thrust::make_zip_iterator(a.end(), b.end(), c.end()),
       a.begin(),
-      thrust::make_zip_function([=] __device__ __host__(const T& ai, const T& bi, const T& ci) {
-        return ai + bi + scalar * ci;
-      }));
+
+      thrust::make_zip_function(
+        cuda::proclaim_copyable_arguments([=] _CCCL_DEVICE(const T& ai, const T& bi, const T& ci) {
+          return ai + bi + scalar * ci;
+        })));
   });
 }
 
@@ -209,4 +224,35 @@ NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(element_types))
   .set_name("nstream")
   .set_type_axes_names({"T{ct}"})
   .add_int64_power_of_two_axis("Elements", array_size_powers);
+
+// variation of nstream requiring a stable parameter address because it recovers the element index
+template <typename T>
+static void nstream_stable(nvbench::state& state, nvbench::type_list<T>)
+{
+  const auto n = static_cast<std::size_t>(state.get_int64("Elements"));
+  thrust::device_vector<T> a(n, startA);
+  thrust::device_vector<T> b(n, startB);
+  thrust::device_vector<T> c(n, startC);
+
+  const T* a_start = thrust::raw_pointer_cast(a.data());
+  const T* b_start = thrust::raw_pointer_cast(b.data());
+  const T* c_start = thrust::raw_pointer_cast(c.data());
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(3 * n);
+  state.add_global_memory_writes<T>(n);
+
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) {
+    const T scalar = startScalar;
+    thrust::transform(a.begin(), a.end(), a.begin(), [=] _CCCL_DEVICE(const T& ai) {
+      const auto i = &ai - a_start;
+      return ai + b_start[i] + scalar * c_start[i];
+    });
+  });
+}
+
+NVBENCH_BENCH_TYPES(nstream_stable, NVBENCH_TYPE_AXES(element_types))
+  .set_name("nstream_stable")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements", array_size_powers);
 } // namespace babelstream
diff --git a/thrust/testing/address_stability.cu b/thrust/testing/address_stability.cu
new file mode 100644
index 00000000000..b9a4fc5f0ba
--- /dev/null
+++ b/thrust/testing/address_stability.cu
@@ -0,0 +1,24 @@
+#include <cuda/__functional/address_stability.h>
+
+#include <unittest/unittest.h>
+
+struct my_plus
+{
+  _CCCL_HOST_DEVICE auto operator()(int a, int b) const -> int
+  {
+    return a + b;
+  }
+};
+
+void TestAddressStability()
+{
+  using ::cuda::proclaim_copyable_arguments;
+  using ::cuda::proclaims_copyable_arguments;
+
+  static_assert(!proclaims_copyable_arguments<thrust::plus<int>>::value, "");
+  static_assert(proclaims_copyable_arguments<decltype(proclaim_copyable_arguments(thrust::plus<int>{}))>::value, "");
+
+  static_assert(!proclaims_copyable_arguments<my_plus>::value, "");
+  static_assert(proclaims_copyable_arguments<decltype(proclaim_copyable_arguments(my_plus{}))>::value, "");
+}
+DECLARE_UNITTEST(TestAddressStability);
diff --git a/thrust/testing/cuda/transform.cu b/thrust/testing/cuda/transform.cu
index bd9f8006db1..888264ffce2 100644
--- a/thrust/testing/cuda/transform.cu
+++ b/thrust/testing/cuda/transform.cu
@@ -344,3 +344,80 @@ void TestTransformBinaryCudaStreams()
   cudaStreamDestroy(s);
 }
 DECLARE_UNITTEST(TestTransformBinaryCudaStreams);
+
+struct sum_five
+{
+  _CCCL_HOST_DEVICE auto
+  operator()(std::int8_t a, std::int16_t b, std::int32_t c, std::int64_t d, float e) const -> double
+  {
+    return a + b + c + d + e;
+  }
+};
+
+// The following test cannot be compiled because of a bug in the conversion of thrust::tuple on MSVC 2017
+#ifndef _CCCL_COMPILER_MSVC_2017
+// we specialize zip_function for sum_five, but do nothing in the call operator so the test below would fail if the
+// zip_function is actually called (and not unwrapped)
+THRUST_NAMESPACE_BEGIN
+template <>
+class zip_function<sum_five>
+{
+public:
+  _CCCL_HOST_DEVICE zip_function(sum_five func)
+      : func(func)
+  {}
+
+  _CCCL_HOST_DEVICE sum_five& underlying_function() const
+  {
+    return func;
+  }
+
+  template <typename Tuple>
+  _CCCL_HOST_DEVICE auto
+  operator()(Tuple&& t) const -> decltype(detail::zip_detail::apply(std::declval<sum_five>(), THRUST_FWD(t)))
+  {
+    // not calling func, so we would get a wrong result if we were called
+    return {};
+  }
+
+private:
+  mutable sum_five func;
+};
+THRUST_NAMESPACE_END
+
+// test that the cuda_cub backend of Thrust unwraps zip_iterators/zip_functions into their input streams
+void TestTransformZipIteratorUnwrapping()
+{
+  constexpr int num_items = 100;
+  thrust::device_vector<std::int8_t> a(num_items, 1);
+  thrust::device_vector<std::int16_t> b(num_items, 2);
+  thrust::device_vector<std::int32_t> c(num_items, 3);
+  thrust::device_vector<std::int64_t> d(num_items, 4);
+  thrust::device_vector<float> e(num_items, 5);
+
+  thrust::device_vector<double> result(num_items);
+  // SECTION("once") // TODO(bgruber): enable sections when we migrate to Catch2
+  {
+    const auto z = thrust::make_zip_iterator(a.begin(), b.begin(), c.begin(), d.begin(), e.begin());
+    thrust::transform(z, z + num_items, result.begin(), thrust::make_zip_function(sum_five{}));
+
+    // compute reference and verify
+    thrust::device_vector<double> reference(num_items, 1 + 2 + 3 + 4 + 5);
+    ASSERT_EQUAL(reference, result);
+  }
+  // SECTION("trice")
+  {
+    const auto z = thrust::make_zip_iterator(
+      thrust::make_zip_iterator(thrust::make_zip_iterator(a.begin(), b.begin(), c.begin(), d.begin(), e.begin())));
+    thrust::transform(z,
+                      z + num_items,
+                      result.begin(),
+                      thrust::make_zip_function(thrust::make_zip_function(thrust::make_zip_function(sum_five{}))));
+
+    // compute reference and verify
+    thrust::device_vector<double> reference(num_items, 1 + 2 + 3 + 4 + 5);
+    ASSERT_EQUAL(reference, result);
+  }
+}
+DECLARE_UNITTEST(TestTransformZipIteratorUnwrapping);
+#endif // !_CCCL_COMPILER_MSVC_2017
diff --git a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
index 3f4f148f5d4..75e04d58306 100644
--- a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
+++ b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
@@ -50,6 +50,10 @@
 THRUST_NAMESPACE_BEGIN
 namespace cuda_cub
 {
+// Need a forward declaration here to work around a cyclic include, since "cuda/detail/transform.h" includes this header
+template <class Derived, class InputIt, class OutputIt, class TransformOp>
+OutputIt THRUST_FUNCTION
+transform(execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, TransformOp transform_op);
 
 namespace __copy
 {
diff --git a/thrust/thrust/system/cuda/detail/transform.h b/thrust/thrust/system/cuda/detail/transform.h
index 1926fb62473..d1f84241205 100644
--- a/thrust/thrust/system/cuda/detail/transform.h
+++ b/thrust/thrust/system/cuda/detail/transform.h
@@ -39,26 +39,32 @@
 #ifdef _CCCL_CUDA_COMPILER
 #  include <thrust/system/cuda/config.h>
 
-#  include <thrust/detail/type_traits/result_of_adaptable_function.h>
+#  include <cub/device/device_transform.cuh>
+
 #  include <thrust/distance.h>
+#  include <thrust/iterator/zip_iterator.h>
+#  include <thrust/system/cuda/detail/dispatch.h>
 #  include <thrust/system/cuda/detail/parallel_for.h>
 #  include <thrust/system/cuda/detail/util.h>
+#  include <thrust/zip_function.h>
+
+#  include <cuda/__functional/address_stability.h>
+
+#  include <cstdint>
 
 THRUST_NAMESPACE_BEGIN
 
 namespace cuda_cub
 {
-
 namespace __transform
 {
-
 struct no_stencil_tag
 {};
 
 struct always_true_predicate
 {
   template <class T>
-  bool THRUST_DEVICE_FUNCTION operator()(T const&) const
+  constexpr bool THRUST_DEVICE_FUNCTION operator()(T const&) const
   {
     return true;
   }
@@ -235,6 +241,63 @@ OutputIt _CCCL_HOST_DEVICE inline binary(
   return result + num_items;
 }
 
+_CCCL_EXEC_CHECK_DISABLE
+template <class Derived, class Offset, class... InputIts, class OutputIt, class TransformOp>
+OutputIt THRUST_FUNCTION cub_transform_many(
+  execution_policy<Derived>& policy,
+  ::cuda::std::tuple<InputIts...> firsts,
+  OutputIt result,
+  Offset num_items,
+  TransformOp transform_op)
+{
+  if (num_items == 0)
+  {
+    return result;
+  }
+
+  constexpr auto requires_stable_address = !::cuda::proclaims_copyable_arguments<TransformOp>::value;
+
+  cudaError_t status;
+  THRUST_INDEX_TYPE_DISPATCH(
+    status,
+    (cub::detail::transform::dispatch_t<requires_stable_address,
+                                        decltype(num_items_fixed),
+                                        ::cuda::std::tuple<InputIts...>,
+                                        OutputIt,
+                                        TransformOp>::dispatch),
+    num_items,
+    (firsts, result, num_items_fixed, transform_op, cuda_cub::stream(policy)));
+  throw_on_error(status, "transform: failed inside CUB");
+
+  status = cuda_cub::synchronize_optional(policy);
+  throw_on_error(status, "transform: failed to synchronize");
+
+  return result + num_items;
+}
+
+template <typename... Ts, std::size_t... Is>
+THRUST_FUNCTION auto
+convert_to_std_tuple(tuple<Ts...> t, ::cuda::std::index_sequence<Is...>) -> ::cuda::std::tuple<Ts...>
+{
+  return ::cuda::std::tuple<Ts...>{get<Is>(t)...};
+}
+
+// unwrap zip_iterator and zip_function into their underlying iterators so cub::DeviceTransform can optimize them
+template <class Derived, class Offset, class... InputIts, class OutputIt, class TransformOp>
+OutputIt THRUST_FUNCTION cub_transform_many(
+  execution_policy<Derived>& policy,
+  ::cuda::std::tuple<zip_iterator<tuple<InputIts...>>> firsts,
+  OutputIt result,
+  Offset num_items,
+  zip_function<TransformOp> transform_op)
+{
+  return cub_transform_many(
+    policy,
+    convert_to_std_tuple(get<0>(firsts).get_iterator_tuple(), ::cuda::std::index_sequence_for<InputIts...>{}),
+    result,
+    num_items,
+    transform_op.underlying_function());
+}
 } // namespace __transform
 
 //-------------------------
@@ -276,7 +339,15 @@ template <class Derived, class InputIt, class OutputIt, class TransformOp>
 OutputIt THRUST_FUNCTION
 transform(execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, TransformOp transform_op)
 {
-  return cuda_cub::transform_if(policy, first, last, result, transform_op, __transform::always_true_predicate());
+  THRUST_CDP_DISPATCH(
+    (using size_type      = typename iterator_traits<InputIt>::difference_type;
+     const auto num_items = static_cast<size_type>(thrust::distance(first, last));
+     return __transform::cub_transform_many(policy, ::cuda::std::make_tuple(first), result, num_items, transform_op);),
+    (while (first != last) {
+      *result = transform_op(raw_reference_cast(*first));
+      ++first;
+      ++result;
+    } return result;));
 } // func transform
 
 //-------------------------
@@ -314,17 +385,18 @@ OutputIt THRUST_FUNCTION transform(
   OutputIt result,
   TransformOp transform_op)
 {
-  return cuda_cub::transform_if(
-    policy,
-    first1,
-    last1,
-    first2,
-    __transform::no_stencil_tag(),
-    result,
-    transform_op,
-    __transform::always_true_predicate());
-} // func transform
-
+  THRUST_CDP_DISPATCH(
+    (using size_type      = typename iterator_traits<InputIt1>::difference_type;
+     const auto num_items = static_cast<size_type>(thrust::distance(first1, last1));
+     return __transform::cub_transform_many(
+       policy, ::cuda::std::make_tuple(first1, first2), result, num_items, transform_op);),
+    (while (first1 != last1) {
+      *result = transform_op(raw_reference_cast(*first1), raw_reference_cast(*first2));
+      ++first1;
+      ++first2;
+      ++result;
+    } return result;));
+}
 } // namespace cuda_cub
 
 THRUST_NAMESPACE_END