NVIDIA · miscco · Nov 17, 2025 · Dec 4, 2025 · Dec 4, 2025 · Dec 15, 2025
@@ -21,6 +21,8 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/__stream/get_stream.h>
+#include <cuda/__stream/stream_ref.h>
 #include <cuda/std/__concepts/concept_macros.h>
 #include <cuda/std/__execution/env.h>
 #include <cuda/std/__execution/policy.h>
@@ -45,8 +47,10 @@ struct any_execution_policy
   _CCCL_HIDE_FROM_ABI any_execution_policy() = default;
 
   template <uint32_t _Policy>
-  _CCCL_HOST_API constexpr any_execution_policy(::cuda::std::execution::__execution_policy_base<_Policy>) noexcept
+  _CCCL_HOST_API constexpr any_execution_policy(
+    const ::cuda::std::execution::__execution_policy_base<_Policy>& __pol) noexcept
       : value(value_type{_Policy})
+      , stream(::cuda::std::execution::__query_or(__pol, ::cuda::get_stream, ::cuda::stream_ref{cudaStreamPerThread}))
   {}
 
   _CCCL_HOST_API constexpr operator __execution_policy() const noexcept
@@ -89,7 +93,15 @@ struct any_execution_policy
   }
 #endif // _CCCL_STD_VER <= 2017
 
-  __execution_policy value = __execution_policy::__invalid_execution_policy;
+#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
+  [[nodiscard]] _CCCL_HOST_API ::cuda::stream_ref get_stream() const noexcept
+  {
+    return stream;
+  }
+#endif // _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
+
+  __execution_policy value  = __execution_policy::__invalid_execution_policy;
+  ::cuda::stream_ref stream = ::cuda::stream_ref{cudaStreamPerThread};
 };
 
 struct get_execution_policy_t;

@@ -23,17 +23,204 @@
 #if _CCCL_HAS_BACKEND_CUDA()
 
 #  include <cuda/__fwd/execution_policy.h>
+#  include <cuda/__memory_resource/device_memory_pool.h>
+#  include <cuda/__memory_resource/get_memory_resource.h>
+#  include <cuda/__memory_resource/resource.h>
+#  include <cuda/__stream/get_stream.h>
+#  include <cuda/__stream/stream_ref.h>
 #  include <cuda/std/__execution/policy.h>
 #  include <cuda/std/__type_traits/is_execution_policy.h>
+#  include <cuda/std/__utility/forward.h>
 
 #  include <cuda/std/__cccl/prologue.h>
 
 _CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION
 
+template <bool _HasStream>
+struct __policy_stream_holder
+{
+  ::cuda::stream_ref __stream_;
+
+  _CCCL_HOST_API constexpr __policy_stream_holder(::cuda::stream_ref __stream) noexcept
+      : __stream_(__stream)
+  {}
+};
+
+template <>
+struct __policy_stream_holder<false>
+{
+  _CCCL_HIDE_FROM_ABI __policy_stream_holder() = default;
+
+  //! @brief Dummy constructor to simplify implementation of the cuda policy
+  _CCCL_HOST_API constexpr __policy_stream_holder(::cuda::stream_ref) noexcept {}
+};
+
+template <bool _HasResource>
+struct __policy_memory_resource_holder
+{
+  using __resource_t = ::cuda::mr::any_resource<::cuda::mr::device_accessible>;
+
+  __resource_t __resource_;
+
+  _CCCL_TEMPLATE(class _Resource)
+  _CCCL_REQUIRES(::cuda::mr::resource_with<_Resource, ::cuda::mr::device_accessible>)
+  _CCCL_HOST_API constexpr __policy_memory_resource_holder(_Resource&& __resource) noexcept
+      : __resource_(::cuda::std::forward<_Resource>(__resource))
+  {}
+};
+
+template <>
+struct __policy_memory_resource_holder<false>
+{
+  _CCCL_HIDE_FROM_ABI __policy_memory_resource_holder() = default;
+
+  //! @brief Dummy constructor to simplify implementation of the cuda policy
+  _CCCL_TEMPLATE(class _Resource)
+  _CCCL_REQUIRES(::cuda::mr::resource_with<_Resource, ::cuda::mr::device_accessible>)
+  _CCCL_HOST_API constexpr __policy_memory_resource_holder(_Resource&&) noexcept {}
+};
+
 template <uint32_t _Policy>
 struct _CCCL_DECLSPEC_EMPTY_BASES __execution_policy_base<_Policy, __execution_backend::__cuda>
     : __execution_policy_base<_Policy, __execution_backend::__none>
-{};
+    , protected __policy_stream_holder<__cuda_policy_with_stream<_Policy>>
+    , protected __policy_memory_resource_holder<__cuda_policy_with_memory_resource<_Policy>>
+{
+private:
+  template <uint32_t, __execution_backend>
+  friend struct __execution_policy_base;
+
+  using __stream_holder   = __policy_stream_holder<__cuda_policy_with_stream<_Policy>>;
+  using __resource_holder = __policy_memory_resource_holder<__cuda_policy_with_memory_resource<_Policy>>;
+
+  template <uint32_t _OtherPolicy>
+  _CCCL_HOST_API constexpr __execution_policy_base(
+    const __execution_policy_base<_OtherPolicy, __execution_backend::__cuda>& __policy) noexcept
+      : __stream_holder(__policy.query(::cuda::get_stream))
+      , __resource_holder(__policy.query(::cuda::mr::get_memory_resource))
+  {}
+
+  template <uint32_t _OtherPolicy>
+  _CCCL_HOST_API constexpr __execution_policy_base(
+    const __execution_policy_base<_OtherPolicy, __execution_backend::__cuda>& __policy,
+    ::cuda::stream_ref __stream) noexcept
+      : __stream_holder(__stream)
+      , __resource_holder(__policy.query(::cuda::mr::get_memory_resource))
+  {}
+
+  template <uint32_t _OtherPolicy, class _Resource>
+  _CCCL_HOST_API constexpr __execution_policy_base(
+    const __execution_policy_base<_OtherPolicy, __execution_backend::__cuda>& __policy, _Resource&& __resource) noexcept
+      : __stream_holder(__policy.query(::cuda::get_stream))
+      , __resource_holder(::cuda::std::forward<_Resource>(__resource))
+  {}
+
+public:
+  _CCCL_HIDE_FROM_ABI constexpr __execution_policy_base() noexcept = default;
+
+  //! @brief Convert to a policy that holds a stream
+  //! @note This cannot be merged with the other case where we already have a stream as this needs to be const qualified
+  _CCCL_TEMPLATE(bool _WithStream = __cuda_policy_with_stream<_Policy>)
+  _CCCL_REQUIRES((!_WithStream))
+  [[nodiscard]] _CCCL_HOST_API auto set_stream(::cuda::stream_ref __stream) const noexcept
+  {
+    constexpr uint32_t __new_policy = __set_cuda_backend_option<_Policy, __cuda_backend_options::__with_stream>;
+    return __execution_policy_base<__new_policy>{*this, __stream};
+  }
+
+  //! @brief Set the current stream
+  _CCCL_TEMPLATE(bool _WithStream = __cuda_policy_with_stream<_Policy>)
+  _CCCL_REQUIRES(_WithStream)
+  [[nodiscard]] _CCCL_HOST_API __execution_policy_base& set_stream(::cuda::stream_ref __stream) noexcept
+  {
+    this->__stream_ = __stream;
+    return *this;
+  }
+
+  //! @brief Return the stream stored in the holder or a default stream
+  [[nodiscard]] _CCCL_HOST_API ::cuda::stream_ref query(const ::cuda::get_stream_t&) const noexcept
+  {
+    if constexpr (__cuda_policy_with_stream<_Policy>)
+    {
+      return this->__stream_;
+    }
+    else
+    {
+      return ::cuda::stream_ref{cudaStreamPerThread};
+    }
+  }
+
+  //! @brief Set the current memory resource
+  _CCCL_TEMPLATE(class _Resource, bool _WithResource = __cuda_policy_with_memory_resource<_Policy>)
+  _CCCL_REQUIRES(::cuda::mr::resource_with<_Resource, ::cuda::mr::device_accessible> _CCCL_AND _WithResource)
+  [[nodiscard]] _CCCL_HOST_API __execution_policy_base& set_memory_resource(_Resource&& __resource) noexcept
+  {
+    this->__resource_ = __resource;
+    return *this;
+  }
+
+  //! @brief Convert to a policy that holds a memory resource
+  _CCCL_TEMPLATE(class _Resource, bool _WithResource = __cuda_policy_with_memory_resource<_Policy>)
+  _CCCL_REQUIRES(::cuda::mr::resource_with<_Resource, ::cuda::mr::device_accessible> _CCCL_AND(!_WithResource))
+  [[nodiscard]] _CCCL_HOST_API auto set_memory_resource(_Resource&& __resource) const noexcept
+  {
+    constexpr uint32_t __new_policy =
+      __set_cuda_backend_option<_Policy, __cuda_backend_options::__with_memory_resource>;
+    return __execution_policy_base<__new_policy>{*this, __resource};
+  }
+
+  //! @brief Return either a stored or a default memory resource
+  //! @note We cannot put that into the __policy_memory_resource_holder because we need a stream for the device
+  [[nodiscard]] _CCCL_HOST_API auto query(const ::cuda::mr::get_memory_resource_t&) const noexcept
+  {
+    if constexpr (__cuda_policy_with_memory_resource<_Policy>)
+    {
+      return this->__resource_;
+    }
+    else
+    {
+      ::cuda::stream_ref __stream = this->query(::cuda::get_stream);
+      return ::cuda::device_default_memory_pool(__stream.device());
+    }
+  }
+
+  template <uint32_t _OtherPolicy, __execution_backend _OtherBackend>
+  [[nodiscard]] _CCCL_API friend constexpr bool operator==(
+    const __execution_policy_base& __lhs, const __execution_policy_base<_OtherPolicy, _OtherBackend>& __rhs) noexcept
+  {
+    if constexpr (_Policy != _OtherPolicy)
+    {
+      return false;
+    }
+
+    if constexpr (__cuda_policy_with_stream<_Policy>)
+    {
+      if (__lhs.query(::cuda::get_stream) != __rhs.query(::cuda::get_stream))
+      {
+        return false;
+      }
+    }
+
+    if constexpr (__cuda_policy_with_memory_resource<_Policy>)
+    {
+      if (__lhs.query(::cuda::mr::get_memory_resource) != __rhs.query(::cuda::mr::get_memory_resource))
+      {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+#  if _CCCL_STD_VER <= 2017
+  template <uint32_t _OtherPolicy, __execution_backend _OtherBackend>
+  [[nodiscard]] _CCCL_API friend constexpr bool operator!=(
+    const __execution_policy_base& __lhs, const __execution_policy_base<_OtherPolicy, _OtherBackend>& __rhs) noexcept
+  {
+    return !(__lhs == __rhs);
+  }
+#  endif // _CCCL_STD_VER <= 2017
+};
 
 _CCCL_END_NAMESPACE_CUDA_STD_EXECUTION
 

@@ -28,6 +28,12 @@
 
 _CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION
 
+enum __cuda_backend_options : uint16_t
+{
+  __with_stream          = 1 << 0, ///> Determines whether the policy holds a stream
+  __with_memory_resource = 1 << 1, ///> Determines whether the policy holds a memory resource
+};
+
 //! @brief Sets the execution backend to cuda
 template <uint32_t _Policy>
 [[nodiscard]] _CCCL_API constexpr uint32_t __with_cuda_backend() noexcept
@@ -38,6 +44,26 @@ template <uint32_t _Policy>
   return __new_policy;
 }
 
+//! @brief Backend specific options of the CUDA backend
+template <uint32_t _Policy>
+inline constexpr __cuda_backend_options __policy_to_cuda_backend_options =
+  static_cast<__cuda_backend_options>((_Policy & uint32_t{0xFFFF0000}) >> 16);
+
+//! @brief Sets a backend specific option
+template <uint32_t _Policy, __cuda_backend_options __option>
+inline constexpr uint32_t __set_cuda_backend_option =
+  _Policy | static_cast<uint32_t>(static_cast<uint32_t>(__option) << 16);
+
+//! @brief Detects whether a given policy holds a user provided stream
+template <uint32_t _Policy>
+inline constexpr bool __cuda_policy_with_stream =
+  __policy_to_cuda_backend_options<_Policy> & __cuda_backend_options::__with_stream;
+
+//! @brief Detects whether a given policy holds a user provided memory resource
+template <uint32_t _Policy>
+inline constexpr bool __cuda_policy_with_memory_resource =
+  __policy_to_cuda_backend_options<_Policy> & __cuda_backend_options::__with_memory_resource;
+
 _CCCL_END_NAMESPACE_CUDA_STD_EXECUTION
 
 #  include <cuda/std/__cccl/epilogue.h>

@@ -30,6 +30,7 @@
 
 _CCCL_BEGIN_NAMESPACE_CUDA_STD
 
+_CCCL_EXEC_CHECK_DISABLE
 template <class _InputIterator, class _Tp, class _BinaryOp>
 [[nodiscard]] _CCCL_API constexpr _Tp reduce(_InputIterator __first, _InputIterator __last, _Tp __init, _BinaryOp __b)
 {

@@ -27,10 +27,12 @@
 
 #  include <cuda/__execution/policy.h>
 #  include <cuda/__runtime/api_wrapper.h>
+#  include <cuda/__stream/get_stream.h>
 #  include <cuda/__stream/stream_ref.h>
 #  include <cuda/std/__algorithm/for_each_n.h>
 #  include <cuda/std/__exception/cuda_error.h>
 #  include <cuda/std/__exception/terminate.h>
+#  include <cuda/std/__execution/env.h>
 #  include <cuda/std/__execution/policy.h>
 #  include <cuda/std/__iterator/iterator_traits.h>
 #  include <cuda/std/__pstl/dispatch.h>
@@ -54,7 +56,8 @@ struct __pstl_dispatch<__pstl_algorithm::__for_each_n, __execution_backend::__cu
   __par_impl([[maybe_unused]] _Policy __policy, _Iter __first, _Size __orig_n, _Fn __func) noexcept
   {
     const auto __count = ::cuda::std::__convert_to_integral(__orig_n);
-    ::cuda::stream_ref __stream{cudaStreamPerThread};
+
+    auto __stream = __policy.query(::cuda::get_stream);
 
     _CCCL_TRY_CUDA_API(
       ::cub::DeviceFor::ForEachN,