diff --git a/cudax/include/cuda/experimental/__execution/policy.cuh b/cudax/include/cuda/experimental/__execution/policy.cuh
index d004dfa06a5..d7296024d8e 100644
--- a/cudax/include/cuda/experimental/__execution/policy.cuh
+++ b/cudax/include/cuda/experimental/__execution/policy.cuh
@@ -21,6 +21,8 @@
 #  pragma system_header
 #endif // no system header
 
+#include
+#include
 #include
 #include
 #include
@@ -45,8 +47,10 @@ struct any_execution_policy
   _CCCL_HIDE_FROM_ABI any_execution_policy() = default;
 
   template <uint32_t _Policy>
-  _CCCL_HOST_API constexpr any_execution_policy(::cuda::std::execution::__execution_policy_base<_Policy>) noexcept
+  _CCCL_HOST_API constexpr any_execution_policy(
+    const ::cuda::std::execution::__execution_policy_base<_Policy>& __pol) noexcept
       : value(value_type{_Policy})
+      , stream(::cuda::std::execution::__query_or(__pol, ::cuda::get_stream, ::cuda::stream_ref{cudaStreamPerThread}))
   {}
 
   _CCCL_HOST_API constexpr operator __execution_policy() const noexcept
@@ -89,7 +93,15 @@ struct any_execution_policy
   }
 #endif // _CCCL_STD_VER <= 2017
 
-  __execution_policy value = __execution_policy::__invalid_execution_policy;
+#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
+  [[nodiscard]] _CCCL_HOST_API ::cuda::stream_ref get_stream() const noexcept
+  {
+    return stream;
+  }
+#endif // _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
+
+  __execution_policy value  = __execution_policy::__invalid_execution_policy;
+  ::cuda::stream_ref stream = ::cuda::stream_ref{cudaStreamPerThread};
 };
 
 struct get_execution_policy_t;
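A usage sketch of the behavior added above (illustrative, not part of the patch; the umbrella header and the exact namespace of any_execution_policy are assumptions, while set_stream and __cub_par_unseq come from the libcu++ changes below):

  #include <cuda/experimental/execution.cuh> // assumed entry point
  #include <cassert>

  void snapshot(cuda::stream_ref stream)
  {
    // Attach a stream before type erasure ...
    auto pol = cuda::execution::__cub_par_unseq.set_stream(stream);
    cuda::experimental::execution::any_execution_policy any = pol;

    // ... and the erased policy still reports it. Without set_stream the
    // fallback is cudaStreamPerThread, matching the member initializer.
    assert(any.get_stream() == stream);
  }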
diff --git a/libcudacxx/include/cuda/__execution/policy.h b/libcudacxx/include/cuda/__execution/policy.h
index 3613a4099e9..ce09b0d8b22 100644
--- a/libcudacxx/include/cuda/__execution/policy.h
+++ b/libcudacxx/include/cuda/__execution/policy.h
@@ -23,17 +23,204 @@
 #if _CCCL_HAS_BACKEND_CUDA()
 
 #  include
+#  include
+#  include
+#  include
+#  include
+#  include
 #  include
 #  include
+#  include
 #  include
 
 _CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION
 
+template <bool _WithStream>
+struct __policy_stream_holder
+{
+  ::cuda::stream_ref __stream_;
+
+  _CCCL_HOST_API constexpr __policy_stream_holder(::cuda::stream_ref __stream) noexcept
+      : __stream_(__stream)
+  {}
+};
+
+template <>
+struct __policy_stream_holder<false>
+{
+  _CCCL_HIDE_FROM_ABI __policy_stream_holder() = default;
+
+  //! @brief Dummy constructor to simplify the implementation of the cuda policy
+  _CCCL_HOST_API constexpr __policy_stream_holder(::cuda::stream_ref) noexcept {}
+};
+
+template <bool _WithResource>
+struct __policy_memory_resource_holder
+{
+  using __resource_t = ::cuda::mr::any_resource<::cuda::mr::device_accessible>;
+
+  __resource_t __resource_;
+
+  _CCCL_TEMPLATE(class _Resource)
+  _CCCL_REQUIRES(::cuda::mr::resource_with<_Resource, ::cuda::mr::device_accessible>)
+  _CCCL_HOST_API constexpr __policy_memory_resource_holder(_Resource&& __resource) noexcept
+      : __resource_(::cuda::std::forward<_Resource>(__resource))
+  {}
+};
+
+template <>
+struct __policy_memory_resource_holder<false>
+{
+  _CCCL_HIDE_FROM_ABI __policy_memory_resource_holder() = default;
+
+  //! @brief Dummy constructor to simplify the implementation of the cuda policy
+  _CCCL_TEMPLATE(class _Resource)
+  _CCCL_REQUIRES(::cuda::mr::resource_with<_Resource, ::cuda::mr::device_accessible>)
+  _CCCL_HOST_API constexpr __policy_memory_resource_holder(_Resource&&) noexcept {}
+};
+
+template <uint32_t _Policy>
 struct _CCCL_DECLSPEC_EMPTY_BASES __execution_policy_base<_Policy, __execution_backend::__cuda>
     : __execution_policy_base<_Policy, __execution_backend::__none>
-{};
+    , protected __policy_stream_holder<__cuda_policy_with_stream<_Policy>>
+    , protected __policy_memory_resource_holder<__cuda_policy_with_memory_resource<_Policy>>
+{
+private:
+  template <uint32_t, __execution_backend>
+  friend struct __execution_policy_base;
+
+  using __stream_holder   = __policy_stream_holder<__cuda_policy_with_stream<_Policy>>;
+  using __resource_holder = __policy_memory_resource_holder<__cuda_policy_with_memory_resource<_Policy>>;
+
+  template <uint32_t _OtherPolicy>
+  _CCCL_HOST_API constexpr __execution_policy_base(
+    const __execution_policy_base<_OtherPolicy, __execution_backend::__cuda>& __policy) noexcept
+      : __stream_holder(__policy.query(::cuda::get_stream))
+      , __resource_holder(__policy.query(::cuda::mr::get_memory_resource))
+  {}
+
+  template <uint32_t _OtherPolicy>
+  _CCCL_HOST_API constexpr __execution_policy_base(
+    const __execution_policy_base<_OtherPolicy, __execution_backend::__cuda>& __policy,
+    ::cuda::stream_ref __stream) noexcept
+      : __stream_holder(__stream)
+      , __resource_holder(__policy.query(::cuda::mr::get_memory_resource))
+  {}
+
+  template <uint32_t _OtherPolicy, class _Resource>
+  _CCCL_HOST_API constexpr __execution_policy_base(
+    const __execution_policy_base<_OtherPolicy, __execution_backend::__cuda>& __policy, _Resource&& __resource) noexcept
+      : __stream_holder(__policy.query(::cuda::get_stream))
+      , __resource_holder(::cuda::std::forward<_Resource>(__resource))
+  {}
+
+public:
+  _CCCL_HIDE_FROM_ABI constexpr __execution_policy_base() noexcept = default;
+
+  //! @brief Convert to a policy that holds a stream
+  //! @note This cannot be merged with the overload that already holds a stream, because this one must be
+  //! const-qualified
+  _CCCL_TEMPLATE(bool _WithStream = __cuda_policy_with_stream<_Policy>)
+  _CCCL_REQUIRES((!_WithStream))
+  [[nodiscard]] _CCCL_HOST_API auto set_stream(::cuda::stream_ref __stream) const noexcept
+  {
+    constexpr uint32_t __new_policy = __set_cuda_backend_option<_Policy, __cuda_backend_options::__with_stream>;
+    return __execution_policy_base<__new_policy>{*this, __stream};
+  }
+
+  //! @brief Set the current stream
+  _CCCL_TEMPLATE(bool _WithStream = __cuda_policy_with_stream<_Policy>)
+  _CCCL_REQUIRES(_WithStream)
+  [[nodiscard]] _CCCL_HOST_API __execution_policy_base& set_stream(::cuda::stream_ref __stream) noexcept
+  {
+    this->__stream_ = __stream;
+    return *this;
+  }
+
+  //! @brief Return the stream stored in the holder, or the default stream
+  [[nodiscard]] _CCCL_HOST_API ::cuda::stream_ref query(const ::cuda::get_stream_t&) const noexcept
+  {
+    if constexpr (__cuda_policy_with_stream<_Policy>)
+    {
+      return this->__stream_;
+    }
+    else
+    {
+      return ::cuda::stream_ref{cudaStreamPerThread};
+    }
+  }
+
+  //! @brief Set the current memory resource
+  _CCCL_TEMPLATE(class _Resource, bool _WithResource = __cuda_policy_with_memory_resource<_Policy>)
+  _CCCL_REQUIRES(::cuda::mr::resource_with<_Resource, ::cuda::mr::device_accessible> _CCCL_AND _WithResource)
+  [[nodiscard]] _CCCL_HOST_API __execution_policy_base& set_memory_resource(_Resource&& __resource) noexcept
+  {
+    this->__resource_ = ::cuda::std::forward<_Resource>(__resource);
+    return *this;
+  }
+
+  //! @brief Convert to a policy that holds a memory resource
+  _CCCL_TEMPLATE(class _Resource, bool _WithResource = __cuda_policy_with_memory_resource<_Policy>)
+  _CCCL_REQUIRES(::cuda::mr::resource_with<_Resource, ::cuda::mr::device_accessible> _CCCL_AND(!_WithResource))
+  [[nodiscard]] _CCCL_HOST_API auto set_memory_resource(_Resource&& __resource) const noexcept
+  {
+    constexpr uint32_t __new_policy =
+      __set_cuda_backend_option<_Policy, __cuda_backend_options::__with_memory_resource>;
+    return __execution_policy_base<__new_policy>{*this, ::cuda::std::forward<_Resource>(__resource)};
+  }
+
+  //! @brief Return either the stored or a default memory resource
+  //! @note We cannot put this into __policy_memory_resource_holder because we need a stream to determine the device
+  [[nodiscard]] _CCCL_HOST_API auto query(const ::cuda::mr::get_memory_resource_t&) const noexcept
+  {
+    if constexpr (__cuda_policy_with_memory_resource<_Policy>)
+    {
+      return this->__resource_;
+    }
+    else
+    {
+      ::cuda::stream_ref __stream = this->query(::cuda::get_stream);
+      return ::cuda::device_default_memory_pool(__stream.device());
+    }
+  }
+
+  template <uint32_t _OtherPolicy, __execution_backend _OtherBackend>
+  [[nodiscard]] _CCCL_API friend constexpr bool operator==(
+    const __execution_policy_base& __lhs, const __execution_policy_base<_OtherPolicy, _OtherBackend>& __rhs) noexcept
+  {
+    if constexpr (_Policy != _OtherPolicy)
+    {
+      return false;
+    }
+
+    if constexpr (__cuda_policy_with_stream<_Policy>)
+    {
+      if (__lhs.query(::cuda::get_stream) != __rhs.query(::cuda::get_stream))
+      {
+        return false;
+      }
+    }
+
+    if constexpr (__cuda_policy_with_memory_resource<_Policy>)
+    {
+      if (__lhs.query(::cuda::mr::get_memory_resource) != __rhs.query(::cuda::mr::get_memory_resource))
+      {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+#  if _CCCL_STD_VER <= 2017
+  template <uint32_t _OtherPolicy, __execution_backend _OtherBackend>
+  [[nodiscard]] _CCCL_API friend constexpr bool operator!=(
+    const __execution_policy_base& __lhs, const __execution_policy_base<_OtherPolicy, _OtherBackend>& __rhs) noexcept
+  {
+    return !(__lhs == __rhs);
+  }
+#  endif // _CCCL_STD_VER <= 2017
+};
 
 _CCCL_END_NAMESPACE_CUDA_STD_EXECUTION
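The two overload pairs are meant to be used like this (a sketch; headers omitted as in the includes above, and the assert usage is illustrative):

  void configure(cuda::stream_ref stream)
  {
    // Without a stored stream, set_stream is const and returns a *new* policy
    // type with the __with_stream bit set.
    auto with_stream = cuda::execution::__cub_par_unseq.set_stream(stream);

    // With a stored stream, set_stream mutates in place and returns *this.
    auto& same = with_stream.set_stream(stream);
    assert(&same == &with_stream);

    // Queries fall back to well defined defaults when nothing was attached.
    assert(with_stream.query(cuda::get_stream) == stream);
    assert(cuda::execution::__cub_par_unseq.query(cuda::get_stream)
           == cuda::stream_ref{cudaStreamPerThread});
  }

set_memory_resource follows the same convert-then-mutate pattern, with the default being the device's default memory pool.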
diff --git a/libcudacxx/include/cuda/__fwd/execution_policy.h b/libcudacxx/include/cuda/__fwd/execution_policy.h
index d0928b10291..749ad640def 100644
--- a/libcudacxx/include/cuda/__fwd/execution_policy.h
+++ b/libcudacxx/include/cuda/__fwd/execution_policy.h
@@ -28,6 +28,12 @@
 
 _CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION
 
+enum __cuda_backend_options : uint16_t
+{
+  __with_stream          = 1 << 0, ///< Determines whether the policy holds a stream
+  __with_memory_resource = 1 << 1, ///< Determines whether the policy holds a memory resource
+};
+
 //! @brief Sets the execution backend to cuda
 template <uint32_t _Policy>
 [[nodiscard]] _CCCL_API constexpr uint32_t __with_cuda_backend() noexcept
@@ -38,6 +44,26 @@ template <uint32_t _Policy>
   return __new_policy;
 }
 
+//! @brief Backend specific options of the CUDA backend
+template <uint32_t _Policy>
+inline constexpr __cuda_backend_options __policy_to_cuda_backend_options =
+  static_cast<__cuda_backend_options>((_Policy & uint32_t{0xFFFF0000}) >> 16);
+
+//! @brief Sets a backend specific option
+template <uint32_t _Policy, __cuda_backend_options __option>
+inline constexpr uint32_t __set_cuda_backend_option =
+  _Policy | static_cast<uint32_t>(static_cast<uint32_t>(__option) << 16);
+
+//! @brief Detects whether a given policy holds a user provided stream
+template <uint32_t _Policy>
+inline constexpr bool __cuda_policy_with_stream =
+  __policy_to_cuda_backend_options<_Policy> & __cuda_backend_options::__with_stream;
+
+//! @brief Detects whether a given policy holds a user provided memory resource
+template <uint32_t _Policy>
+inline constexpr bool __cuda_policy_with_memory_resource =
+  __policy_to_cuda_backend_options<_Policy> & __cuda_backend_options::__with_memory_resource;
+
 _CCCL_END_NAMESPACE_CUDA_STD_EXECUTION
 
 #  include
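The helpers above are plain bit packing: the low 16 bits of the policy value identify the policy and backend, the high 16 bits carry per-backend option flags. A standalone model with invented names (verified logic, not the library's actual code):

  #include <cstdint>

  enum backend_options : uint16_t
  {
    with_stream          = 1 << 0,
    with_memory_resource = 1 << 1,
  };

  // mirrors __set_cuda_backend_option
  constexpr uint32_t set_option(uint32_t policy, backend_options option)
  {
    return policy | (static_cast<uint32_t>(option) << 16);
  }

  // mirrors __policy_to_cuda_backend_options
  constexpr backend_options options_of(uint32_t policy)
  {
    return static_cast<backend_options>((policy & 0xFFFF0000u) >> 16);
  }

  static_assert(options_of(set_option(0u, with_stream)) & with_stream);
  static_assert(!(options_of(set_option(0u, with_stream)) & with_memory_resource));
  static_assert(options_of(set_option(set_option(0u, with_stream), with_memory_resource))
                == (with_stream | with_memory_resource));

Setting a bit that is already set is a no-op, which is what lets the in-place set_stream/set_memory_resource overloads keep the policy type unchanged.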
diff --git a/libcudacxx/include/cuda/std/__numeric/reduce.h b/libcudacxx/include/cuda/std/__numeric/reduce.h
index b032d4d779b..0db77794403 100644
--- a/libcudacxx/include/cuda/std/__numeric/reduce.h
+++ b/libcudacxx/include/cuda/std/__numeric/reduce.h
@@ -30,6 +30,7 @@
 
 _CCCL_BEGIN_NAMESPACE_CUDA_STD
 
+_CCCL_EXEC_CHECK_DISABLE
 template <class _InputIterator, class _Tp, class _BinaryOp>
 [[nodiscard]] _CCCL_API constexpr _Tp reduce(_InputIterator __first, _InputIterator __last, _Tp __init, _BinaryOp __b)
 {
diff --git a/libcudacxx/include/cuda/std/__pstl/cuda/for_each_n.h b/libcudacxx/include/cuda/std/__pstl/cuda/for_each_n.h
index 78a95b5e221..9007bcbb00f 100644
--- a/libcudacxx/include/cuda/std/__pstl/cuda/for_each_n.h
+++ b/libcudacxx/include/cuda/std/__pstl/cuda/for_each_n.h
@@ -27,10 +27,12 @@
 
 #  include
 #  include
+#  include
 #  include
 #  include
 #  include
 #  include
+#  include
 #  include
 #  include
 #  include
@@ -54,7 +56,8 @@ struct __pstl_dispatch<__pstl_algorithm::__for_each_n, __execution_backend::__cu
   __par_impl([[maybe_unused]] _Policy __policy, _Iter __first, _Size __orig_n, _Fn __func) noexcept
   {
     const auto __count = ::cuda::std::__convert_to_integral(__orig_n);
-    ::cuda::stream_ref __stream{cudaStreamPerThread};
+
+    auto __stream = __policy.query(::cuda::get_stream);
 
     _CCCL_TRY_CUDA_API(
       ::cub::DeviceFor::ForEachN,
diff --git a/libcudacxx/include/cuda/std/__pstl/cuda/reduce.h b/libcudacxx/include/cuda/std/__pstl/cuda/reduce.h
new file mode 100644
index 00000000000..482ded14c83
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__pstl/cuda/reduce.h
@@ -0,0 +1,184 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___PSTL_CUDA_REDUCE_H
+#define _CUDA_STD___PSTL_CUDA_REDUCE_H
+
+#include
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_HAS_BACKEND_CUDA()
+
+_CCCL_DIAG_PUSH
+_CCCL_DIAG_SUPPRESS_CLANG("-Wshadow")
+
+#  include
+
+_CCCL_DIAG_POP
+
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+
+#  include
+
+#  include
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION
+
+_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT
+
+template <>
+struct __pstl_dispatch<__pstl_algorithm::__reduce, __execution_backend::__cuda>
+{
+  //! Ensures we properly deallocate the memory allocated for the result
+  template <class _Tp, class _Resource>
+  struct __allocation_guard
+  {
+    //! This helper struct ensures that we can properly assign types with a nontrivial assignment operator
+    struct __construct_result
+    {
+      _Tp* __ptr_;
+
+      _CCCL_HOST_API __construct_result(_Tp* __ptr = nullptr) noexcept
+          : __ptr_(__ptr)
+      {}
+
+      template <class _Index, class _Up>
+      _CCCL_DEVICE_API void operator()(_Index, _Up&& __value)
+      {
+        ::cuda::std::__construct_at(__ptr_, ::cuda::std::forward<_Up>(__value));
+      }
+    };
+
+    ::cuda::stream_ref __stream_;
+    _Resource& __resource_;
+    _Tp* __ptr_;
+
+    _CCCL_HOST_API __allocation_guard(::cuda::stream_ref __stream, _Resource& __resource)
+        : __stream_(__stream)
+        , __resource_(__resource)
+        , __ptr_(static_cast<_Tp*>(__resource_.allocate(__stream_, sizeof(_Tp), alignof(_Tp))))
+    {}
+
+    _CCCL_HOST_API ~__allocation_guard()
+    {
+      __resource_.deallocate(__stream_, __ptr_, sizeof(_Tp), alignof(_Tp));
+      __stream_.sync();
+    }
+
+    [[nodiscard]] _CCCL_HOST_API auto __get_result_iter()
+    {
+      return ::cuda::tabulate_output_iterator{__construct_result{__ptr_}};
+    }
+  };
+
+  template <class _Policy, class _Iter, class _Tp, class _BinaryOp>
+  [[nodiscard]] _CCCL_HOST_API static _Tp
+  __par_impl(_Policy __policy, _Iter __first, _Iter __last, _Tp __init, _BinaryOp __func)
+  {
+    _Tp __ret;
+
+    {
+      // Allocate memory for result
+      auto __stream   = __policy.query(::cuda::get_stream);
+      auto __resource = __policy.query(::cuda::mr::get_memory_resource);
+      __allocation_guard<_Tp, decltype(__resource)> __guard{__stream, __resource};
+
+      const auto __count = ::cuda::std::distance(__first, __last);
+      _CCCL_TRY_CUDA_API(
+        ::cub::DeviceReduce::Reduce,
+        "__pstl_cuda_reduce: cub::DeviceReduce::Reduce failed",
+        ::cuda::std::move(__first),
+        __guard.__get_result_iter(),
+        __count,
+        ::cuda::std::move(__func),
+        ::cuda::std::move(__init),
+        ::cuda::std::move(__policy));
+
+      _CCCL_TRY_CUDA_API(
+        ::cudaMemcpyAsync,
+        "__pstl_cuda_reduce: copy of result from device to host failed",
+        ::cuda::std::addressof(__ret),
+        __guard.__ptr_,
+        sizeof(_Tp),
+        ::cudaMemcpyDeviceToHost,
+        __stream.get());
+    }
+
+    return __ret;
+  }
+
+  template <class _Policy, class _Iter, class _Tp, class _BinaryOp>
+  [[nodiscard]] _CCCL_HOST_API _Tp
+  operator()([[maybe_unused]] _Policy __policy, _Iter __first, _Iter __last, _Tp __init, _BinaryOp __func) const
+  {
+    if constexpr (::cuda::std::__has_random_access_traversal<_Iter>)
+    {
+      try
+      {
+        return __par_impl(
+          ::cuda::std::move(__policy),
+          ::cuda::std::move(__first),
+          ::cuda::std::move(__last),
+          ::cuda::std::move(__init),
+          ::cuda::std::move(__func));
+      }
+      catch (const ::cuda::cuda_error& __err)
+      {
+        if (__err.status() == cudaErrorMemoryAllocation)
+        {
+          ::cuda::std::__throw_bad_alloc();
+        }
+        else
+        {
+          throw __err;
+        }
+      }
+    }
+    else
+    {
+      static_assert(__always_false_v<_Policy>,
+                    "__pstl_dispatch: CUDA backend of cuda::std::reduce requires at least random access iterators");
+      return ::cuda::std::reduce(
+        ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__init), ::cuda::std::move(__func));
+    }
+  }
+};
+
+_CCCL_END_NAMESPACE_ARCH_DEPENDENT
+
+_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION
+
+#  include
+
+#endif // _CCCL_HAS_BACKEND_CUDA()
+
+#endif // _CUDA_STD___PSTL_CUDA_REDUCE_H
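The guard above is the usual stream-ordered RAII shape. A plain CUDA-runtime sketch of the same pattern (illustrative; the real code allocates through the policy's memory resource and placement-constructs the result through a tabulate_output_iterator so that types with non-trivial assignment are handled correctly):

  #include <cuda_runtime.h>

  struct result_guard
  {
    cudaStream_t stream_;
    void* ptr_ = nullptr;

    result_guard(cudaStream_t stream, size_t bytes)
        : stream_(stream)
    {
      cudaMallocAsync(&ptr_, bytes, stream_); // stream-ordered allocation
    }

    ~result_guard()
    {
      cudaFreeAsync(ptr_, stream_); // stream-ordered deallocation ...
      cudaStreamSynchronize(stream_); // ... then sync, so host-side reads of the result are safe
    }
  };

Because the destructor synchronizes after the enqueued cudaMemcpyAsync, __ret is fully written by the time __par_impl returns.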
diff --git a/libcudacxx/include/cuda/std/__pstl/reduce.h b/libcudacxx/include/cuda/std/__pstl/reduce.h
new file mode 100644
index 00000000000..467e47108d5
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__pstl/reduce.h
@@ -0,0 +1,115 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___PSTL_REDUCE_H
+#define _CUDA_STD___PSTL_REDUCE_H
+
+#include
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if !_CCCL_COMPILER(NVRTC)
+
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+
+#  if _CCCL_HAS_BACKEND_CUDA()
+#    include
+#  endif // _CCCL_HAS_BACKEND_CUDA()
+
+#  include
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD
+
+template <class _Iter, class _Tp, class _BinaryOp>
+_CCCL_CONCEPT __indirect_binary_function = _CCCL_REQUIRES_EXPR((_Iter, _Tp, _BinaryOp))(
+  requires(is_convertible_v<invoke_result_t<_BinaryOp&, _Tp, _Tp>, _Tp>),
+  requires(is_convertible_v<invoke_result_t<_BinaryOp&, _Tp, iter_reference_t<_Iter>>, _Tp>),
+  requires(is_convertible_v<invoke_result_t<_BinaryOp&, iter_reference_t<_Iter>, _Tp>, _Tp>),
+  requires(is_convertible_v<invoke_result_t<_BinaryOp&, iter_reference_t<_Iter>, iter_reference_t<_Iter>>, _Tp>));
+
+_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT
+
+_CCCL_TEMPLATE(class _Policy, class _Iter, class _Tp, class _BinaryOp)
+_CCCL_REQUIRES(__has_forward_traversal<_Iter> _CCCL_AND is_execution_policy_v<_Policy>)
+[[nodiscard]] _CCCL_HOST_API _Tp
+reduce([[maybe_unused]] _Policy __policy, _Iter __first, _Iter __last, _Tp __init, _BinaryOp __func)
+{
+  static_assert(__indirect_binary_function<_Iter, _Tp, _BinaryOp>,
+                "cuda::std::reduce: The return value of BinaryOp is not convertible to T.");
+  static_assert(is_move_constructible_v<_Tp>, "cuda::std::reduce: T must be move constructible.");
+  [[maybe_unused]] auto __dispatch =
+    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__reduce, _Policy>();
+  if constexpr (::cuda::std::execution::__pstl_can_dispatch)
+  {
+    return __dispatch(
+      __policy,
+      ::cuda::std::move(__first),
+      ::cuda::std::move(__last),
+      ::cuda::std::move(__init),
+      ::cuda::std::move(__func));
+  }
+  else
+  {
+    static_assert(__always_false_v<_Policy>, "Parallel cuda::std::reduce requires at least one selected backend");
+    return ::cuda::std::reduce(
+      ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__init), ::cuda::std::move(__func));
+  }
+}
+
+_CCCL_TEMPLATE(class _Policy, class _Iter, class _Tp)
+_CCCL_REQUIRES(__has_forward_traversal<_Iter> _CCCL_AND is_execution_policy_v<_Policy>)
+[[nodiscard]] _CCCL_HOST_API _Tp reduce(_Policy __policy, _Iter __first, _Iter __last, _Tp __init)
+{
+  return ::cuda::std::reduce(
+    ::cuda::std::move(__policy),
+    ::cuda::std::move(__first),
+    ::cuda::std::move(__last),
+    ::cuda::std::move(__init),
+    ::cuda::std::plus<>{});
+}
+
+_CCCL_TEMPLATE(class _Policy, class _Iter)
+_CCCL_REQUIRES(__has_forward_traversal<_Iter> _CCCL_AND is_execution_policy_v<_Policy>)
+[[nodiscard]] _CCCL_HOST_API iter_value_t<_Iter> reduce(_Policy __policy, _Iter __first, _Iter __last)
+{
+  return ::cuda::std::reduce(
+    ::cuda::std::move(__policy),
+    ::cuda::std::move(__first),
+    ::cuda::std::move(__last),
+    iter_value_t<_Iter>{},
+    ::cuda::std::plus<>{});
+}
+
+_CCCL_END_NAMESPACE_ARCH_DEPENDENT
+
+_CCCL_END_NAMESPACE_CUDA_STD
+
+#  include
+
+#endif // !_CCCL_COMPILER(NVRTC)
+
+#endif // _CUDA_STD___PSTL_REDUCE_H
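Front-end usage of the three overloads, sketched under the assumption that first/last point into device-accessible memory (headers omitted as above):

  void sums(cuda::stream_ref stream, int* first, int* last)
  {
    const auto policy = cuda::execution::__cub_par_unseq.set_stream(stream);

    int a = cuda::std::reduce(policy, first, last);     // init = int{}, op = plus<>
    int b = cuda::std::reduce(policy, first, last, 42); // op = plus<>
    int c = cuda::std::reduce(policy, first, last, 42, cuda::std::plus<>{});
    (void) a;
    (void) b;
    (void) c;
  }

The two shorter overloads forward to the four-argument one, so the __indirect_binary_function and move-constructibility checks apply to all of them, and the call synchronizes on the policy's stream before returning the value.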
diff --git a/libcudacxx/test/libcudacxx/CMakeLists.txt b/libcudacxx/test/libcudacxx/CMakeLists.txt
index 790f3648995..50b6c1c3b83 100644
--- a/libcudacxx/test/libcudacxx/CMakeLists.txt
+++ b/libcudacxx/test/libcudacxx/CMakeLists.txt
@@ -22,7 +22,9 @@ function(libcudacxx_add_test target_name_var source)
   cccl_configure_target(${target_name} DIALECT ${CMAKE_CUDA_STANDARD})
   target_include_directories(
     ${target_name}
-    PRIVATE "${libcudacxx_SOURCE_DIR}/test/libcudacxx/cuda/ccclrt/common"
+    PRIVATE
+      "${libcudacxx_SOURCE_DIR}/test/libcudacxx/cuda/ccclrt/common"
+      "${libcudacxx_SOURCE_DIR}/test/support"
   )
   target_link_libraries(
     ${target_name}
diff --git a/libcudacxx/test/libcudacxx/cuda/execution/execution_policy/get_memory_resource.pass.cpp b/libcudacxx/test/libcudacxx/cuda/execution/execution_policy/get_memory_resource.pass.cpp
new file mode 100644
index 00000000000..b646f4e2a43
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/execution/execution_policy/get_memory_resource.pass.cpp
@@ -0,0 +1,115 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+struct test_resource
+{
+  __host__ __device__ void* allocate_sync(std::size_t, std::size_t)
+  {
+    return nullptr;
+  }
+
+  __host__ __device__ void deallocate_sync(void* ptr, std::size_t, std::size_t) noexcept
+  {
+    // ensure that we did get the right inputs forwarded
+    _val = *static_cast<int*>(ptr);
+  }
+
+  __host__ __device__ void* allocate(cuda::stream_ref, std::size_t, std::size_t)
+  {
+    return &_val;
+  }
+
+  __host__ __device__ void deallocate(cuda::stream_ref, void* ptr, std::size_t, std::size_t)
+  {
+    // ensure that we did get the right inputs forwarded
+    _val = *static_cast<int*>(ptr);
+  }
+
+  __host__ __device__ bool operator==(const test_resource& other) const
+  {
+    return _val == other._val;
+  }
+  __host__ __device__ bool operator!=(const test_resource& other) const
+  {
+    return _val != other._val;
+  }
+
+  friend constexpr void get_property(const test_resource&, ::cuda::mr::device_accessible) noexcept {}
+
+  int _val = 0;
+};
+
+template <class Policy>
+void test(Policy pol)
+{
+  auto old_stream = ::cuda::get_stream(pol);
+  { // Ensure that the plain policy returns a well defined memory resource
+    auto expected_resource = ::cuda::device_default_memory_pool(cuda::device_ref{0});
+    assert(cuda::mr::get_memory_resource(pol) == expected_resource);
+  }
+
+  { // Ensure that we can attach a memory resource to an execution policy
+    test_resource resource{42};
+    auto pol_with_resource = pol.set_memory_resource(resource);
+    assert(cuda::mr::get_memory_resource(pol_with_resource) == resource);
+    assert(cuda::get_stream(pol_with_resource) == old_stream);
+
+    using policy_t = decltype(pol_with_resource);
+    static_assert(noexcept(pol.set_memory_resource(resource)));
+    static_assert(cuda::std::is_execution_policy_v<policy_t>);
+  }
+
+  { // Ensure that attaching a memory resource multiple times just overwrites the old one
+    test_resource resource{42};
+    auto pol_with_resource = pol.set_memory_resource(resource);
+    assert(cuda::mr::get_memory_resource(pol_with_resource) == resource);
+    assert(cuda::get_stream(pol_with_resource) == old_stream);
+
+    using policy_t = decltype(pol_with_resource);
+    test_resource other_resource{1337};
+    decltype(auto) pol_with_other_resource = pol_with_resource.set_memory_resource(other_resource);
+    static_assert(cuda::std::is_same_v<decltype(pol_with_other_resource), policy_t&>);
+    assert(::cuda::mr::get_memory_resource(pol_with_resource) == other_resource);
+    assert(::cuda::mr::get_memory_resource(pol_with_other_resource) == other_resource);
+    assert(cuda::std::addressof(pol_with_resource) == cuda::std::addressof(pol_with_other_resource));
+    assert(cuda::get_stream(pol_with_resource) == old_stream);
+  }
+}
+
+void test()
+{
+  namespace execution = cuda::std::execution;
+  static_assert(!execution::__queryable_with);
+  static_assert(!execution::__queryable_with);
+  static_assert(!execution::__queryable_with);
+  static_assert(!execution::__queryable_with);
+
+  test(cuda::execution::__cub_par_unseq);
+
+  // Ensure that all works even if we have a stream attached
+  test(cuda::execution::__cub_par_unseq.set_stream(::cuda::stream{cuda::device_ref{0}}));
+}
+
+int main(int, char**)
+{
+  NV_IF_TARGET(NV_IS_HOST, (test();))
+
+  return 0;
+}
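The pattern the test exercises, with a stock resource instead of the test double (a sketch):

  void use_pool()
  {
    // Any resource satisfying cuda::mr::resource_with<R, cuda::mr::device_accessible>
    // can be attached; the default device pool is the simplest example.
    auto pool   = cuda::device_default_memory_pool(cuda::device_ref{0});
    auto policy = cuda::execution::__cub_par_unseq.set_memory_resource(pool);

    // Temporary allocations made by algorithms run with `policy` (for example the
    // result slot in the reduce backend above) now come from `pool`.
    (void) policy;
  }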
diff --git a/libcudacxx/test/libcudacxx/cuda/execution/execution_policy/get_stream.pass.cpp b/libcudacxx/test/libcudacxx/cuda/execution/execution_policy/get_stream.pass.cpp
new file mode 100644
index 00000000000..bb8eae08be6
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/execution/execution_policy/get_stream.pass.cpp
@@ -0,0 +1,74 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+template <class Policy>
+void test(Policy pol)
+{
+  { // Ensure that the plain policy returns a well defined stream
+    cuda::stream_ref expected_stream{cudaStreamPerThread};
+    assert(cuda::get_stream(pol) == expected_stream);
+  }
+
+  { // Ensure that we can attach a stream to an execution policy
+    cuda::stream stream{cuda::device_ref{0}};
+    auto pol_with_stream = pol.set_stream(stream);
+    assert(cuda::get_stream(pol_with_stream) == stream);
+
+    using stream_policy_t = decltype(pol_with_stream);
+    static_assert(noexcept(pol.set_stream(stream)));
+    static_assert(cuda::std::is_execution_policy_v<stream_policy_t>);
+  }
+
+  { // Ensure that attaching a stream multiple times just overwrites the old stream
+    cuda::stream stream{cuda::device_ref{0}};
+    auto pol_with_stream = pol.set_stream(stream);
+    assert(cuda::get_stream(pol_with_stream) == stream);
+
+    using stream_policy_t = decltype(pol_with_stream);
+    cuda::stream other_stream{cuda::device_ref{0}};
+    decltype(auto) pol_with_other_stream = pol_with_stream.set_stream(other_stream);
+    static_assert(cuda::std::is_same_v<decltype(pol_with_other_stream), stream_policy_t&>);
+    assert(::cuda::get_stream(pol_with_stream) == other_stream);
+    assert(::cuda::get_stream(pol_with_other_stream) == other_stream);
+    assert(cuda::std::addressof(pol_with_stream) == cuda::std::addressof(pol_with_other_stream));
+  }
+}
+
+void test()
+{
+  namespace execution = cuda::std::execution;
+  static_assert(!execution::__queryable_with);
+  static_assert(!execution::__queryable_with);
+  static_assert(!execution::__queryable_with);
+  static_assert(!execution::__queryable_with);
+
+  test(cuda::execution::__cub_par_unseq);
+
+  // Ensure that all works even if we have a memory resource
+  test(cuda::execution::__cub_par_unseq.set_memory_resource(::cuda::device_default_memory_pool(::cuda::device_ref{0})));
+}
+
+int main(int, char**)
+{
+  NV_IF_TARGET(NV_IS_HOST, (test();))
+
+  return 0;
+}
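One lifetime caveat follows from the implementation: the policy stores a non-owning cuda::stream_ref, so the stream object must outlive every algorithm call that uses the policy. Sketch:

  void lifetime()
  {
    cuda::stream stream{cuda::device_ref{0}};
    const auto policy = cuda::execution::__cub_par_unseq.set_stream(stream);
    // ... run algorithms with `policy` while `stream` is alive ...
  } // stream and policy die together; keeping `policy` longer would leave a dangling stream_ref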
diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl_for_each.cu b/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl_for_each.cu
index 286324f2ec7..561cf5c42f0 100644
--- a/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl_for_each.cu
+++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl_for_each.cu
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -40,10 +41,24 @@ struct mark_present_for_each
 
 C2H_TEST("cuda::std::for_each", "[parallel algorithm]")
 {
-  thrust::device_vector<bool> res(size, false);
-  mark_present_for_each fn{thrust::raw_pointer_cast(res.data())};
+  SECTION("with default stream")
+  {
+    thrust::device_vector<bool> res(size, false);
+    mark_present_for_each fn{thrust::raw_pointer_cast(res.data())};
+
+    const auto policy = cuda::execution::__cub_par_unseq;
+    cuda::std::for_each(policy, cuda::counting_iterator{0}, cuda::counting_iterator{size}, fn);
+    CHECK(thrust::all_of(res.begin(), res.end(), cuda::std::identity{}));
+  }
 
-  const auto policy = cuda::execution::__cub_par_unseq;
-  cuda::std::for_each(policy, cuda::counting_iterator{0}, cuda::counting_iterator{size}, fn);
-  CHECK(thrust::all_of(res.begin(), res.end(), cuda::std::identity{}));
+  SECTION("with unique stream")
+  {
+    ::cuda::stream stream{::cuda::device_ref{0}};
+    thrust::device_vector<bool> res(size, false);
+    mark_present_for_each fn{thrust::raw_pointer_cast(res.data())};
+
+    const auto policy = cuda::execution::__cub_par_unseq.set_stream(stream);
+    cuda::std::for_each(policy, cuda::counting_iterator{0}, cuda::counting_iterator{size}, fn);
+    CHECK(thrust::all_of(res.begin(), res.end(), cuda::std::identity{}));
+  }
 }
diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl_for_each_n.cu b/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl_for_each_n.cu
index 60f17d59eaf..5346f210b1f 100644
--- a/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl_for_each_n.cu
+++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl_for_each_n.cu
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -40,10 +41,24 @@ struct mark_present_for_each
 
 C2H_TEST("cuda::std::for_each_n", "[parallel algorithm]")
 {
-  thrust::device_vector<bool> res(size, false);
-  mark_present_for_each fn{thrust::raw_pointer_cast(res.data())};
+  SECTION("with default stream")
+  {
+    thrust::device_vector<bool> res(size, false);
+    mark_present_for_each fn{thrust::raw_pointer_cast(res.data())};
+
+    const auto policy = cuda::execution::__cub_par_unseq;
+    cuda::std::for_each_n(policy, cuda::counting_iterator{0}, size, fn);
+    CHECK(thrust::all_of(res.begin(), res.end(), cuda::std::identity{}));
+  }
 
-  const auto policy = cuda::execution::__cub_par_unseq;
-  cuda::std::for_each_n(policy, cuda::counting_iterator{0}, size, fn);
-  CHECK(thrust::all_of(res.begin(), res.end(), cuda::std::identity{}));
+  SECTION("with unique stream")
+  {
+    ::cuda::stream stream{::cuda::device_ref{0}};
+    thrust::device_vector<bool> res(size, false);
+    mark_present_for_each fn{thrust::raw_pointer_cast(res.data())};
+
+    const auto policy = cuda::execution::__cub_par_unseq.set_stream(stream);
+    cuda::std::for_each_n(policy, cuda::counting_iterator{0}, size, fn);
+    CHECK(thrust::all_of(res.begin(), res.end(), cuda::std::identity{}));
+  }
 }
diff --git a/libcudacxx/test/libcudacxx/std/numerics/numeric.ops/reduce/pstl.reduce.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/numeric.ops/reduce/pstl.reduce.pass.cpp
new file mode 100644
index 00000000000..5859d0e94f1
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/numeric.ops/reduce/pstl.reduce.pass.cpp
@@ -0,0 +1,175 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+// XFAIL: true
+
+// template <class ExecutionPolicy, class ForwardIterator>
+//   typename iterator_traits<ForwardIterator>::value_type
+//     reduce(ExecutionPolicy&& exec,
+//            ForwardIterator first, ForwardIterator last);
+// template <class ExecutionPolicy, class ForwardIterator, class T, class BinaryOperation>
+//   T reduce(ExecutionPolicy&& exec,
+//            ForwardIterator first, ForwardIterator last, T init,
+//            BinaryOperation binary_op);
+
+#include
+#include
+#include
+#include
+
+#include "test_execution_policies.h"
+#include "test_iterators.h"
+#include "test_macros.h"
+
+EXECUTION_POLICY_SFINAE_TEST(reduce);
+
+static_assert(!sfinae_test_reduce);
+static_assert(sfinae_test_reduce);
+
+static_assert(!sfinae_test_reduce);
+static_assert(sfinae_test_reduce);
+
+static_assert(!sfinae_test_reduce);
+static_assert(sfinae_test_reduce);
+
+class MoveOnly
+{
+  int data_;
+
+public:
+  __host__ __device__ constexpr MoveOnly(int data = 1)
+      : data_(data)
+  {}
+
+  MoveOnly(const MoveOnly&)            = delete;
+  MoveOnly& operator=(const MoveOnly&) = delete;
+
+  __host__ __device__ constexpr MoveOnly(MoveOnly&& x)
+      : data_(x.data_)
+  {
+    x.data_ = 0;
+  }
+  __host__ __device__ constexpr MoveOnly& operator=(MoveOnly&& x)
+  {
+    data_   = x.data_;
+    x.data_ = 0;
+    return *this;
+  }
+
+  __host__ __device__ constexpr int get() const
+  {
+    return data_;
+  }
+
+  __host__ __device__ friend constexpr bool operator==(const MoveOnly& x, const MoveOnly& y)
+  {
+    return x.data_ == y.data_;
+  }
+  __host__ __device__ friend constexpr bool operator!=(const MoveOnly& x, const MoveOnly& y)
+  {
+    return x.data_ != y.data_;
+  }
+  __host__ __device__ friend constexpr bool operator<(const MoveOnly& x, const MoveOnly& y)
+  {
+    return x.data_ < y.data_;
+  }
+  __host__ __device__ friend constexpr bool operator<=(const MoveOnly& x, const MoveOnly& y)
+  {
+    return x.data_ <= y.data_;
+  }
+  __host__ __device__ friend constexpr bool operator>(const MoveOnly& x, const MoveOnly& y)
+  {
+    return x.data_ > y.data_;
+  }
+  __host__ __device__ friend constexpr bool operator>=(const MoveOnly& x, const MoveOnly& y)
+  {
+    return x.data_ >= y.data_;
+  }
+
+#if TEST_STD_VER > 2017 && _LIBCUDACXX_HAS_SPACESHIP_OPERATOR()
+  __host__ __device__ friend constexpr auto operator<=>(const MoveOnly&, const MoveOnly&) = default;
+#endif // TEST_STD_VER > 2017 && _LIBCUDACXX_HAS_SPACESHIP_OPERATOR()
+
+  __host__ __device__ constexpr MoveOnly operator+(const MoveOnly& x) const
+  {
+    return MoveOnly(data_ + x.data_);
+  }
+  __host__ __device__ constexpr MoveOnly operator+(const int x) const
+  {
+    return MoveOnly(data_ + x);
+  }
+  __host__ __device__ constexpr MoveOnly operator*(const MoveOnly& x) const
+  {
+    return MoveOnly(data_ * x.data_);
+  }
+
+  __host__ __device__ constexpr operator int() const noexcept
+  {
+    return data_;
+  }
+
+  template <class T>
+  void operator,(T const&) = delete;
+};
+static_assert(cuda::std::is_convertible_v<MoveOnly, int>);
+
+constexpr int max_size = 350;
+int data[max_size];
+
+template <class Iter, class ValueT>
+struct Test
+{
+  template <class Policy>
+  void operator()(Policy&& policy)
+  {
+    const cuda::std::pair<int, int> runs[] = {{0, 34}, {1, 36}, {2, 39}, {100, 5184}, {max_size, 61809}};
+    for (const auto& pair : runs)
+    {
+      auto [size, expected] = pair;
+
+      {
+        decltype(auto) ret =
+          cuda::std::reduce(policy, Iter(data), Iter(data + size), ValueT(34), [](ValueT i, ValueT j) -> ValueT {
+            return i + j + ValueT{2};
+          });
+        static_assert(cuda::std::is_same_v<decltype(ret), ValueT>);
+        assert(ret == ValueT{expected});
+      }
+      {
+        decltype(auto) ret = cuda::std::reduce(policy, Iter(data), Iter(data + size), ValueT(34));
+        static_assert(cuda::std::is_same_v<decltype(ret), ValueT>);
+        assert(ret == ValueT{expected - 2 * size});
+      }
+      {
+        decltype(auto) ret = cuda::std::reduce(policy, Iter(data), Iter(data + size));
+        static_assert(cuda::std::is_same_v<decltype(ret), typename cuda::std::iterator_traits<Iter>::value_type>);
+        assert(ret == expected - 2 * size - 34);
+      }
+    }
+  }
+};
+
+__host__ void test()
+{
+  cuda::std::iota(data, data + max_size, 0);
+  types::for_each(types::forward_iterator_list<int*>{}, types::apply_type_identity{[](auto v) {
+                    using Iter = typename decltype(v)::type;
+                    types::for_each(
+                      types::type_list<int, MoveOnly>{},
+                      TestIteratorWithPolicies<types::partial_instantiation<Test, Iter>::template apply>{});
+                  }});
+}
+
+int main(int, char**)
+{
+  NV_IF_TARGET(NV_IS_HOST, (test();))
+
+  return 0;
+}
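As a quick sanity check of the run table: with init 34 and binary_op(i, j) = i + j + 2 folded over the values 0..n-1, the expected result is 34 + n(n-1)/2 + 2n, i.e. 36 for n = 1, 39 for n = 2, 5184 for n = 100, and 61809 for n = 350. The overloads without the custom operation drop the 2-per-element term and the explicit init, which is what the `expected - 2 * size` and `expected - 2 * size - 34` adjustments account for.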
diff --git a/libcudacxx/test/libcudacxx/std/numerics/numeric.ops/reduce/pstl_reduce.cu b/libcudacxx/test/libcudacxx/std/numerics/numeric.ops/reduce/pstl_reduce.cu
new file mode 100644
index 00000000000..27604db19b8
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/numeric.ops/reduce/pstl_reduce.cu
@@ -0,0 +1,145 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// template <class ExecutionPolicy, class ForwardIterator>
+//   typename iterator_traits<ForwardIterator>::value_type
+//     reduce(ExecutionPolicy&& exec,
+//            ForwardIterator first, ForwardIterator last);
+// template <class ExecutionPolicy, class ForwardIterator, class T, class BinaryOperation>
+//   T reduce(ExecutionPolicy&& exec,
+//            ForwardIterator first, ForwardIterator last, T init,
+//            BinaryOperation binary_op);
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "test_macros.h"
+
+inline constexpr int size = 100;
+
+C2H_TEST("cuda::std::reduce(Iter, Iter)", "[parallel algorithm]")
+{
+  SECTION("with default stream")
+  {
+    thrust::device_vector<int> data(size);
+    thrust::sequence(data.begin(), data.end(), 1);
+
+    const auto policy = cuda::execution::__cub_par_unseq;
+    decltype(auto) res = cuda::std::reduce(policy, data.begin(), data.end());
+#if !TEST_CUDA_COMPILER(NVCC, <, 12, 5)
+    static_assert(cuda::std::is_same_v<decltype(res), int>);
+#endif // !TEST_CUDA_COMPILER(NVCC, <, 12, 5)
+
+    constexpr int expected = size * (size + 1) / 2;
+    CHECK(res == expected);
+  }
+
+  SECTION("with provided stream")
+  {
+    thrust::device_vector<int> data(size);
+    thrust::sequence(data.begin(), data.end(), 1);
+
+    ::cuda::stream stream{::cuda::device_ref{0}};
+    const auto policy  = cuda::execution::__cub_par_unseq.set_stream(stream);
+    decltype(auto) res = cuda::std::reduce(policy, data.begin(), data.end());
+#if !TEST_CUDA_COMPILER(NVCC, <, 12, 5)
+    static_assert(cuda::std::is_same_v<decltype(res), int>);
+#endif // !TEST_CUDA_COMPILER(NVCC, <, 12, 5)
+
+    constexpr int expected = size * (size + 1) / 2;
+    CHECK(res == expected);
+  }
+}
+
+C2H_TEST("cuda::std::reduce(Iter, Iter, Tp)", "[parallel algorithm]")
+{
+  SECTION("with default stream")
+  {
+    thrust::device_vector<int> data(size);
+    thrust::sequence(data.begin(), data.end(), 1);
+
+    const auto policy  = cuda::execution::__cub_par_unseq;
+    decltype(auto) res = cuda::std::reduce(policy, data.begin(), data.end(), 42);
+#if !TEST_CUDA_COMPILER(NVCC, <, 12, 5)
+    static_assert(cuda::std::is_same_v<decltype(res), int>);
+#endif // !TEST_CUDA_COMPILER(NVCC, <, 12, 5)
+
+    constexpr int expected = size * (size + 1) / 2 + 42;
+    CHECK(res == expected);
+  }
+
+  SECTION("with provided stream")
+  {
+    thrust::device_vector<int> data(size);
+    thrust::sequence(data.begin(), data.end(), 1);
+
+    ::cuda::stream stream{::cuda::device_ref{0}};
+    const auto policy  = cuda::execution::__cub_par_unseq.set_stream(stream);
+    decltype(auto) res = cuda::std::reduce(policy, data.begin(), data.end(), 42);
+#if !TEST_CUDA_COMPILER(NVCC, <, 12, 5)
+    static_assert(cuda::std::is_same_v<decltype(res), int>);
+#endif // !TEST_CUDA_COMPILER(NVCC, <, 12, 5)
+
+    constexpr int expected = size * (size + 1) / 2 + 42;
+    CHECK(res == expected);
+  }
+}
+
+struct plus_two
+{
+  __host__ __device__ constexpr int operator()(const int lhs, const int rhs) const noexcept
+  {
+    return lhs + rhs + 2;
+  }
+};
+
+C2H_TEST("cuda::std::reduce(Iter, Iter, Tp, Fn)", "[parallel algorithm]")
+{
+  SECTION("with default stream")
+  {
+    thrust::device_vector<int> data(size);
+    thrust::sequence(data.begin(), data.end(), 1);
+
+    const auto policy  = cuda::execution::__cub_par_unseq;
+    decltype(auto) res = cuda::std::reduce(policy, data.begin(), data.end(), 42, plus_two{});
+#if !TEST_CUDA_COMPILER(NVCC, <, 12, 5)
+    static_assert(cuda::std::is_same_v<decltype(res), int>);
+#endif // !TEST_CUDA_COMPILER(NVCC, <, 12, 5)
+
+    constexpr int expected = size * (size + 1) / 2 + 42 + size * 2;
+    CHECK(res == expected);
+  }
+
+  SECTION("with provided stream")
+  {
+    thrust::device_vector<int> data(size);
+    thrust::sequence(data.begin(), data.end(), 1);
+
+    ::cuda::stream stream{::cuda::device_ref{0}};
+    const auto policy  = cuda::execution::__cub_par_unseq.set_stream(stream);
+    decltype(auto) res = cuda::std::reduce(policy, data.begin(), data.end(), 42, plus_two{});
+#if !TEST_CUDA_COMPILER(NVCC, <, 12, 5)
+    static_assert(cuda::std::is_same_v<decltype(res), int>);
+#endif // !TEST_CUDA_COMPILER(NVCC, <, 12, 5)
+
+    constexpr int expected = size * (size + 1) / 2 + 42 + size * 2;
+    CHECK(res == expected);
+  }
+}
diff --git a/libcudacxx/test/support/type_algorithms.h b/libcudacxx/test/support/type_algorithms.h
index 7f90973ad84..fe23e97c84d 100644
--- a/libcudacxx/test/support/type_algorithms.h
+++ b/libcudacxx/test/support/type_algorithms.h
@@ -75,10 +75,12 @@ struct apply_type_identity
 {
   Func func_;
 
+  _CCCL_EXEC_CHECK_DISABLE
   __host__ __device__ apply_type_identity(Func func)
      : func_(func)
   {}
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class T>
   __host__ __device__ decltype(auto) operator()() const
   {