diff --git a/cudax/include/cuda/experimental/__execution/policy.cuh b/cudax/include/cuda/experimental/__execution/policy.cuh
index d004dfa06a5..d7296024d8e 100644
--- a/cudax/include/cuda/experimental/__execution/policy.cuh
+++ b/cudax/include/cuda/experimental/__execution/policy.cuh
@@ -21,6 +21,8 @@
 #  pragma system_header
 #endif // no system header
 
+#include
+#include
 #include
 #include
 #include
@@ -45,8 +47,10 @@ struct any_execution_policy
   _CCCL_HIDE_FROM_ABI any_execution_policy() = default;
 
   template <uint32_t _Policy>
-  _CCCL_HOST_API constexpr any_execution_policy(::cuda::std::execution::__execution_policy_base<_Policy>) noexcept
+  _CCCL_HOST_API constexpr any_execution_policy(
+    const ::cuda::std::execution::__execution_policy_base<_Policy>& __pol) noexcept
       : value(value_type{_Policy})
+      , stream(::cuda::std::execution::__query_or(__pol, ::cuda::get_stream, ::cuda::stream_ref{cudaStreamPerThread}))
   {}
 
   _CCCL_HOST_API constexpr operator __execution_policy() const noexcept
@@ -89,7 +93,15 @@ struct any_execution_policy
   }
 #endif // _CCCL_STD_VER <= 2017
 
-  __execution_policy value = __execution_policy::__invalid_execution_policy;
+#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
+  [[nodiscard]] _CCCL_HOST_API ::cuda::stream_ref get_stream() const noexcept
+  {
+    return stream;
+  }
+#endif // _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
+
+  __execution_policy value  = __execution_policy::__invalid_execution_policy;
+  ::cuda::stream_ref stream = ::cuda::stream_ref{cudaStreamPerThread};
 };
 
 struct get_execution_policy_t;
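A usage sketch of the behavior added above (illustrative, not part of the patch; the umbrella header and the exact namespace of any_execution_policy are assumptions, while set_stream and __cub_par_unseq come from the libcu++ changes below):

  #include <cuda/experimental/execution.cuh> // assumed entry point
  #include <cassert>

  void snapshot(cuda::stream_ref stream)
  {
    // Attach a stream before type erasure ...
    auto pol = cuda::execution::__cub_par_unseq.set_stream(stream);
    cuda::experimental::execution::any_execution_policy any = pol;

    // ... and the erased policy still reports it. Without set_stream the
    // fallback is cudaStreamPerThread, matching the member initializer.
    assert(any.get_stream() == stream);
  }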
diff --git a/libcudacxx/include/cuda/__execution/policy.h b/libcudacxx/include/cuda/__execution/policy.h
index 3613a4099e9..ce09b0d8b22 100644
--- a/libcudacxx/include/cuda/__execution/policy.h
+++ b/libcudacxx/include/cuda/__execution/policy.h
@@ -23,17 +23,204 @@
 #if _CCCL_HAS_BACKEND_CUDA()
 
 #  include
+#  include
+#  include
+#  include
+#  include
+#  include
 #  include
 #  include
+#  include
 #  include
 
 _CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION
 
+template <bool _WithStream>
+struct __policy_stream_holder
+{
+  ::cuda::stream_ref __stream_;
+
+  _CCCL_HOST_API constexpr __policy_stream_holder(::cuda::stream_ref __stream) noexcept
+      : __stream_(__stream)
+  {}
+};
+
+template <>
+struct __policy_stream_holder<false>
+{
+  _CCCL_HIDE_FROM_ABI __policy_stream_holder() = default;
+
+  //! @brief Dummy constructor to simplify the implementation of the cuda policy
+  _CCCL_HOST_API constexpr __policy_stream_holder(::cuda::stream_ref) noexcept {}
+};
+
+template <bool _WithResource>
+struct __policy_memory_resource_holder
+{
+  using __resource_t = ::cuda::mr::any_resource<::cuda::mr::device_accessible>;
+
+  __resource_t __resource_;
+
+  _CCCL_TEMPLATE(class _Resource)
+  _CCCL_REQUIRES(::cuda::mr::resource_with<_Resource, ::cuda::mr::device_accessible>)
+  _CCCL_HOST_API constexpr __policy_memory_resource_holder(_Resource&& __resource) noexcept
+      : __resource_(::cuda::std::forward<_Resource>(__resource))
+  {}
+};
+
+template <>
+struct __policy_memory_resource_holder<false>
+{
+  _CCCL_HIDE_FROM_ABI __policy_memory_resource_holder() = default;
+
+  //! @brief Dummy constructor to simplify the implementation of the cuda policy
+  _CCCL_TEMPLATE(class _Resource)
+  _CCCL_REQUIRES(::cuda::mr::resource_with<_Resource, ::cuda::mr::device_accessible>)
+  _CCCL_HOST_API constexpr __policy_memory_resource_holder(_Resource&&) noexcept {}
+};
+
+template <uint32_t _Policy>
 struct _CCCL_DECLSPEC_EMPTY_BASES __execution_policy_base<_Policy, __execution_backend::__cuda>
     : __execution_policy_base<_Policy, __execution_backend::__none>
-{};
+    , protected __policy_stream_holder<__cuda_policy_with_stream<_Policy>>
+    , protected __policy_memory_resource_holder<__cuda_policy_with_memory_resource<_Policy>>
+{
+private:
+  template <uint32_t, __execution_backend>
+  friend struct __execution_policy_base;
+
+  using __stream_holder   = __policy_stream_holder<__cuda_policy_with_stream<_Policy>>;
+  using __resource_holder = __policy_memory_resource_holder<__cuda_policy_with_memory_resource<_Policy>>;
+
+  template <uint32_t _OtherPolicy>
+  _CCCL_HOST_API constexpr __execution_policy_base(
+    const __execution_policy_base<_OtherPolicy, __execution_backend::__cuda>& __policy) noexcept
+      : __stream_holder(__policy.query(::cuda::get_stream))
+      , __resource_holder(__policy.query(::cuda::mr::get_memory_resource))
+  {}
+
+  template <uint32_t _OtherPolicy>
+  _CCCL_HOST_API constexpr __execution_policy_base(
+    const __execution_policy_base<_OtherPolicy, __execution_backend::__cuda>& __policy,
+    ::cuda::stream_ref __stream) noexcept
+      : __stream_holder(__stream)
+      , __resource_holder(__policy.query(::cuda::mr::get_memory_resource))
+  {}
+
+  template <uint32_t _OtherPolicy, class _Resource>
+  _CCCL_HOST_API constexpr __execution_policy_base(
+    const __execution_policy_base<_OtherPolicy, __execution_backend::__cuda>& __policy, _Resource&& __resource) noexcept
+      : __stream_holder(__policy.query(::cuda::get_stream))
+      , __resource_holder(::cuda::std::forward<_Resource>(__resource))
+  {}
+
+public:
+  _CCCL_HIDE_FROM_ABI constexpr __execution_policy_base() noexcept = default;
+
+  //! @brief Convert to a policy that holds a stream
+  //! @note This cannot be merged with the overload that already holds a stream, because this one must be
+  //! const-qualified
+  _CCCL_TEMPLATE(bool _WithStream = __cuda_policy_with_stream<_Policy>)
+  _CCCL_REQUIRES((!_WithStream))
+  [[nodiscard]] _CCCL_HOST_API auto set_stream(::cuda::stream_ref __stream) const noexcept
+  {
+    constexpr uint32_t __new_policy = __set_cuda_backend_option<_Policy, __cuda_backend_options::__with_stream>;
+    return __execution_policy_base<__new_policy>{*this, __stream};
+  }
+
+  //! @brief Set the current stream
+  _CCCL_TEMPLATE(bool _WithStream = __cuda_policy_with_stream<_Policy>)
+  _CCCL_REQUIRES(_WithStream)
+  [[nodiscard]] _CCCL_HOST_API __execution_policy_base& set_stream(::cuda::stream_ref __stream) noexcept
+  {
+    this->__stream_ = __stream;
+    return *this;
+  }
+
+  //! @brief Return the stream stored in the holder, or the default stream
+  [[nodiscard]] _CCCL_HOST_API ::cuda::stream_ref query(const ::cuda::get_stream_t&) const noexcept
+  {
+    if constexpr (__cuda_policy_with_stream<_Policy>)
+    {
+      return this->__stream_;
+    }
+    else
+    {
+      return ::cuda::stream_ref{cudaStreamPerThread};
+    }
+  }
+
+  //! @brief Set the current memory resource
+  _CCCL_TEMPLATE(class _Resource, bool _WithResource = __cuda_policy_with_memory_resource<_Policy>)
+  _CCCL_REQUIRES(::cuda::mr::resource_with<_Resource, ::cuda::mr::device_accessible> _CCCL_AND _WithResource)
+  [[nodiscard]] _CCCL_HOST_API __execution_policy_base& set_memory_resource(_Resource&& __resource) noexcept
+  {
+    this->__resource_ = ::cuda::std::forward<_Resource>(__resource);
+    return *this;
+  }
+
+  //! @brief Convert to a policy that holds a memory resource
+  _CCCL_TEMPLATE(class _Resource, bool _WithResource = __cuda_policy_with_memory_resource<_Policy>)
+  _CCCL_REQUIRES(::cuda::mr::resource_with<_Resource, ::cuda::mr::device_accessible> _CCCL_AND(!_WithResource))
+  [[nodiscard]] _CCCL_HOST_API auto set_memory_resource(_Resource&& __resource) const noexcept
+  {
+    constexpr uint32_t __new_policy =
+      __set_cuda_backend_option<_Policy, __cuda_backend_options::__with_memory_resource>;
+    return __execution_policy_base<__new_policy>{*this, ::cuda::std::forward<_Resource>(__resource)};
+  }
+
+  //! @brief Return either the stored or a default memory resource
+  //! @note We cannot put this into __policy_memory_resource_holder because we need a stream to determine the device
+  [[nodiscard]] _CCCL_HOST_API auto query(const ::cuda::mr::get_memory_resource_t&) const noexcept
+  {
+    if constexpr (__cuda_policy_with_memory_resource<_Policy>)
+    {
+      return this->__resource_;
+    }
+    else
+    {
+      ::cuda::stream_ref __stream = this->query(::cuda::get_stream);
+      return ::cuda::device_default_memory_pool(__stream.device());
+    }
+  }
+
+  template <uint32_t _OtherPolicy, __execution_backend _OtherBackend>
+  [[nodiscard]] _CCCL_API friend constexpr bool operator==(
+    const __execution_policy_base& __lhs, const __execution_policy_base<_OtherPolicy, _OtherBackend>& __rhs) noexcept
+  {
+    if constexpr (_Policy != _OtherPolicy)
+    {
+      return false;
+    }
+
+    if constexpr (__cuda_policy_with_stream<_Policy>)
+    {
+      if (__lhs.query(::cuda::get_stream) != __rhs.query(::cuda::get_stream))
+      {
+        return false;
+      }
+    }
+
+    if constexpr (__cuda_policy_with_memory_resource<_Policy>)
+    {
+      if (__lhs.query(::cuda::mr::get_memory_resource) != __rhs.query(::cuda::mr::get_memory_resource))
+      {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+#  if _CCCL_STD_VER <= 2017
+  template <uint32_t _OtherPolicy, __execution_backend _OtherBackend>
+  [[nodiscard]] _CCCL_API friend constexpr bool operator!=(
+    const __execution_policy_base& __lhs, const __execution_policy_base<_OtherPolicy, _OtherBackend>& __rhs) noexcept
+  {
+    return !(__lhs == __rhs);
+  }
+#  endif // _CCCL_STD_VER <= 2017
+};
 
 _CCCL_END_NAMESPACE_CUDA_STD_EXECUTION
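The two overload pairs are meant to be used like this (a sketch; headers omitted as in the includes above, and the assert usage is illustrative):

  void configure(cuda::stream_ref stream)
  {
    // Without a stored stream, set_stream is const and returns a *new* policy
    // type with the __with_stream bit set.
    auto with_stream = cuda::execution::__cub_par_unseq.set_stream(stream);

    // With a stored stream, set_stream mutates in place and returns *this.
    auto& same = with_stream.set_stream(stream);
    assert(&same == &with_stream);

    // Queries fall back to well defined defaults when nothing was attached.
    assert(with_stream.query(cuda::get_stream) == stream);
    assert(cuda::execution::__cub_par_unseq.query(cuda::get_stream)
           == cuda::stream_ref{cudaStreamPerThread});
  }

set_memory_resource follows the same convert-then-mutate pattern, with the default being the device's default memory pool.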
diff --git a/libcudacxx/include/cuda/__fwd/execution_policy.h b/libcudacxx/include/cuda/__fwd/execution_policy.h
index d0928b10291..749ad640def 100644
--- a/libcudacxx/include/cuda/__fwd/execution_policy.h
+++ b/libcudacxx/include/cuda/__fwd/execution_policy.h
@@ -28,6 +28,12 @@
 
 _CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION
 
+enum __cuda_backend_options : uint16_t
+{
+  __with_stream          = 1 << 0, ///< Determines whether the policy holds a stream
+  __with_memory_resource = 1 << 1, ///< Determines whether the policy holds a memory resource
+};
+
 //! @brief Sets the execution backend to cuda
 template <uint32_t _Policy>
 [[nodiscard]] _CCCL_API constexpr uint32_t __with_cuda_backend() noexcept
@@ -38,6 +44,26 @@ template <uint32_t _Policy>
   return __new_policy;
 }
 
+//! @brief Backend specific options of the CUDA backend
+template <uint32_t _Policy>
+inline constexpr __cuda_backend_options __policy_to_cuda_backend_options =
+  static_cast<__cuda_backend_options>((_Policy & uint32_t{0xFFFF0000}) >> 16);
+
+//! @brief Sets a backend specific option
+template <uint32_t _Policy, __cuda_backend_options __option>
+inline constexpr uint32_t __set_cuda_backend_option =
+  _Policy | static_cast<uint32_t>(static_cast<uint32_t>(__option) << 16);
+
+//! @brief Detects whether a given policy holds a user provided stream
+template <uint32_t _Policy>
+inline constexpr bool __cuda_policy_with_stream =
+  __policy_to_cuda_backend_options<_Policy> & __cuda_backend_options::__with_stream;
+
+//! @brief Detects whether a given policy holds a user provided memory resource
+template <uint32_t _Policy>
+inline constexpr bool __cuda_policy_with_memory_resource =
+  __policy_to_cuda_backend_options<_Policy> & __cuda_backend_options::__with_memory_resource;
+
 _CCCL_END_NAMESPACE_CUDA_STD_EXECUTION
 
 #  include
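The helpers above are plain bit packing: the low 16 bits of the policy value identify the policy and backend, the high 16 bits carry per-backend option flags. A standalone model with invented names (verified logic, not the library's actual code):

  #include <cstdint>

  enum backend_options : uint16_t
  {
    with_stream          = 1 << 0,
    with_memory_resource = 1 << 1,
  };

  // mirrors __set_cuda_backend_option
  constexpr uint32_t set_option(uint32_t policy, backend_options option)
  {
    return policy | (static_cast<uint32_t>(option) << 16);
  }

  // mirrors __policy_to_cuda_backend_options
  constexpr backend_options options_of(uint32_t policy)
  {
    return static_cast<backend_options>((policy & 0xFFFF0000u) >> 16);
  }

  static_assert(options_of(set_option(0u, with_stream)) & with_stream);
  static_assert(!(options_of(set_option(0u, with_stream)) & with_memory_resource));
  static_assert(options_of(set_option(set_option(0u, with_stream), with_memory_resource))
                == (with_stream | with_memory_resource));

Setting a bit that is already set is a no-op, which is what lets the in-place set_stream/set_memory_resource overloads keep the policy type unchanged.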
diff --git a/libcudacxx/include/cuda/std/__numeric/reduce.h b/libcudacxx/include/cuda/std/__numeric/reduce.h
index b032d4d779b..0db77794403 100644
--- a/libcudacxx/include/cuda/std/__numeric/reduce.h
+++ b/libcudacxx/include/cuda/std/__numeric/reduce.h
@@ -30,6 +30,7 @@
 
 _CCCL_BEGIN_NAMESPACE_CUDA_STD
 
+_CCCL_EXEC_CHECK_DISABLE
 template <class _InputIterator, class _Tp, class _BinaryOp>
 [[nodiscard]] _CCCL_API constexpr _Tp reduce(_InputIterator __first, _InputIterator __last, _Tp __init, _BinaryOp __b)
 {
diff --git a/libcudacxx/include/cuda/std/__pstl/cuda/for_each_n.h b/libcudacxx/include/cuda/std/__pstl/cuda/for_each_n.h
index 78a95b5e221..9007bcbb00f 100644
--- a/libcudacxx/include/cuda/std/__pstl/cuda/for_each_n.h
+++ b/libcudacxx/include/cuda/std/__pstl/cuda/for_each_n.h
@@ -27,10 +27,12 @@
 
 #  include
 #  include
+#  include
 #  include
 #  include
 #  include
 #  include
+#  include
 #  include
 #  include
 #  include
@@ -54,7 +56,8 @@ struct __pstl_dispatch<__pstl_algorithm::__for_each_n, __execution_backend::__cu
   __par_impl([[maybe_unused]] _Policy __policy, _Iter __first, _Size __orig_n, _Fn __func) noexcept
   {
     const auto __count = ::cuda::std::__convert_to_integral(__orig_n);
-    ::cuda::stream_ref __stream{cudaStreamPerThread};
+
+    auto __stream = __policy.query(::cuda::get_stream);
 
     _CCCL_TRY_CUDA_API(
       ::cub::DeviceFor::ForEachN,
diff --git a/libcudacxx/include/cuda/std/__pstl/cuda/reduce.h b/libcudacxx/include/cuda/std/__pstl/cuda/reduce.h
new file mode 100644
index 00000000000..482ded14c83
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__pstl/cuda/reduce.h
@@ -0,0 +1,184 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___PSTL_CUDA_REDUCE_H
+#define _CUDA_STD___PSTL_CUDA_REDUCE_H
+
+#include
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_HAS_BACKEND_CUDA()
+
+_CCCL_DIAG_PUSH
+_CCCL_DIAG_SUPPRESS_CLANG("-Wshadow")
+
+#  include
+
+_CCCL_DIAG_POP
+
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+
+#  include
+
+#  include
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION
+
+_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT
+
+template <>
+struct __pstl_dispatch<__pstl_algorithm::__reduce, __execution_backend::__cuda>
+{
+  //! Ensures we properly deallocate the memory allocated for the result
+  template <class _Tp, class _Resource>
+  struct __allocation_guard
+  {
+    //! This helper struct ensures that we can properly assign types with a nontrivial assignment operator
+    struct __construct_result
+    {
+      _Tp* __ptr_;
+
+      _CCCL_HOST_API __construct_result(_Tp* __ptr = nullptr) noexcept
+          : __ptr_(__ptr)
+      {}
+
+      template <class _Index, class _Up>
+      _CCCL_DEVICE_API void operator()(_Index, _Up&& __value)
+      {
+        ::cuda::std::__construct_at(__ptr_, ::cuda::std::forward<_Up>(__value));
+      }
+    };
+
+    ::cuda::stream_ref __stream_;
+    _Resource& __resource_;
+    _Tp* __ptr_;
+
+    _CCCL_HOST_API __allocation_guard(::cuda::stream_ref __stream, _Resource& __resource)
+        : __stream_(__stream)
+        , __resource_(__resource)
+        , __ptr_(static_cast<_Tp*>(__resource_.allocate(__stream_, sizeof(_Tp), alignof(_Tp))))
+    {}
+
+    _CCCL_HOST_API ~__allocation_guard()
+    {
+      __resource_.deallocate(__stream_, __ptr_, sizeof(_Tp), alignof(_Tp));
+      __stream_.sync();
+    }
+
+    [[nodiscard]] _CCCL_HOST_API auto __get_result_iter()
+    {
+      return ::cuda::tabulate_output_iterator{__construct_result{__ptr_}};
+    }
+  };
+
+  template <class _Policy, class _Iter, class _Tp, class _BinaryOp>
+  [[nodiscard]] _CCCL_HOST_API static _Tp
+  __par_impl(_Policy __policy, _Iter __first, _Iter __last, _Tp __init, _BinaryOp __func)
+  {
+    _Tp __ret;
+
+    {
+      // Allocate memory for result
+      auto __stream   = __policy.query(::cuda::get_stream);
+      auto __resource = __policy.query(::cuda::mr::get_memory_resource);
+      __allocation_guard<_Tp, decltype(__resource)> __guard{__stream, __resource};
+
+      const auto __count = ::cuda::std::distance(__first, __last);
+      _CCCL_TRY_CUDA_API(
+        ::cub::DeviceReduce::Reduce,
+        "__pstl_cuda_reduce: cub::DeviceReduce::Reduce failed",
+        ::cuda::std::move(__first),
+        __guard.__get_result_iter(),
+        __count,
+        ::cuda::std::move(__func),
+        ::cuda::std::move(__init),
+        ::cuda::std::move(__policy));
+
+      _CCCL_TRY_CUDA_API(
+        ::cudaMemcpyAsync,
+        "__pstl_cuda_reduce: copy of result from device to host failed",
+        ::cuda::std::addressof(__ret),
+        __guard.__ptr_,
+        sizeof(_Tp),
+        ::cudaMemcpyDeviceToHost,
+        __stream.get());
+    }
+
+    return __ret;
+  }
+
+  template <class _Policy, class _Iter, class _Tp, class _BinaryOp>
+  [[nodiscard]] _CCCL_HOST_API _Tp
+  operator()([[maybe_unused]] _Policy __policy, _Iter __first, _Iter __last, _Tp __init, _BinaryOp __func) const
+  {
+    if constexpr (::cuda::std::__has_random_access_traversal<_Iter>)
+    {
+      try
+      {
+        return __par_impl(
+          ::cuda::std::move(__policy),
+          ::cuda::std::move(__first),
+          ::cuda::std::move(__last),
+          ::cuda::std::move(__init),
+          ::cuda::std::move(__func));
+      }
+      catch (const ::cuda::cuda_error& __err)
+      {
+        if (__err.status() == cudaErrorMemoryAllocation)
+        {
+          ::cuda::std::__throw_bad_alloc();
+        }
+        else
+        {
+          throw __err;
+        }
+      }
+    }
+    else
+    {
+      static_assert(__always_false_v<_Policy>,
+                    "__pstl_dispatch: CUDA backend of cuda::std::reduce requires at least random access iterators");
+      return ::cuda::std::reduce(
+        ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__init), ::cuda::std::move(__func));
+    }
+  }
+};
+
+_CCCL_END_NAMESPACE_ARCH_DEPENDENT
+
+_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION
+
+#  include
+
+#endif // _CCCL_HAS_BACKEND_CUDA()
+
+#endif // _CUDA_STD___PSTL_CUDA_REDUCE_H
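The guard above is the usual stream-ordered RAII shape. A plain CUDA-runtime sketch of the same pattern (illustrative; the real code allocates through the policy's memory resource and placement-constructs the result through a tabulate_output_iterator so that types with non-trivial assignment are handled correctly):

  #include <cuda_runtime.h>

  struct result_guard
  {
    cudaStream_t stream_;
    void* ptr_ = nullptr;

    result_guard(cudaStream_t stream, size_t bytes)
        : stream_(stream)
    {
      cudaMallocAsync(&ptr_, bytes, stream_); // stream-ordered allocation
    }

    ~result_guard()
    {
      cudaFreeAsync(ptr_, stream_); // stream-ordered deallocation ...
      cudaStreamSynchronize(stream_); // ... then sync, so host-side reads of the result are safe
    }
  };

Because the destructor synchronizes after the enqueued cudaMemcpyAsync, __ret is fully written by the time __par_impl returns.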
diff --git a/libcudacxx/include/cuda/std/__pstl/reduce.h b/libcudacxx/include/cuda/std/__pstl/reduce.h
new file mode 100644
index 00000000000..467e47108d5
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__pstl/reduce.h
@@ -0,0 +1,115 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___PSTL_REDUCE_H
+#define _CUDA_STD___PSTL_REDUCE_H
+
+#include
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if !_CCCL_COMPILER(NVRTC)
+
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+
+#  if _CCCL_HAS_BACKEND_CUDA()
+#    include
+#  endif // _CCCL_HAS_BACKEND_CUDA()
+
+#  include
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD
+
+template <class _Iter, class _Tp, class _BinaryOp>
+_CCCL_CONCEPT __indirect_binary_function = _CCCL_REQUIRES_EXPR((_Iter, _Tp, _BinaryOp))(
+  requires(is_convertible_v<invoke_result_t<_BinaryOp&, _Tp, _Tp>, _Tp>),
+  requires(is_convertible_v<invoke_result_t<_BinaryOp&, _Tp, iter_reference_t<_Iter>>, _Tp>),
+  requires(is_convertible_v<invoke_result_t<_BinaryOp&, iter_reference_t<_Iter>, _Tp>, _Tp>),
+  requires(is_convertible_v<invoke_result_t<_BinaryOp&, iter_reference_t<_Iter>, iter_reference_t<_Iter>>, _Tp>));
+
+_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT
+
+_CCCL_TEMPLATE(class _Policy, class _Iter, class _Tp, class _BinaryOp)
+_CCCL_REQUIRES(__has_forward_traversal<_Iter> _CCCL_AND is_execution_policy_v<_Policy>)
+[[nodiscard]] _CCCL_HOST_API _Tp
+reduce([[maybe_unused]] _Policy __policy, _Iter __first, _Iter __last, _Tp __init, _BinaryOp __func)
+{
+  static_assert(__indirect_binary_function<_Iter, _Tp, _BinaryOp>,
+                "cuda::std::reduce: The return value of BinaryOp is not convertible to T.");
+  static_assert(is_move_constructible_v<_Tp>, "cuda::std::reduce: T must be move constructible.");
+  [[maybe_unused]] auto __dispatch =
+    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__reduce, _Policy>();
+  if constexpr (::cuda::std::execution::__pstl_can_dispatch)
+  {
+    return __dispatch(
+      __policy,
+      ::cuda::std::move(__first),
+      ::cuda::std::move(__last),
+      ::cuda::std::move(__init),
+      ::cuda::std::move(__func));
+  }
+  else
+  {
+    static_assert(__always_false_v<_Policy>, "Parallel cuda::std::reduce requires at least one selected backend");
+    return ::cuda::std::reduce(
+      ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__init), ::cuda::std::move(__func));
+  }
+}
+
+_CCCL_TEMPLATE(class _Policy, class _Iter, class _Tp)
+_CCCL_REQUIRES(__has_forward_traversal<_Iter> _CCCL_AND is_execution_policy_v<_Policy>)
+[[nodiscard]] _CCCL_HOST_API _Tp reduce(_Policy __policy, _Iter __first, _Iter __last, _Tp __init)
+{
+  return ::cuda::std::reduce(
+    ::cuda::std::move(__policy),
+    ::cuda::std::move(__first),
+    ::cuda::std::move(__last),
+    ::cuda::std::move(__init),
+    ::cuda::std::plus<>{});
+}
+
+_CCCL_TEMPLATE(class _Policy, class _Iter)
+_CCCL_REQUIRES(__has_forward_traversal<_Iter> _CCCL_AND is_execution_policy_v<_Policy>)
+[[nodiscard]] _CCCL_HOST_API iter_value_t<_Iter> reduce(_Policy __policy, _Iter __first, _Iter __last)
+{
+  return ::cuda::std::reduce(
+    ::cuda::std::move(__policy),
+    ::cuda::std::move(__first),
+    ::cuda::std::move(__last),
+    iter_value_t<_Iter>{},
+    ::cuda::std::plus<>{});
+}
+
+_CCCL_END_NAMESPACE_ARCH_DEPENDENT
+
+_CCCL_END_NAMESPACE_CUDA_STD
+
+#  include
+
+#endif // !_CCCL_COMPILER(NVRTC)
+
+#endif // _CUDA_STD___PSTL_REDUCE_H
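Front-end usage of the three overloads, sketched under the assumption that first/last point into device-accessible memory (headers omitted as above):

  void sums(cuda::stream_ref stream, int* first, int* last)
  {
    const auto policy = cuda::execution::__cub_par_unseq.set_stream(stream);

    int a = cuda::std::reduce(policy, first, last);     // init = int{}, op = plus<>
    int b = cuda::std::reduce(policy, first, last, 42); // op = plus<>
    int c = cuda::std::reduce(policy, first, last, 42, cuda::std::plus<>{});
    (void) a;
    (void) b;
    (void) c;
  }

The two shorter overloads forward to the four-argument one, so the __indirect_binary_function and move-constructibility checks apply to all of them, and the call synchronizes on the policy's stream before returning the value.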
diff --git a/libcudacxx/test/libcudacxx/CMakeLists.txt b/libcudacxx/test/libcudacxx/CMakeLists.txt
index 790f3648995..50b6c1c3b83 100644
--- a/libcudacxx/test/libcudacxx/CMakeLists.txt
+++ b/libcudacxx/test/libcudacxx/CMakeLists.txt
@@ -22,7 +22,9 @@ function(libcudacxx_add_test target_name_var source)
   cccl_configure_target(${target_name} DIALECT ${CMAKE_CUDA_STANDARD})
   target_include_directories(
     ${target_name}
-    PRIVATE "${libcudacxx_SOURCE_DIR}/test/libcudacxx/cuda/ccclrt/common"
+    PRIVATE
+      "${libcudacxx_SOURCE_DIR}/test/libcudacxx/cuda/ccclrt/common"
+      "${libcudacxx_SOURCE_DIR}/test/support"
   )
   target_link_libraries(
     ${target_name}
diff --git a/libcudacxx/test/libcudacxx/cuda/execution/execution_policy/get_memory_resource.pass.cpp b/libcudacxx/test/libcudacxx/cuda/execution/execution_policy/get_memory_resource.pass.cpp
new file mode 100644
index 00000000000..b646f4e2a43
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/execution/execution_policy/get_memory_resource.pass.cpp
@@ -0,0 +1,115 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+struct test_resource
+{
+  __host__ __device__ void* allocate_sync(std::size_t, std::size_t)
+  {
+    return nullptr;
+  }
+
+  __host__ __device__ void deallocate_sync(void* ptr, std::size_t, std::size_t) noexcept
+  {
+    // ensure that we did get the right inputs forwarded
+    _val = *static_cast<int*>(ptr);
+  }
+
+  __host__ __device__ void* allocate(cuda::stream_ref, std::size_t, std::size_t)
+  {
+    return &_val;
+  }
+
+  __host__ __device__ void deallocate(cuda::stream_ref, void* ptr, std::size_t, std::size_t)
+  {
+    // ensure that we did get the right inputs forwarded
+    _val = *static_cast<int*>(ptr);
+  }
+
+  __host__ __device__ bool operator==(const test_resource& other) const
+  {
+    return _val == other._val;
+  }
+  __host__ __device__ bool operator!=(const test_resource& other) const
+  {
+    return _val != other._val;
+  }
+
+  friend constexpr void get_property(const test_resource&, ::cuda::mr::device_accessible) noexcept {}
+
+  int _val = 0;
+};
+
+template <class Policy>
+void test(Policy pol)
+{
+  auto old_stream = ::cuda::get_stream(pol);
+  { // Ensure that the plain policy returns a well defined memory resource
+    auto expected_resource = ::cuda::device_default_memory_pool(cuda::device_ref{0});
+    assert(cuda::mr::get_memory_resource(pol) == expected_resource);
+  }
+
+  { // Ensure that we can attach a memory resource to an execution policy
+    test_resource resource{42};
+    auto pol_with_resource = pol.set_memory_resource(resource);
+    assert(cuda::mr::get_memory_resource(pol_with_resource) == resource);
+    assert(cuda::get_stream(pol_with_resource) == old_stream);
+
+    using policy_t = decltype(pol_with_resource);
+    static_assert(noexcept(pol.set_memory_resource(resource)));
+    static_assert(cuda::std::is_execution_policy_v<policy_t>);
+  }
+
+  { // Ensure that attaching a memory resource multiple times just overwrites the old one
+    test_resource resource{42};
+    auto pol_with_resource = pol.set_memory_resource(resource);
+    assert(cuda::mr::get_memory_resource(pol_with_resource) == resource);
+    assert(cuda::get_stream(pol_with_resource) == old_stream);
+
+    using policy_t = decltype(pol_with_resource);
+    test_resource other_resource{1337};
+    decltype(auto) pol_with_other_resource = pol_with_resource.set_memory_resource(other_resource);
+    static_assert(cuda::std::is_same_v<decltype(pol_with_other_resource), policy_t&>);
+    assert(::cuda::mr::get_memory_resource(pol_with_resource) == other_resource);
+    assert(::cuda::mr::get_memory_resource(pol_with_other_resource) == other_resource);
+    assert(cuda::std::addressof(pol_with_resource) == cuda::std::addressof(pol_with_other_resource));
+    assert(cuda::get_stream(pol_with_resource) == old_stream);
+  }
+}
+
+void test()
+{
+  namespace execution = cuda::std::execution;
+  static_assert(!execution::__queryable_with);
+  static_assert(!execution::__queryable_with);
+  static_assert(!execution::__queryable_with);
+  static_assert(!execution::__queryable_with);
+
+  test(cuda::execution::__cub_par_unseq);
+
+  // Ensure that all works even if we have a stream attached
+  test(cuda::execution::__cub_par_unseq.set_stream(::cuda::stream{cuda::device_ref{0}}));
+}
+
+int main(int, char**)
+{
+  NV_IF_TARGET(NV_IS_HOST, (test();))
+
+  return 0;
+}
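The pattern the test exercises, with a stock resource instead of the test double (a sketch):

  void use_pool()
  {
    // Any resource satisfying cuda::mr::resource_with<R, cuda::mr::device_accessible>
    // can be attached; the default device pool is the simplest example.
    auto pool   = cuda::device_default_memory_pool(cuda::device_ref{0});
    auto policy = cuda::execution::__cub_par_unseq.set_memory_resource(pool);

    // Temporary allocations made by algorithms run with `policy` (for example the
    // result slot in the reduce backend above) now come from `pool`.
    (void) policy;
  }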
diff --git a/libcudacxx/test/libcudacxx/cuda/execution/execution_policy/get_stream.pass.cpp b/libcudacxx/test/libcudacxx/cuda/execution/execution_policy/get_stream.pass.cpp
new file mode 100644
index 00000000000..bb8eae08be6
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/execution/execution_policy/get_stream.pass.cpp
@@ -0,0 +1,74 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+template <class Policy>
+void test(Policy pol)
+{
+  { // Ensure that the plain policy returns a well defined stream
+    cuda::stream_ref expected_stream{cudaStreamPerThread};
+    assert(cuda::get_stream(pol) == expected_stream);
+  }
+
+  { // Ensure that we can attach a stream to an execution policy
+    cuda::stream stream{cuda::device_ref{0}};
+    auto pol_with_stream = pol.set_stream(stream);
+    assert(cuda::get_stream(pol_with_stream) == stream);
+
+    using stream_policy_t = decltype(pol_with_stream);
+    static_assert(noexcept(pol.set_stream(stream)));
+    static_assert(cuda::std::is_execution_policy_v<stream_policy_t>);
+  }
+
+  { // Ensure that attaching a stream multiple times just overwrites the old stream
+    cuda::stream stream{cuda::device_ref{0}};
+    auto pol_with_stream = pol.set_stream(stream);
+    assert(cuda::get_stream(pol_with_stream) == stream);
+
+    using stream_policy_t = decltype(pol_with_stream);
+    cuda::stream other_stream{cuda::device_ref{0}};
+    decltype(auto) pol_with_other_stream = pol_with_stream.set_stream(other_stream);
+    static_assert(cuda::std::is_same_v<decltype(pol_with_other_stream), stream_policy_t&>);
+    assert(::cuda::get_stream(pol_with_stream) == other_stream);
+    assert(::cuda::get_stream(pol_with_other_stream) == other_stream);
+    assert(cuda::std::addressof(pol_with_stream) == cuda::std::addressof(pol_with_other_stream));
+  }
+}
+
+void test()
+{
+  namespace execution = cuda::std::execution;
+  static_assert(!execution::__queryable_with);
+  static_assert(!execution::__queryable_with);
+  static_assert(!execution::__queryable_with);
+  static_assert(!execution::__queryable_with);
+
+  test(cuda::execution::__cub_par_unseq);
+
+  // Ensure that all works even if we have a memory resource
+  test(cuda::execution::__cub_par_unseq.set_memory_resource(::cuda::device_default_memory_pool(::cuda::device_ref{0})));
+}
+
+int main(int, char**)
+{
+  NV_IF_TARGET(NV_IS_HOST, (test();))
+
+  return 0;
+}
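One lifetime caveat follows from the implementation: the policy stores a non-owning cuda::stream_ref, so the stream object must outlive every algorithm call that uses the policy. Sketch:

  void lifetime()
  {
    cuda::stream stream{cuda::device_ref{0}};
    const auto policy = cuda::execution::__cub_par_unseq.set_stream(stream);
    // ... run algorithms with `policy` while `stream` is alive ...
  } // stream and policy die together; keeping `policy` longer would leave a dangling stream_ref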
diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl_for_each.cu b/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl_for_each.cu
index 286324f2ec7..561cf5c42f0 100644
--- a/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl_for_each.cu
+++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl_for_each.cu
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -40,10 +41,24 @@ struct mark_present_for_each
 
 C2H_TEST("cuda::std::for_each", "[parallel algorithm]")
 {
-  thrust::device_vector<bool> res(size, false);
-  mark_present_for_each fn{thrust::raw_pointer_cast(res.data())};
+  SECTION("with default stream")
+  {
+    thrust::device_vector<bool> res(size, false);
+    mark_present_for_each fn{thrust::raw_pointer_cast(res.data())};
+
+    const auto policy = cuda::execution::__cub_par_unseq;
+    cuda::std::for_each(policy, cuda::counting_iterator{0}, cuda::counting_iterator{size}, fn);
+    CHECK(thrust::all_of(res.begin(), res.end(), cuda::std::identity{}));
+  }
 
-  const auto policy = cuda::execution::__cub_par_unseq;
-  cuda::std::for_each(policy, cuda::counting_iterator{0}, cuda::counting_iterator{size}, fn);
-  CHECK(thrust::all_of(res.begin(), res.end(), cuda::std::identity{}));
+  SECTION("with unique stream")
+  {
+    ::cuda::stream stream{::cuda::device_ref{0}};
+    thrust::device_vector<bool> res(size, false);
+    mark_present_for_each fn{thrust::raw_pointer_cast(res.data())};
+
+    const auto policy = cuda::execution::__cub_par_unseq.set_stream(stream);
+    cuda::std::for_each(policy, cuda::counting_iterator{0}, cuda::counting_iterator{size}, fn);
+    CHECK(thrust::all_of(res.begin(), res.end(), cuda::std::identity{}));
+  }
 }
diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl_for_each_n.cu b/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl_for_each_n.cu
index 60f17d59eaf..5346f210b1f 100644
--- a/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl_for_each_n.cu
+++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl_for_each_n.cu
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -40,10 +41,24 @@ struct mark_present_for_each
 
 C2H_TEST("cuda::std::for_each_n", "[parallel algorithm]")
 {
-  thrust::device_vector<bool> res(size, false);
-  mark_present_for_each fn{thrust::raw_pointer_cast(res.data())};
+  SECTION("with default stream")
+  {
+    thrust::device_vector<bool> res(size, false);
+    mark_present_for_each fn{thrust::raw_pointer_cast(res.data())};
+
+    const auto policy = cuda::execution::__cub_par_unseq;
+    cuda::std::for_each_n(policy, cuda::counting_iterator{0}, size, fn);
+    CHECK(thrust::all_of(res.begin(), res.end(), cuda::std::identity{}));
+  }
 
-  const auto policy = cuda::execution::__cub_par_unseq;
-  cuda::std::for_each_n(policy, cuda::counting_iterator{0}, size, fn);
-  CHECK(thrust::all_of(res.begin(), res.end(), cuda::std::identity{}));
+  SECTION("with unique stream")
+  {
+    ::cuda::stream stream{::cuda::device_ref{0}};
+    thrust::device_vector<bool> res(size, false);
+    mark_present_for_each fn{thrust::raw_pointer_cast(res.data())};
+
+    const auto policy = cuda::execution::__cub_par_unseq.set_stream(stream);
+    cuda::std::for_each_n(policy, cuda::counting_iterator{0}, size, fn);
+    CHECK(thrust::all_of(res.begin(), res.end(), cuda::std::identity{}));
+  }
 }
diff --git a/libcudacxx/test/libcudacxx/std/numerics/numeric.ops/reduce/pstl.reduce.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/numeric.ops/reduce/pstl.reduce.pass.cpp
new file mode 100644
index 00000000000..5859d0e94f1
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/numeric.ops/reduce/pstl.reduce.pass.cpp
@@ -0,0 +1,175 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+// XFAIL: true
+
+// template <class ExecutionPolicy, class ForwardIterator>
+//   typename iterator_traits<ForwardIterator>::value_type
+//     reduce(ExecutionPolicy&& exec,
+//            ForwardIterator first, ForwardIterator last);
+// template <class ExecutionPolicy, class ForwardIterator, class T, class BinaryOperation>
+//   T reduce(ExecutionPolicy&& exec,
+//            ForwardIterator first, ForwardIterator last, T init,
+//            BinaryOperation binary_op);
+
+#include
+#include
+#include
+#include
+
+#include "test_execution_policies.h"
+#include "test_iterators.h"
+#include "test_macros.h"
+
+EXECUTION_POLICY_SFINAE_TEST(reduce);
+
+static_assert(!sfinae_test_reduce);
+static_assert(sfinae_test_reduce);
+
+static_assert(!sfinae_test_reduce);
+static_assert(sfinae_test_reduce);
+
+static_assert(!sfinae_test_reduce);
+static_assert(sfinae_test_reduce);
+
+class MoveOnly
+{
+  int data_;
+
+public:
+  __host__ __device__ constexpr MoveOnly(int data = 1)
+      : data_(data)
+  {}
+
+  MoveOnly(const MoveOnly&)            = delete;
+  MoveOnly& operator=(const MoveOnly&) = delete;
+
+  __host__ __device__ constexpr MoveOnly(MoveOnly&& x)
+      : data_(x.data_)
+  {
+    x.data_ = 0;
+  }
+  __host__ __device__ constexpr MoveOnly& operator=(MoveOnly&& x)
+  {
+    data_   = x.data_;
+    x.data_ = 0;
+    return *this;
+  }
+
+  __host__ __device__ constexpr int get() const
+  {
+    return data_;
+  }
+
+  __host__ __device__ friend constexpr bool operator==(const MoveOnly& x, const MoveOnly& y)
+  {
+    return x.data_ == y.data_;
+  }
+  __host__ __device__ friend constexpr bool operator!=(const MoveOnly& x, const MoveOnly& y)
+  {
+    return x.data_ != y.data_;
+  }
+  __host__ __device__ friend constexpr bool operator<(const MoveOnly& x, const MoveOnly& y)
+  {
+    return x.data_ < y.data_;
+  }
+  __host__ __device__ friend constexpr bool operator<=(const MoveOnly& x, const MoveOnly& y)
+  {
+    return x.data_ <= y.data_;
+  }
+  __host__ __device__ friend constexpr bool operator>(const MoveOnly& x, const MoveOnly& y)
+  {
+    return x.data_ > y.data_;
+  }
+  __host__ __device__ friend constexpr bool operator>=(const MoveOnly& x, const MoveOnly& y)
+  {
+    return x.data_ >= y.data_;
+  }
+
+#if TEST_STD_VER > 2017 && _LIBCUDACXX_HAS_SPACESHIP_OPERATOR()
+  __host__ __device__ friend constexpr auto operator<=>(const MoveOnly&, const MoveOnly&) = default;
+#endif // TEST_STD_VER > 2017 && _LIBCUDACXX_HAS_SPACESHIP_OPERATOR()
+
+  __host__ __device__ constexpr MoveOnly operator+(const MoveOnly& x) const
+  {
+    return MoveOnly(data_ + x.data_);
+  }
+  __host__ __device__ constexpr MoveOnly operator+(const int x) const
+  {
+    return MoveOnly(data_ + x);
+  }
+  __host__ __device__ constexpr MoveOnly operator*(const MoveOnly& x) const
+  {
+    return MoveOnly(data_ * x.data_);
+  }
+
+  __host__ __device__ constexpr operator int() const noexcept
+  {
+    return data_;
+  }
+
+  template <class T>
+  void operator,(T const&) = delete;
+};
+static_assert(cuda::std::is_convertible_v<MoveOnly, int>);
+
+constexpr int max_size = 350;
+int data[max_size];
+
+template <class Iter, class ValueT>
+struct Test
+{
+  template <class Policy>
+  void operator()(Policy&& policy)
+  {
+    const cuda::std::pair<int, int> runs[] = {{0, 34}, {1, 36}, {2, 39}, {100, 5184}, {max_size, 61809}};
+    for (const auto& pair : runs)
+    {
+      auto [size, expected] = pair;
+
+      {
+        decltype(auto) ret =
+          cuda::std::reduce(policy, Iter(data), Iter(data + size), ValueT(34), [](ValueT i, ValueT j) -> ValueT {
+            return i + j + ValueT{2};
+          });
+        static_assert(cuda::std::is_same_v<decltype(ret), ValueT>);
+        assert(ret == ValueT{expected});
+      }
+      {
+        decltype(auto) ret = cuda::std::reduce(policy, Iter(data), Iter(data + size), ValueT(34));
+        static_assert(cuda::std::is_same_v<decltype(ret), ValueT>);
+        assert(ret == ValueT{expected - 2 * size});
+      }
+      {
+        decltype(auto) ret = cuda::std::reduce(policy, Iter(data), Iter(data + size));
+        static_assert(cuda::std::is_same_v<decltype(ret), typename cuda::std::iterator_traits<Iter>::value_type>);
+        assert(ret == expected - 2 * size - 34);
+      }
+    }
+  }
+};
+
+__host__ void test()
+{
+  cuda::std::iota(data, data + max_size, 0);
+  types::for_each(types::forward_iterator_list<int*>{}, types::apply_type_identity{[](auto v) {
+                    using Iter = typename decltype(v)::type;
+                    types::for_each(
+                      types::type_list<int, MoveOnly>{},
+                      TestIteratorWithPolicies<types::partial_instantiation<Test, Iter>::template apply>{});
+                  }});
+}
+
+int main(int, char**)
+{
+  NV_IF_TARGET(NV_IS_HOST, (test();))
+
+  return 0;
+}
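As a quick sanity check of the run table: with init 34 and binary_op(i, j) = i + j + 2 folded over the values 0..n-1, the expected result is 34 + n(n-1)/2 + 2n, i.e. 36 for n = 1, 39 for n = 2, 5184 for n = 100, and 61809 for n = 350. The overloads without the custom operation drop the 2-per-element term and the explicit init, which is what the `expected - 2 * size` and `expected - 2 * size - 34` adjustments account for.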
diff --git a/libcudacxx/test/libcudacxx/std/numerics/numeric.ops/reduce/pstl_reduce.cu b/libcudacxx/test/libcudacxx/std/numerics/numeric.ops/reduce/pstl_reduce.cu
new file mode 100644
index 00000000000..27604db19b8
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/numeric.ops/reduce/pstl_reduce.cu
@@ -0,0 +1,145 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// template <class ExecutionPolicy, class ForwardIterator>
+//   typename iterator_traits<ForwardIterator>::value_type
+//     reduce(ExecutionPolicy&& exec,
+//            ForwardIterator first, ForwardIterator last);
+// template <class ExecutionPolicy, class ForwardIterator, class T, class BinaryOperation>
+//   T reduce(ExecutionPolicy&& exec,
+//            ForwardIterator first, ForwardIterator last, T init,
+//            BinaryOperation binary_op);
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "test_macros.h"
+
+inline constexpr int size = 100;
+
+C2H_TEST("cuda::std::reduce(Iter, Iter)", "[parallel algorithm]")
+{
+  SECTION("with default stream")
+  {
+    thrust::device_vector<int> data(size);
+    thrust::sequence(data.begin(), data.end(), 1);
+
+    const auto policy = cuda::execution::__cub_par_unseq;
+    decltype(auto) res = cuda::std::reduce(policy, data.begin(), data.end());
+#if !TEST_CUDA_COMPILER(NVCC, <, 12, 5)
+    static_assert(cuda::std::is_same_v<decltype(res), int>);
+#endif // !TEST_CUDA_COMPILER(NVCC, <, 12, 5)
+
+    constexpr int expected = size * (size + 1) / 2;
+    CHECK(res == expected);
+  }
+
+  SECTION("with provided stream")
+  {
+    thrust::device_vector<int> data(size);
+    thrust::sequence(data.begin(), data.end(), 1);
+
+    ::cuda::stream stream{::cuda::device_ref{0}};
+    const auto policy  = cuda::execution::__cub_par_unseq.set_stream(stream);
+    decltype(auto) res = cuda::std::reduce(policy, data.begin(), data.end());
+#if !TEST_CUDA_COMPILER(NVCC, <, 12, 5)
+    static_assert(cuda::std::is_same_v<decltype(res), int>);
+#endif // !TEST_CUDA_COMPILER(NVCC, <, 12, 5)
+
+    constexpr int expected = size * (size + 1) / 2;
+    CHECK(res == expected);
+  }
+}
+
+C2H_TEST("cuda::std::reduce(Iter, Iter, Tp)", "[parallel algorithm]")
+{
+  SECTION("with default stream")
+  {
+    thrust::device_vector<int> data(size);
+    thrust::sequence(data.begin(), data.end(), 1);
+
+    const auto policy  = cuda::execution::__cub_par_unseq;
+    decltype(auto) res = cuda::std::reduce(policy, data.begin(), data.end(), 42);
+#if !TEST_CUDA_COMPILER(NVCC, <, 12, 5)
+    static_assert(cuda::std::is_same_v<decltype(res), int>);
+#endif // !TEST_CUDA_COMPILER(NVCC, <, 12, 5)
+
+    constexpr int expected = size * (size + 1) / 2 + 42;
+    CHECK(res == expected);
+  }
+
+  SECTION("with provided stream")
+  {
+    thrust::device_vector<int> data(size);
+    thrust::sequence(data.begin(), data.end(), 1);
+
+    ::cuda::stream stream{::cuda::device_ref{0}};
+    const auto policy  = cuda::execution::__cub_par_unseq.set_stream(stream);
+    decltype(auto) res = cuda::std::reduce(policy, data.begin(), data.end(), 42);
+#if !TEST_CUDA_COMPILER(NVCC, <, 12, 5)
+    static_assert(cuda::std::is_same_v<decltype(res), int>);
+#endif // !TEST_CUDA_COMPILER(NVCC, <, 12, 5)
+
+    constexpr int expected = size * (size + 1) / 2 + 42;
+    CHECK(res == expected);
+  }
+}
+
+struct plus_two
+{
+  __host__ __device__ constexpr int operator()(const int lhs, const int rhs) const noexcept
+  {
+    return lhs + rhs + 2;
+  }
+};
+
+C2H_TEST("cuda::std::reduce(Iter, Iter, Tp, Fn)", "[parallel algorithm]")
+{
+  SECTION("with default stream")
+  {
+    thrust::device_vector<int> data(size);
+    thrust::sequence(data.begin(), data.end(), 1);
+
+    const auto policy  = cuda::execution::__cub_par_unseq;
+    decltype(auto) res = cuda::std::reduce(policy, data.begin(), data.end(), 42, plus_two{});
+#if !TEST_CUDA_COMPILER(NVCC, <, 12, 5)
+    static_assert(cuda::std::is_same_v<decltype(res), int>);
+#endif // !TEST_CUDA_COMPILER(NVCC, <, 12, 5)
+
+    constexpr int expected = size * (size + 1) / 2 + 42 + size * 2;
+    CHECK(res == expected);
+  }
+
+  SECTION("with provided stream")
+  {
+    thrust::device_vector<int> data(size);
+    thrust::sequence(data.begin(), data.end(), 1);
+
+    ::cuda::stream stream{::cuda::device_ref{0}};
+    const auto policy  = cuda::execution::__cub_par_unseq.set_stream(stream);
+    decltype(auto) res = cuda::std::reduce(policy, data.begin(), data.end(), 42, plus_two{});
+#if !TEST_CUDA_COMPILER(NVCC, <, 12, 5)
+    static_assert(cuda::std::is_same_v<decltype(res), int>);
+#endif // !TEST_CUDA_COMPILER(NVCC, <, 12, 5)
+
+    constexpr int expected = size * (size + 1) / 2 + 42 + size * 2;
+    CHECK(res == expected);
+  }
+}
diff --git a/libcudacxx/test/support/type_algorithms.h b/libcudacxx/test/support/type_algorithms.h
index 7f90973ad84..fe23e97c84d 100644
--- a/libcudacxx/test/support/type_algorithms.h
+++ b/libcudacxx/test/support/type_algorithms.h
@@ -75,10 +75,12 @@ struct apply_type_identity
 {
   Func func_;
 
+  _CCCL_EXEC_CHECK_DISABLE
   __host__ __device__ apply_type_identity(Func func)
      : func_(func)
   {}
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class T>
   __host__ __device__ decltype(auto) operator()() const
   {