diff --git a/cudax/examples/shared_memory_abstractions.cu b/cudax/examples/shared_memory_abstractions.cu new file mode 100644 index 00000000000..894d9634eb4 --- /dev/null +++ b/cudax/examples/shared_memory_abstractions.cu @@ -0,0 +1,136 @@ +#include + +#include + +#include + +namespace cudax = cuda::experimental; + +#define thread_printf(FMT, ...) \ + printf("[%d, %d, %d]: " FMT, threadIdx.x, threadIdx.y, threadIdx.z __VA_OPT__(, ) __VA_ARGS__) + +struct SharedObj +{ + __device__ SharedObj() + { + thread_printf("Default constructing...\n"); + } + + __device__ SharedObj(int v) + : value_{v} + { + thread_printf("Constructing with %d...\n", v); + } + + __device__ void use() + { + thread_printf("Using with value (%d)...\n", value_); + } + + __device__ ~SharedObj() + { + thread_printf("Destructing...\n"); + } + + int value_{0}; +}; + +__device__ void use(cudax::shared_memory_ptr ptr) +{ + ptr->use(); +} + +__global__ void demo1() +{ + // The default constructor is called by thread 0. + cudax::static_shared shared_obj{}; + + // Wait for the construction to complete. + __syncthreads(); + + use(&shared_obj); + + // Wait until all threads are done using the shared object. + __syncthreads(); + + // The object will be destructed by thread 0 when the object goes out of scope. +} + +__global__ void demo2() +{ + cudax::static_shared shared_obj1{1}; + cudax::static_shared shared_obj2{cuda::no_init}; + + // Oops, is uninitialized, would cause assertion to trigger. + // use(&shared_obj2); + + // Construct shared_obj2 by thread 1. + shared_obj2.construct_by({1, 0, 0}, 2); + + // Wait for the construction to complete. + __syncthreads(); + + use(&shared_obj1); + use(&shared_obj2); + + // Wait until all threads are done using the shared object. + __syncthreads(); + + // Manually destroy the shared_obj1 by thread 0. + shared_obj1.destroy(); + + // Manually destroy the shared_obj2 by thread 1. + shared_obj2.destroy_by({1, 0, 0}); + + // Oops, is already destructed, would cause assertion to trigger. + // shared_obj1.destroy(); + + // The destructor will not do anything. +} + +__global__ void demo3() +{ + // Create a shared buffer with 32 bytes of storage and 16 bytes of alignment. + cudax::static_shared_storage<32, 16> shared_buff; + + // Obtain pointer to the shared buffer. + auto ptr = cudax::static_pointer_cast(&shared_buff); + + // Construct SharedObj by thread 0. + if (threadIdx.x == 0) + { + new (ptr.get()) SharedObj{123}; + } + + // Wait for the construction to complete. + __syncthreads(); + + use(ptr); + + // Wait for all threads to complete before destructing the object. + __syncthreads(); + + // Destruct the object by thread 0. + if (threadIdx.x == 0) + { + ptr->~SharedObj(); + } +} + +int main() +{ + printf("Demo1:\n"); + demo1<<<1, 4>>>(); + assert(cudaDeviceSynchronize() == cudaSuccess); + printf("\n"); + + printf("Demo2:\n"); + demo2<<<1, 4>>>(); + assert(cudaDeviceSynchronize() == cudaSuccess); + printf("\n"); + + printf("Demo3:\n"); + demo3<<<1, 4>>>(); + assert(cudaDeviceSynchronize() == cudaSuccess); + printf("\n"); +} diff --git a/cudax/include/cuda/experimental/__memory/shared_memory_ptr.cuh b/cudax/include/cuda/experimental/__memory/shared_memory_ptr.cuh new file mode 100644 index 00000000000..9162dae1b23 --- /dev/null +++ b/cudax/include/cuda/experimental/__memory/shared_memory_ptr.cuh @@ -0,0 +1,267 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_EXPERIMENTAL___MEMORY_SHARED_MEMORY_PTR_H +#define _CUDA_EXPERIMENTAL___MEMORY_SHARED_MEMORY_PTR_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuda::experimental +{ +enum class __smem_addr_t : unsigned +{ +}; + +//! @brief A pointer to shared memory. +template +class shared_memory_ptr +{ + unsigned __smem_addr_; //!< The address of the shared memory. + +public: + using element_type = _Tp; //!< The element type. + using pointer = _Tp*; //!< The pointer type. + + shared_memory_ptr() = delete; + + shared_memory_ptr(::cuda::std::nullptr_t) = delete; + + //! @brief Constructs the object to uninitialized state. + _CCCL_DEVICE_API explicit shared_memory_ptr(::cuda::no_init_t) noexcept {} + + //! @brief Constructs the object from shared memory address. + //! + //! @param __addr The shared memory address. + _CCCL_DEVICE_API explicit shared_memory_ptr(__smem_addr_t __addr) noexcept + : __smem_addr_{static_cast(__addr)} + {} + + //! @brief Constructs the object from shared memory pointer. + //! + //! @param __ptr The shared memory pointer. + _CCCL_DEVICE_API explicit shared_memory_ptr(_Tp* __ptr) noexcept + { + reset(__ptr); + } + + //! @brief Constructs the object from another \c shared_memory_ptr with different element type such that the other + //! \c pointer_type is convertible to this \c pointer_type. + //! + //! @param __other The other \c shared_memory_ptr. + _CCCL_TEMPLATE(class _Up) + _CCCL_REQUIRES(::cuda::std::is_convertible_v<_Up*, _Tp*>) + _CCCL_DEVICE_API shared_memory_ptr(shared_memory_ptr<_Up> __other) noexcept + : __smem_addr_{__other.__smem_addr_} + {} + + _CCCL_HIDE_FROM_ABI shared_memory_ptr(const shared_memory_ptr&) noexcept = default; + + _CCCL_HIDE_FROM_ABI shared_memory_ptr(shared_memory_ptr&&) noexcept = default; + + _CCCL_HIDE_FROM_ABI shared_memory_ptr& operator=(const shared_memory_ptr&) noexcept = default; + + //! @brief Resets the pointer to the given pointer. + //! + //! @param __ptr The pointer to reset to. + //! + //! @returns The previous pointer. + _CCCL_DEVICE_API _Tp* reset(_Tp* __ptr) noexcept + { + _CCCL_ASSERT(::cuda::device::is_address_from(__ptr, ::cuda::device::address_space::shared), + "pointer is not from shared memory"); + _Tp* __ret = get(); + __smem_addr_ = ::__cvta_generic_to_shared(__ptr); + return __ret; + } + + //! @brief Swaps the pointers of two shared_memory_ptrs. + _CCCL_DEVICE_API constexpr void swap(shared_memory_ptr& __other) noexcept + { + ::cuda::std::swap(__smem_addr_, __other.__smem_addr_); + } + + //! @brief Gets the stored address. + //! + //! @returns The stored address. + [[nodiscard]] _CCCL_DEVICE_API __smem_addr_t __get_smem_addr() const noexcept + { + return __smem_addr_t{__smem_addr_}; + } + + //! @brief Gets the stored pointer. + //! + //! @returns The pointer. + [[nodiscard]] _CCCL_DEVICE_API _Tp* get() const noexcept + { + return static_cast<_Tp*>(::__cvta_shared_to_generic(__smem_addr_)); + } + + //! @brief Conversion operator to bool (always returns \c true). + _CCCL_DEVICE_API explicit constexpr operator bool() const noexcept + { + return true; + } + + //! @brief Arrow operator. + //! + //! @return The stored pointer. + [[nodiscard]] _CCCL_DEVICE_API _Tp* operator->() const noexcept + { + return get(); + } + + //! @brief Dereference operator. + //! + //! @return Reference to the object pointed to by the stored pointer. + _CCCL_TEMPLATE(class _Tp2 = _Tp) + _CCCL_REQUIRES((!::cuda::std::is_void_v<_Tp2>) ) + [[nodiscard]] _CCCL_DEVICE_API _Tp2& operator*() const noexcept + { + return *get(); + } + + //! @brief Conversion operator to \c pointer_type. + //! + //! @returns The stored pointer. + _CCCL_DEVICE_API explicit operator _Tp*() const noexcept + { + return get(); + } + + template + [[nodiscard]] _CCCL_DEVICE_API friend bool + operator==(const shared_memory_ptr& __lhs, const shared_memory_ptr<_Rhs>& __rhs) noexcept + { + return __lhs.__smem_addr_ == __rhs.__smem_addr_; + } + template + [[nodiscard]] _CCCL_DEVICE_API friend bool + operator!=(const shared_memory_ptr& __lhs, const shared_memory_ptr<_Rhs>& __rhs) noexcept + { + return __lhs.__smem_addr_ != __rhs.__smem_addr_; + } + template + [[nodiscard]] _CCCL_DEVICE_API friend bool + operator<(const shared_memory_ptr& __lhs, const shared_memory_ptr<_Rhs>& __rhs) noexcept + { + return __lhs.__smem_addr_ < __rhs.__smem_addr_; + } + template + [[nodiscard]] _CCCL_DEVICE_API friend bool + operator<=(const shared_memory_ptr& __lhs, const shared_memory_ptr<_Rhs>& __rhs) noexcept + { + return __lhs.__smem_addr_ <= __rhs.__smem_addr_; + } + template + [[nodiscard]] _CCCL_DEVICE_API friend bool + operator>(const shared_memory_ptr& __lhs, const shared_memory_ptr<_Rhs>& __rhs) noexcept + { + return __lhs.__smem_addr_ > __rhs.__smem_addr_; + } + template + [[nodiscard]] _CCCL_DEVICE_API friend bool + operator>=(const shared_memory_ptr& __lhs, const shared_memory_ptr<_Rhs>& __rhs) noexcept + { + return __lhs.__smem_addr_ >= __rhs.__smem_addr_; + } + [[nodiscard]] _CCCL_DEVICE_API friend bool operator==(shared_memory_ptr, ::cuda::std::nullptr_t) noexcept + { + return false; + } + [[nodiscard]] _CCCL_DEVICE_API friend bool operator!=(shared_memory_ptr, ::cuda::std::nullptr_t) noexcept + { + return true; + } + [[nodiscard]] _CCCL_DEVICE_API friend bool operator<(shared_memory_ptr, ::cuda::std::nullptr_t) noexcept + { + return false; + } + [[nodiscard]] _CCCL_DEVICE_API friend bool operator<=(shared_memory_ptr, ::cuda::std::nullptr_t) noexcept + { + return false; + } + [[nodiscard]] _CCCL_DEVICE_API friend bool operator>(shared_memory_ptr, ::cuda::std::nullptr_t) noexcept + { + return true; + } + [[nodiscard]] _CCCL_DEVICE_API friend bool operator>=(shared_memory_ptr, ::cuda::std::nullptr_t) noexcept + { + return true; + } + [[nodiscard]] _CCCL_DEVICE_API friend bool operator==(::cuda::std::nullptr_t, shared_memory_ptr) noexcept + { + return false; + } + [[nodiscard]] _CCCL_DEVICE_API friend bool operator!=(::cuda::std::nullptr_t, shared_memory_ptr) noexcept + { + return true; + } + [[nodiscard]] _CCCL_DEVICE_API friend bool operator<(::cuda::std::nullptr_t, shared_memory_ptr) noexcept + { + return true; + } + [[nodiscard]] _CCCL_DEVICE_API friend bool operator<=(::cuda::std::nullptr_t, shared_memory_ptr) noexcept + { + return true; + } + [[nodiscard]] _CCCL_DEVICE_API friend bool operator>(::cuda::std::nullptr_t, shared_memory_ptr) noexcept + { + return false; + } + [[nodiscard]] _CCCL_DEVICE_API friend bool operator>=(::cuda::std::nullptr_t, shared_memory_ptr) noexcept + { + return false; + } +}; + +template +_CCCL_HOST_DEVICE shared_memory_ptr(_Tp*) -> shared_memory_ptr<_Tp>; + +// todo: constraints +template +[[nodiscard]] _CCCL_DEVICE_API shared_memory_ptr<_Tp> static_pointer_cast(shared_memory_ptr<_Up> __ptr) noexcept +{ + return shared_memory_ptr<_Tp>{__ptr.__get_smem_addr()}; +} + +// todo: constraints +template +[[nodiscard]] _CCCL_DEVICE_API shared_memory_ptr<_Tp> const_pointer_cast(shared_memory_ptr<_Up> __ptr) noexcept +{ + return shared_memory_ptr<_Tp>{__ptr.__get_smem_addr()}; +} + +// todo: constraints +template +[[nodiscard]] _CCCL_DEVICE_API shared_memory_ptr<_Tp> reinterpret_pointer_cast(shared_memory_ptr<_Up> __ptr) noexcept +{ + return shared_memory_ptr<_Tp>{__ptr.__get_smem_addr()}; +} +} // namespace cuda::experimental + +#endif // _CUDA_EXPERIMENTAL___MEMORY_SHARED_MEMORY_PTR_H diff --git a/cudax/include/cuda/experimental/__memory/static_shared.cuh b/cudax/include/cuda/experimental/__memory/static_shared.cuh new file mode 100644 index 00000000000..71d6ba1fadf --- /dev/null +++ b/cudax/include/cuda/experimental/__memory/static_shared.cuh @@ -0,0 +1,197 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_EXPERIMENTAL___MEMORY_STATIC_SHARED_H +#define _CUDA_EXPERIMENTAL___MEMORY_STATIC_SHARED_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuda::experimental +{ +//! @brief A RAII wrapper for an object living in static shared memory. +//! +//! @tparam _Tp The type of the object. +//! @tparam _Align The alignment of the object. +template +class [[nodiscard]] static_shared : static_shared_storage +{ + static_assert(!::cuda::std::is_void_v<_Tp>, "_Tp must not be void"); + static_assert(::cuda::is_power_of_two(_Align), "_Align must be power of two"); + static_assert(_Align >= alignof(_Tp), "_Align must be at least alignof(_Tp)"); + + static_assert(!::cuda::std::is_array_v<_Tp>, "Arrays are not supported yet"); + + using __base_type = static_shared_storage; + + enum class __state_type + { + __uninitialized, //!< The object is created but in uninitialized state. + __constructed, //!< The object is created and initialized. + __destroyed, //!< The object is destroyed. + }; + + __state_type __state_{__state_type::__uninitialized}; //!< The state of the object. + + //! @brief Gets the pointer to the object stored in the static shared memory. + //! + //! @return The pointer to the object stored in the static shared memory. + [[nodiscard]] _CCCL_DEVICE_API shared_memory_ptr<_Tp> __ptr() const noexcept + { + return shared_memory_ptr<_Tp>{__base_type::get().__get_smem_addr()}; + } + + //! @brief Implements the destruction of the object. No assertions are performed. + //! + //! @param __chosen_thread The thread that will perform the destruction. + _CCCL_DEVICE_API void __destroy_by_impl(::uint3 __chosen_thread) noexcept(::cuda::std::is_nothrow_destructible_v<_Tp>) + { + if (__state_ == __state_type::__constructed) + { + if (__chosen_thread.x == threadIdx.x && __chosen_thread.y == threadIdx.y && __chosen_thread.z == threadIdx.z) + { + ::cuda::std::__destroy_at(__ptr().get()); + } + __state_ = __state_type::__destroyed; + } + } + +public: + //! @brief The default 3D index of the thread used to construct/destroy the object. + static constexpr ::uint3 default_thread_index{0, 0, 0}; + + using value_type = _Tp; //!< The type of the object stored in the static shared memory. + using __base_type::alignment; //!< The alignment of the static shared memory. + using __base_type::size; //!< The size of the static shared memory. + + //! @brief Allocates the static shared memory without constructing the object. The object is expected to be + //! constructed later by calling construct(...)/construct_by(...) methods. + _CCCL_DEVICE_API _CCCL_FORCEINLINE static_shared(cuda::no_init_t) noexcept {} + + //! @brief Allocates the static shared memory and constructs the object. + //! + //! @param __args The arguments to forward to the constructor of the object. + _CCCL_TEMPLATE(class _Tp2 = _Tp, class... _Args) + _CCCL_REQUIRES(::cuda::std::is_constructible_v<_Tp2, _Args...>) + _CCCL_DEVICE_API _CCCL_FORCEINLINE + static_shared(_Args&&... __args) noexcept(::cuda::std::is_nothrow_constructible_v<_Tp, _Args...>) + { + construct(::cuda::std::forward<_Args>(__args)...); + } + + static_shared(const static_shared&) = delete; + + static_shared(static_shared&&) = delete; + + //! @brief Destroys the stored object. + _CCCL_DEVICE_API ~static_shared() noexcept(::cuda::std::is_nothrow_destructible_v<_Tp>) + { + __destroy_by_impl(default_thread_index); + } + + static_shared& operator=(const static_shared&) = delete; + + static_shared& operator=(static_shared&&) = delete; + + //! @brief Constructs the stored object in-place by calling its constructor with the given arguments by the + //! \c default_thread_index thread. + //! + //! @param __args The arguments to forward to the constructor of the stored object. + _CCCL_TEMPLATE(class _Tp2 = _Tp, class... _Args) + _CCCL_REQUIRES(::cuda::std::is_constructible_v<_Tp2, _Args...>) + _CCCL_DEVICE_API void construct(_Args&&... __args) noexcept(::cuda::std::is_nothrow_constructible_v<_Tp, _Args...>) + { + construct_by(default_thread_index, ::cuda::std::forward<_Args>(__args)...); + } + + //! @brief Constructs the stored object in-place by calling its constructor with the given arguments by the thread at + //! the given index. + //! + //! @param __chosen_thread The thread index of the thread that will construct the object. + //! @param __args The arguments to forward to the constructor of the stored object. + _CCCL_TEMPLATE(class _Tp2 = _Tp, class... _Args) + _CCCL_REQUIRES(::cuda::std::is_constructible_v<_Tp2, _Args...>) + _CCCL_DEVICE_API void construct_by(::uint3 __chosen_thread, + _Args&&... __args) noexcept(::cuda::std::is_nothrow_constructible_v<_Tp, _Args...>) + { + _CCCL_ASSERT(__state_ != __state_type::__constructed, "static shared memory object is already constructed"); + _CCCL_ASSERT(__state_ != __state_type::__destroyed, "static shared memory object cannot be reconstructed"); + if (__chosen_thread.x == threadIdx.x && __chosen_thread.y == threadIdx.y && __chosen_thread.z == threadIdx.z) + { + ::cuda::std::__construct_at(__ptr().get(), ::cuda::std::forward<_Args>(__args)...); + } + __state_ = __state_type::__constructed; + } + + //! @brief Destroys the object stored in the static shared memory by the \c default_thread_index thread and + //! invalidates this instance. + _CCCL_DEVICE_API void destroy() noexcept(::cuda::std::is_nothrow_destructible_v<_Tp>) + { + destroy_by(default_thread_index); + } + + //! @brief Destroys the object stored in the static shared memory by the \c __chosen_thread thread and + //! invalidates this instance. + //! + //! @param __chosen_thread The thread that destroys the object. + _CCCL_DEVICE_API void destroy_by(::uint3 __chosen_thread) noexcept(::cuda::std::is_nothrow_destructible_v<_Tp>) + { + _CCCL_ASSERT(__state_ != __state_type::__destroyed, "destroying already destroyed static shared memory object"); + __destroy_by_impl(__chosen_thread); + } + + //! @brief Gets a reference to the object stored in the static shared memory. + //! + //! @returns A reference to the object stored in the static shared memory. + [[nodiscard]] _CCCL_DEVICE_API _Tp& get() const noexcept + { + _CCCL_ASSERT(__state_ != __state_type::__uninitialized, "accessing uninitialized static shared memory object"); + _CCCL_ASSERT(__state_ != __state_type::__destroyed, "accessing destroyed static shared memory object"); + return *__ptr(); + } + + //! @brief Gets a pointer to the object stored in the static shared memory. + //! + //! @returns A pointer to the object stored in the static shared memory. + [[nodiscard]] _CCCL_DEVICE_API shared_memory_ptr<_Tp> operator&() const noexcept + { + return __ptr(); + } + + //! @brief Casts the static shared memory object to a reference to the object stored in the static shared memory. + _CCCL_DEVICE_API operator _Tp&() const noexcept + { + return get(); + } +}; +} // namespace cuda::experimental + +#endif // _CUDA_EXPERIMENTAL___MEMORY_STATIC_SHARED_H diff --git a/cudax/include/cuda/experimental/__memory/static_shared_storage.cuh b/cudax/include/cuda/experimental/__memory/static_shared_storage.cuh new file mode 100644 index 00000000000..cb6aa895b23 --- /dev/null +++ b/cudax/include/cuda/experimental/__memory/static_shared_storage.cuh @@ -0,0 +1,137 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_EXPERIMENTAL___MEMORY_STATIC_SHARED_STORAGE_H +#define _CUDA_EXPERIMENTAL___MEMORY_STATIC_SHARED_STORAGE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include + +#include + +namespace cuda::experimental +{ +[[nodiscard]] _CCCL_CONSTEVAL _CCCL_DEVICE_API ::cuda::std::size_t __size_t_to_str_size(::cuda::std::size_t __n) noexcept +{ + ::cuda::std::size_t __ret = 0; + while (__n > 0) + { + __n /= 10; + ++__ret; + } + return __ret; +} + +template <::cuda::std::size_t _StrSize> +[[nodiscard]] _CCCL_CONSTEVAL _CCCL_DEVICE_API auto __size_t_to_str(::cuda::std::size_t __n) noexcept +{ + ::cuda::std::array __ret{}; + ::cuda::std::to_chars(__ret.data(), __ret.data() + _StrSize, __n); + return __ret; +} + +template +struct __static_cstr +{ + static constexpr char __value[]{_Cs..., '\0'}; +}; + +template <::cuda::std::size_t _Size, + ::cuda::std::size_t _Align, + ::cuda::std::size_t... _SizeIdx, + ::cuda::std::size_t... _AlignIdx> +[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE unsigned __cccl_alloc_static_shared_impl( + ::cuda::std::integer_sequence<::cuda::std::size_t, _SizeIdx...>, + ::cuda::std::integer_sequence<::cuda::std::size_t, _AlignIdx...>) noexcept +{ + constexpr auto __size_str = __size_t_to_str(_Size); + constexpr auto __align_str = __size_t_to_str(_Align); + + using _SizeCStr = __static_cstr<__size_str[_SizeIdx]...>; + using _AlignCStr = __static_cstr<__align_str[_AlignIdx]...>; + + unsigned __ret; + asm(R"({ + .shared .align %2 .b8 _cccl_static_shared_storage[%1]; + mov.b32 %0, _cccl_static_shared_storage; + })" + : "=r"(__ret) + : "C"(_SizeCStr::__value), "C"(_AlignCStr::__value)); + return __ret; +} + +template <::cuda::std::size_t _Size, ::cuda::std::size_t _Align> +[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE unsigned __cccl_alloc_static_shared() noexcept +{ + return __cccl_alloc_static_shared_impl<_Size, _Align>( + ::cuda::std::make_index_sequence<__size_t_to_str_size(_Size)>{}, + ::cuda::std::make_index_sequence<__size_t_to_str_size(_Align)>{}); +} + +//! @brief Allocates static shared memory with the given size and alignment. +//! +//! @tparam _Size Size of the storage in bytes. +//! @tparam _Align Alignment of the storage in bytes. +template <::cuda::std::size_t _Size, ::cuda::std::size_t _Align> +class [[nodiscard]] static_shared_storage +{ + static_assert(::cuda::is_power_of_two(_Align), "_Align must be power of two"); + + unsigned __smem_addr_; //!< Shared memory address of the storage. + +public: + static constexpr ::cuda::std::size_t size = _Size; //!< Size of the storage. + static constexpr ::cuda::std::size_t alignment = _Align; //!< Alignment of the storage. + + //! @brief Allocates the static shared memory and constructs the handle. + _CCCL_DEVICE_API _CCCL_FORCEINLINE static_shared_storage() noexcept + : __smem_addr_{__cccl_alloc_static_shared<_Size, _Align>()} + {} + + static_shared_storage(const static_shared_storage&) = delete; + + static_shared_storage(static_shared_storage&&) = delete; + + static_shared_storage& operator=(const static_shared_storage&) = delete; + + static_shared_storage& operator=(static_shared_storage&&) = delete; + + //! @brief Obtains the address of the storage. + //! + //! @return The address of the storage. + [[nodiscard]] _CCCL_DEVICE_API shared_memory_ptr get() const noexcept + { + return shared_memory_ptr{__smem_addr_t{__smem_addr_}}; + } + + //! @brief Obtains the address of the storage. + //! + //! @return The address of the storage. + [[nodiscard]] _CCCL_DEVICE_API shared_memory_ptr operator&() const noexcept + { + return get(); + } +}; +} // namespace cuda::experimental + +#endif // _CUDA_EXPERIMENTAL___MEMORY_STATIC_SHARED_STORAGE_H diff --git a/cudax/include/cuda/experimental/memory.cuh b/cudax/include/cuda/experimental/memory.cuh new file mode 100644 index 00000000000..71368a554b4 --- /dev/null +++ b/cudax/include/cuda/experimental/memory.cuh @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_EXPERIMENTAL_MEMORY +#define _CUDA_EXPERIMENTAL_MEMORY + +#include +#include +#include + +#endif // _CUDA_EXPERIMENTAL_MEMORY diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index 845e5123d5b..174534fd354 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -106,6 +106,10 @@ cudax_add_catch2_test(test_target algorithm algorithm/copy.cu ) +cudax_add_catch2_test(test_target memory_static_shared + memory/static_shared.cu +) + if (cudax_ENABLE_CUFILE) cudax_add_catch2_test(test_target cufile.driver_attributes cufile/driver_attributes.cu diff --git a/cudax/test/memory/static_shared.cu b/cudax/test/memory/static_shared.cu new file mode 100644 index 00000000000..6eeb0258ade --- /dev/null +++ b/cudax/test/memory/static_shared.cu @@ -0,0 +1,425 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#undef NDEBUG + +#include +#include +#include +#include + +#include + +#include + +__global__ void test_static_shared_storage() +{ + constexpr cuda::std::size_t size = 32; + constexpr cuda::std::size_t align = 16; + + using SharedStorage = cudax::static_shared_storage; + + // 0. Test static public members. + static_assert(SharedStorage::size == size); + static_assert(SharedStorage::alignment == align); + + // 1. Test that the type is nothrow default constructible. + static_assert(cuda::std::is_nothrow_default_constructible_v); + + // 2. Test that the type is not copyable. + static_assert(!cuda::std::is_copy_constructible_v); + + // 3. Test that the type is not movable. + static_assert(!cuda::std::is_move_constructible_v); + + // 4. Test that the type is trivially destructible. + static_assert(cuda::std::is_trivially_destructible_v); + + // 5. Test that the type is not copy assignable. + static_assert(!cuda::std::is_copy_assignable_v); + + // 6. Test that the type is not move assignable. + static_assert(!cuda::std::is_move_assignable_v); + + // 7. Test that if the type is constructed multiple times, the actual address of the objects is not the same. + { + SharedStorage a; + SharedStorage b; + assert(a.get() != b.get()); + } + + // 8. Test get method. + static_assert( + cuda::std::is_same_v, decltype(cuda::std::declval().get())>); + static_assert(noexcept(cuda::std::declval().get())); + { + SharedStorage a; + assert(&a + == cudax::shared_memory_ptr{::__cvta_shared_to_generic(cuda::std::to_underlying((&a).__get_smem_addr()))}); + } + + // 9. Test operator&. + static_assert( + cuda::std::is_same_v, decltype(&cuda::std::declval())>); + static_assert(noexcept(&cuda::std::declval())); + { + SharedStorage a; + assert(a.get() == &a); + } + + // 10. Test that the object really is in shared memory. + { + SharedStorage a; + assert(__isShared((&a).get())); + } +} + +C2H_TEST("Static shared storage", "") +{ + test_static_shared_storage<<<1, 1>>>(); + CUDAX_REQUIRE(cudaDeviceSynchronize() == cudaSuccess); +} + +enum class State +{ + uninitialized, + constructed, + destroyed, +}; + +__device__ State state{State::uninitialized}; + +__device__ void reset_state() +{ + __syncthreads(); + if (threadIdx.x == 0) + { + state = State::uninitialized; + } + __syncthreads(); +} + +__device__ void check_state(State state) +{ + __syncthreads(); + assert(state == state); +} + +struct TestType +{ + __device__ TestType(int value) noexcept + : value_{value} + { + state = State::constructed; + } + + __device__ ~TestType() + { + state = State::destroyed; + } + + int value_; +}; + +__global__ void test_static_shared() +{ + using SST = cudax::static_shared; + + // 0. Test static public members. + static_assert(SST::default_thread_index == uint3{0, 0, 0}); + static_assert(SST::size == sizeof(TestType)); + static_assert(SST::alignment == alignof(TestType)); + + // 1. Test that the type is not default constructible. + static_assert(!cuda::std::is_default_constructible_v); + + // 2. Test that the type is nothrow constructible from cuda::no_init_t; + static_assert(cuda::std::is_nothrow_constructible_v); + reset_state(); + { + SST a{cuda::no_init}; + check_state(State::uninitialized); + } + check_state(State::destroyed); + + // 3. Test that the type is nothrow constructible from int. + static_assert(cuda::std::is_nothrow_constructible_v); + reset_state(); + { + SST a{10}; + check_state(State::constructed); + + assert(a.get().value_ == 10); + } + check_state(State::destroyed); + + // 4. Test that the type is nothrow constructible from short. + static_assert(cuda::std::is_nothrow_constructible_v); + + // 5. Test that the type is not copyable. + static_assert(!cuda::std::is_copy_constructible_v); + + // 6. Test that the type is not movable. + static_assert(!cuda::std::is_move_constructible_v); + + // 7. Test that the type is not copy assignable. + static_assert(!cuda::std::is_copy_assignable_v); + + // 8. Test that the type is not move assignable. + static_assert(!cuda::std::is_move_assignable_v); + + // 9. Test that if the type is constructed multiple times, the actual address of the objects is not the same. + { + SST a{cuda::no_init}; + SST b{cuda::no_init}; + assert(&a != &b); + } + + // 10. Test the construct method. + static_assert(cuda::std::is_same_v().construct(int{}))>); + reset_state(); + { + SST a{cuda::no_init}; + check_state(State::uninitialized); + + a.construct(10); + check_state(State::constructed); + + assert(a.get().value_ == 10); + } + check_state(State::destroyed); + + // 11. Test the construct_by method. + static_assert(cuda::std::is_same_v().construct_by(uint3{}, int{}))>); + reset_state(); + { + SST a{cuda::no_init}; + check_state(State::uninitialized); + + // Construct the object by thread 1 instead of thread 0. + a.construct_by(uint3{1, 0, 0}, 10); + check_state(State::constructed); + + assert(a.get().value_ == 10); + } + check_state(State::destroyed); + + // 12. Test the destroy method. + static_assert(cuda::std::is_same_v().destroy())>); + reset_state(); + { + SST a{cuda::no_init}; + check_state(State::uninitialized); + + a.destroy(); + check_state(State::destroyed); + } + check_state(State::destroyed); + + reset_state(); + { + SST a{10}; + check_state(State::uninitialized); + + a.destroy(); + check_state(State::destroyed); + } + check_state(State::destroyed); + + // 13. Test the destroy_by method. + static_assert(cuda::std::is_same_v().destroy())>); + { + const uint3 tid{1, 0, 0}; + + reset_state(); + { + SST a{cuda::no_init}; + check_state(State::uninitialized); + + a.destroy_by(tid); + check_state(State::destroyed); + } + check_state(State::destroyed); + + reset_state(); + { + SST a{10}; + check_state(State::uninitialized); + + a.destroy_by(tid); + check_state(State::destroyed); + } + check_state(State::destroyed); + } + + // 14. Test get method. + static_assert(cuda::std::is_same_v().get())>); + static_assert(noexcept(cuda::std::declval().get())); + { + SST a{128}; + + __syncthreads(); + assert(a.get().value_ == 128); + } + + // 15. Test operator T&. + static_assert(cuda::std::is_nothrow_convertible_v); + { + SST a{128}; + + __syncthreads(); + TestType& b = a; + assert(b.value_ == 128); + } + + // 16. Test operator&. + static_assert(cuda::std::is_same_v, decltype(&cuda::std::declval())>); + static_assert(noexcept(&cuda::std::declval())); + { + SST a{128}; + assert((&a).get() != nullptr); + } + + // 17. Test that the object really is in shared memory. + { + SST a{128}; + assert(cuda::device::is_object_from(a.get(), cuda::device::address_space::shared)); + } +} + +C2H_TEST("Static shared", "") +{ + test_static_shared<<<1, 2>>>(); + CUDAX_REQUIRE(cudaDeviceSynchronize() == cudaSuccess); +} + +__global__ void test_shared_memory_ptr() +{ + using T = int; + using SMP = cudax::shared_memory_ptr; + + // 0. Test public type aliases. + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v); + + // 1. Test that the type is not default constructible. + static_assert(!cuda::std::is_default_constructible_v); + + // 2. Test that the type is not constructible from nullptr_t. + static_assert(!cuda::std::is_constructible_v); + + // 3. Test that the type is nothrow constructible from cuda::no_init_t. + static_assert(cuda::std::is_nothrow_constructible_v); + + // 4. Test that the type is explicitly nothrow constructible from T*. + static_assert(cuda::std::is_nothrow_constructible_v); + static_assert(!cuda::std::is_convertible_v); + { + __shared__ T value; + SMP p{&value}; + assert(p.get() == &value); + } + + // 7. Test that the type is nothrow constructible from another instance of different type. + // static_assert(cuda::std::is_nothrow_constructible_v, SMP>); + // static_assert(cuda::std::is_convertible_v>); + // { + // __shared__ T value; + // SMP a{&value}; + // cudax::static_shared p{a}; + // assert(a.get() == p.get()); + // } + + // 8. Test that the type is trivially copyable. + static_assert(cuda::std::is_trivially_copyable_v); + + // 9. Test that the type is trivially movable. + static_assert(cuda::std::is_trivially_move_constructible_v); + + // 10. Test that the type is trivially copy assignable. + static_assert(cuda::std::is_trivially_copy_assignable_v); + + // 11. Test that the type is trivially move assignable. + static_assert(cuda::std::is_trivially_move_assignable_v); + + // 12. Test reset method. + static_assert(cuda::std::is_same_v().reset(cuda::std::declval()))>); + static_assert(noexcept(cuda::std::declval().reset(cuda::std::declval()))); + { + __shared__ T value; + SMP a{cuda::no_init}; + a.reset(&value); + assert(a.get() == &value); + } + + // 13. Test swap method. + static_assert(cuda::std::is_same_v().swap(cuda::std::declval()))>); + static_assert(noexcept(cuda::std::declval().swap(cuda::std::declval()))); + { + __shared__ T value1; + __shared__ T value2; + SMP a{&value1}; + SMP b{&value2}; + a.swap(b); + assert(a.get() == &value2); + assert(b.get() == &value1); + } + + // 14. Test get method. + static_assert(cuda::std::is_same_v().get())>); + static_assert(noexcept(cuda::std::declval().get())); + { + __shared__ T value; + const SMP a{&value}; + assert(a.get() == &value); + } + + // 14. Test operator bool. + static_assert(!cuda::std::is_convertible_v); + static_assert(noexcept(cuda::std::declval().operator bool())); + { + const SMP a{cuda::no_init}; + assert(static_cast(a)); + } + + // 15. Test operator->. + static_assert(cuda::std::is_same_v().operator->())>); + static_assert(noexcept(cuda::std::declval().operator->())); + { + __shared__ T value; + const SMP a{&value}; + assert(a.operator->() == &value); + } + + // 16. Test operator*. + static_assert(cuda::std::is_same_v().operator*())>); + static_assert(noexcept(cuda::std::declval().operator*())); + { + __shared__ T value; + const SMP a{&value}; + assert(&a.operator*() == &value); + } + + // 17. Test operator T*. + static_assert(cuda::std::is_same_v().operator T*())>); + static_assert(!cuda::std::is_convertible_v); + static_assert(noexcept(cuda::std::declval().operator T*())); + { + __shared__ T value; + const SMP a{&value}; + assert(a.operator T*() == &value); + } +} + +C2H_TEST("Shared memory pointer", "") +{ + test_shared_memory_ptr<<<1, 1>>>(); + CUDAX_REQUIRE(cudaDeviceSynchronize() == cudaSuccess); +}