diff --git a/cudax/examples/shared_memory_abstractions.cu b/cudax/examples/shared_memory_abstractions.cu
new file mode 100644
index 00000000000..894d9634eb4
--- /dev/null
+++ b/cudax/examples/shared_memory_abstractions.cu
@@ -0,0 +1,136 @@
+#include <cuda/std/cassert>
+
+#include <cuda/experimental/memory.cuh>
+
+#include <cstdio>
+
+namespace cudax = cuda::experimental;
+
+#define thread_printf(FMT, ...) \
+  printf("[%d, %d, %d]: " FMT, threadIdx.x, threadIdx.y, threadIdx.z __VA_OPT__(, ) __VA_ARGS__)
+
+struct SharedObj
+{
+  __device__ SharedObj()
+  {
+    thread_printf("Default constructing...\n");
+  }
+
+  __device__ SharedObj(int v)
+      : value_{v}
+  {
+    thread_printf("Constructing with %d...\n", v);
+  }
+
+  __device__ void use()
+  {
+    thread_printf("Using with value (%d)...\n", value_);
+  }
+
+  __device__ ~SharedObj()
+  {
+    thread_printf("Destructing...\n");
+  }
+
+  int value_{0};
+};
+
+__device__ void use(cudax::shared_memory_ptr<SharedObj> ptr)
+{
+  ptr->use();
+}
+
+__global__ void demo1()
+{
+  // The default constructor is called by thread 0.
+  cudax::static_shared<SharedObj> shared_obj{};
+
+  // Wait for the construction to complete.
+  __syncthreads();
+
+  use(&shared_obj);
+
+  // Wait until all threads are done using the shared object.
+  __syncthreads();
+
+  // The object will be destructed by thread 0 when the object goes out of scope.
+}
+
+__global__ void demo2()
+{
+  cudax::static_shared<SharedObj> shared_obj1{1};
+  cudax::static_shared<SharedObj> shared_obj2{cuda::no_init};
+
+  // Oops, is uninitialized, would cause assertion to trigger.
+  // use(&shared_obj2);
+
+  // Construct shared_obj2 by thread 1.
+  shared_obj2.construct_by({1, 0, 0}, 2);
+
+  // Wait for the construction to complete.
+  __syncthreads();
+
+  use(&shared_obj1);
+  use(&shared_obj2);
+
+  // Wait until all threads are done using the shared object.
+  __syncthreads();
+
+  // Manually destroy the shared_obj1 by thread 0.
+  shared_obj1.destroy();
+
+  // Manually destroy the shared_obj2 by thread 1.
+  shared_obj2.destroy_by({1, 0, 0});
+
+  // Oops, is already destructed, would cause assertion to trigger.
+  // shared_obj1.destroy();
+
+  // The destructor will not do anything.
+}
+
+__global__ void demo3()
+{
+  // Create a shared buffer with 32 bytes of storage and 16 bytes of alignment.
+  cudax::static_shared_storage<32, 16> shared_buff;
+
+  // Obtain pointer to the shared buffer.
+  auto ptr = cudax::static_pointer_cast<SharedObj>(&shared_buff);
+
+  // Construct SharedObj by thread 0.
+  if (threadIdx.x == 0)
+  {
+    new (ptr.get()) SharedObj{123};
+  }
+
+  // Wait for the construction to complete.
+  __syncthreads();
+
+  use(ptr);
+
+  // Wait for all threads to complete before destructing the object.
+  __syncthreads();
+
+  // Destruct the object by thread 0.
+  if (threadIdx.x == 0)
+  {
+    ptr->~SharedObj();
+  }
+}
+
+int main()
+{
+  printf("Demo1:\n");
+  demo1<<<1, 4>>>();
+  assert(cudaDeviceSynchronize() == cudaSuccess);
+  printf("\n");
+
+  printf("Demo2:\n");
+  demo2<<<1, 4>>>();
+  assert(cudaDeviceSynchronize() == cudaSuccess);
+  printf("\n");
+
+  printf("Demo3:\n");
+  demo3<<<1, 4>>>();
+  assert(cudaDeviceSynchronize() == cudaSuccess);
+  printf("\n");
+}
diff --git a/cudax/include/cuda/experimental/__memory/shared_memory_ptr.cuh b/cudax/include/cuda/experimental/__memory/shared_memory_ptr.cuh
new file mode 100644
index 00000000000..9162dae1b23
--- /dev/null
+++ b/cudax/include/cuda/experimental/__memory/shared_memory_ptr.cuh
@@ -0,0 +1,267 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_EXPERIMENTAL___MEMORY_SHARED_MEMORY_PTR_H
+#define _CUDA_EXPERIMENTAL___MEMORY_SHARED_MEMORY_PTR_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__memory/address_space.h>
+#include <cuda/__utility/no_init.h>
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__cstddef/types.h>
+#include <cuda/std/__type_traits/is_convertible.h>
+#include <cuda/std/__type_traits/is_void.h>
+#include <cuda/std/__utility/swap.h>
+
+#include <cuda/experimental/__memory/shared_memory_ptr.cuh>
+
+namespace cuda::experimental
+{
+enum class __smem_addr_t : unsigned
+{
+};
+
+//! @brief A pointer to shared memory.
+template <class _Tp>
+class shared_memory_ptr
+{
+  unsigned __smem_addr_; //!< The address of the shared memory.
+
+public:
+  using element_type = _Tp; //!< The element type.
+  using pointer      = _Tp*; //!< The pointer type.
+
+  shared_memory_ptr() = delete;
+
+  shared_memory_ptr(::cuda::std::nullptr_t) = delete;
+
+  //! @brief Constructs the object to uninitialized state.
+  _CCCL_DEVICE_API explicit shared_memory_ptr(::cuda::no_init_t) noexcept {}
+
+  //! @brief Constructs the object from shared memory address.
+  //!
+  //! @param __addr The shared memory address.
+  _CCCL_DEVICE_API explicit shared_memory_ptr(__smem_addr_t __addr) noexcept
+      : __smem_addr_{static_cast<unsigned>(__addr)}
+  {}
+
+  //! @brief Constructs the object from shared memory pointer.
+  //!
+  //! @param __ptr The shared memory pointer.
+  _CCCL_DEVICE_API explicit shared_memory_ptr(_Tp* __ptr) noexcept
+  {
+    reset(__ptr);
+  }
+
+  //! @brief Constructs the object from another \c shared_memory_ptr with different element type such that the other
+  //!        \c pointer_type is convertible to this \c pointer_type.
+  //!
+  //! @param __other The other \c shared_memory_ptr.
+  _CCCL_TEMPLATE(class _Up)
+  _CCCL_REQUIRES(::cuda::std::is_convertible_v<_Up*, _Tp*>)
+  _CCCL_DEVICE_API shared_memory_ptr(shared_memory_ptr<_Up> __other) noexcept
+      : __smem_addr_{__other.__smem_addr_}
+  {}
+
+  _CCCL_HIDE_FROM_ABI shared_memory_ptr(const shared_memory_ptr&) noexcept = default;
+
+  _CCCL_HIDE_FROM_ABI shared_memory_ptr(shared_memory_ptr&&) noexcept = default;
+
+  _CCCL_HIDE_FROM_ABI shared_memory_ptr& operator=(const shared_memory_ptr&) noexcept = default;
+
+  //! @brief Resets the pointer to the given pointer.
+  //!
+  //! @param __ptr The pointer to reset to.
+  //!
+  //! @returns The previous pointer.
+  _CCCL_DEVICE_API _Tp* reset(_Tp* __ptr) noexcept
+  {
+    _CCCL_ASSERT(::cuda::device::is_address_from(__ptr, ::cuda::device::address_space::shared),
+                 "pointer is not from shared memory");
+    _Tp* __ret   = get();
+    __smem_addr_ = ::__cvta_generic_to_shared(__ptr);
+    return __ret;
+  }
+
+  //! @brief Swaps the pointers of two shared_memory_ptrs.
+  _CCCL_DEVICE_API constexpr void swap(shared_memory_ptr& __other) noexcept
+  {
+    ::cuda::std::swap(__smem_addr_, __other.__smem_addr_);
+  }
+
+  //! @brief Gets the stored address.
+  //!
+  //! @returns The stored address.
+  [[nodiscard]] _CCCL_DEVICE_API __smem_addr_t __get_smem_addr() const noexcept
+  {
+    return __smem_addr_t{__smem_addr_};
+  }
+
+  //! @brief Gets the stored pointer.
+  //!
+  //! @returns The pointer.
+  [[nodiscard]] _CCCL_DEVICE_API _Tp* get() const noexcept
+  {
+    return static_cast<_Tp*>(::__cvta_shared_to_generic(__smem_addr_));
+  }
+
+  //! @brief Conversion operator to bool (always returns \c true).
+  _CCCL_DEVICE_API explicit constexpr operator bool() const noexcept
+  {
+    return true;
+  }
+
+  //! @brief Arrow operator.
+  //!
+  //! @return The stored pointer.
+  [[nodiscard]] _CCCL_DEVICE_API _Tp* operator->() const noexcept
+  {
+    return get();
+  }
+
+  //! @brief Dereference operator.
+  //!
+  //! @return Reference to the object pointed to by the stored pointer.
+  _CCCL_TEMPLATE(class _Tp2 = _Tp)
+  _CCCL_REQUIRES((!::cuda::std::is_void_v<_Tp2>) )
+  [[nodiscard]] _CCCL_DEVICE_API _Tp2& operator*() const noexcept
+  {
+    return *get();
+  }
+
+  //! @brief Conversion operator to \c pointer_type.
+  //!
+  //! @returns The stored pointer.
+  _CCCL_DEVICE_API explicit operator _Tp*() const noexcept
+  {
+    return get();
+  }
+
+  template <class _Rhs>
+  [[nodiscard]] _CCCL_DEVICE_API friend bool
+  operator==(const shared_memory_ptr& __lhs, const shared_memory_ptr<_Rhs>& __rhs) noexcept
+  {
+    return __lhs.__smem_addr_ == __rhs.__smem_addr_;
+  }
+  template <class _Rhs>
+  [[nodiscard]] _CCCL_DEVICE_API friend bool
+  operator!=(const shared_memory_ptr& __lhs, const shared_memory_ptr<_Rhs>& __rhs) noexcept
+  {
+    return __lhs.__smem_addr_ != __rhs.__smem_addr_;
+  }
+  template <class _Rhs>
+  [[nodiscard]] _CCCL_DEVICE_API friend bool
+  operator<(const shared_memory_ptr& __lhs, const shared_memory_ptr<_Rhs>& __rhs) noexcept
+  {
+    return __lhs.__smem_addr_ < __rhs.__smem_addr_;
+  }
+  template <class _Rhs>
+  [[nodiscard]] _CCCL_DEVICE_API friend bool
+  operator<=(const shared_memory_ptr& __lhs, const shared_memory_ptr<_Rhs>& __rhs) noexcept
+  {
+    return __lhs.__smem_addr_ <= __rhs.__smem_addr_;
+  }
+  template <class _Rhs>
+  [[nodiscard]] _CCCL_DEVICE_API friend bool
+  operator>(const shared_memory_ptr& __lhs, const shared_memory_ptr<_Rhs>& __rhs) noexcept
+  {
+    return __lhs.__smem_addr_ > __rhs.__smem_addr_;
+  }
+  template <class _Rhs>
+  [[nodiscard]] _CCCL_DEVICE_API friend bool
+  operator>=(const shared_memory_ptr& __lhs, const shared_memory_ptr<_Rhs>& __rhs) noexcept
+  {
+    return __lhs.__smem_addr_ >= __rhs.__smem_addr_;
+  }
+  [[nodiscard]] _CCCL_DEVICE_API friend bool operator==(shared_memory_ptr, ::cuda::std::nullptr_t) noexcept
+  {
+    return false;
+  }
+  [[nodiscard]] _CCCL_DEVICE_API friend bool operator!=(shared_memory_ptr, ::cuda::std::nullptr_t) noexcept
+  {
+    return true;
+  }
+  [[nodiscard]] _CCCL_DEVICE_API friend bool operator<(shared_memory_ptr, ::cuda::std::nullptr_t) noexcept
+  {
+    return false;
+  }
+  [[nodiscard]] _CCCL_DEVICE_API friend bool operator<=(shared_memory_ptr, ::cuda::std::nullptr_t) noexcept
+  {
+    return false;
+  }
+  [[nodiscard]] _CCCL_DEVICE_API friend bool operator>(shared_memory_ptr, ::cuda::std::nullptr_t) noexcept
+  {
+    return true;
+  }
+  [[nodiscard]] _CCCL_DEVICE_API friend bool operator>=(shared_memory_ptr, ::cuda::std::nullptr_t) noexcept
+  {
+    return true;
+  }
+  [[nodiscard]] _CCCL_DEVICE_API friend bool operator==(::cuda::std::nullptr_t, shared_memory_ptr) noexcept
+  {
+    return false;
+  }
+  [[nodiscard]] _CCCL_DEVICE_API friend bool operator!=(::cuda::std::nullptr_t, shared_memory_ptr) noexcept
+  {
+    return true;
+  }
+  [[nodiscard]] _CCCL_DEVICE_API friend bool operator<(::cuda::std::nullptr_t, shared_memory_ptr) noexcept
+  {
+    return true;
+  }
+  [[nodiscard]] _CCCL_DEVICE_API friend bool operator<=(::cuda::std::nullptr_t, shared_memory_ptr) noexcept
+  {
+    return true;
+  }
+  [[nodiscard]] _CCCL_DEVICE_API friend bool operator>(::cuda::std::nullptr_t, shared_memory_ptr) noexcept
+  {
+    return false;
+  }
+  [[nodiscard]] _CCCL_DEVICE_API friend bool operator>=(::cuda::std::nullptr_t, shared_memory_ptr) noexcept
+  {
+    return false;
+  }
+};
+
+template <class _Tp>
+_CCCL_HOST_DEVICE shared_memory_ptr(_Tp*) -> shared_memory_ptr<_Tp>;
+
+// todo: constraints
+template <class _Tp, class _Up>
+[[nodiscard]] _CCCL_DEVICE_API shared_memory_ptr<_Tp> static_pointer_cast(shared_memory_ptr<_Up> __ptr) noexcept
+{
+  return shared_memory_ptr<_Tp>{__ptr.__get_smem_addr()};
+}
+
+// todo: constraints
+template <class _Tp, class _Up>
+[[nodiscard]] _CCCL_DEVICE_API shared_memory_ptr<_Tp> const_pointer_cast(shared_memory_ptr<_Up> __ptr) noexcept
+{
+  return shared_memory_ptr<_Tp>{__ptr.__get_smem_addr()};
+}
+
+// todo: constraints
+template <class _Tp, class _Up>
+[[nodiscard]] _CCCL_DEVICE_API shared_memory_ptr<_Tp> reinterpret_pointer_cast(shared_memory_ptr<_Up> __ptr) noexcept
+{
+  return shared_memory_ptr<_Tp>{__ptr.__get_smem_addr()};
+}
+} // namespace cuda::experimental
+
+#endif // _CUDA_EXPERIMENTAL___MEMORY_SHARED_MEMORY_PTR_H
diff --git a/cudax/include/cuda/experimental/__memory/static_shared.cuh b/cudax/include/cuda/experimental/__memory/static_shared.cuh
new file mode 100644
index 00000000000..71d6ba1fadf
--- /dev/null
+++ b/cudax/include/cuda/experimental/__memory/static_shared.cuh
@@ -0,0 +1,197 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_EXPERIMENTAL___MEMORY_STATIC_SHARED_H
+#define _CUDA_EXPERIMENTAL___MEMORY_STATIC_SHARED_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__utility/no_init.h>
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__cstddef/types.h>
+#include <cuda/std/__limits/numeric_limits.h>
+#include <cuda/std/__memory/construct_at.h>
+#include <cuda/std/__type_traits/is_constructible.h>
+#include <cuda/std/__type_traits/is_destructible.h>
+#include <cuda/std/__type_traits/is_nothrow_constructible.h>
+#include <cuda/std/__type_traits/is_nothrow_destructible.h>
+#include <cuda/std/__type_traits/is_void.h>
+#include <cuda/std/__utility/forward.h>
+
+#include <cuda/experimental/__memory/static_shared_storage.cuh>
+
+namespace cuda::experimental
+{
+//! @brief A RAII wrapper for an object living in static shared memory.
+//!
+//! @tparam _Tp The type of the object.
+//! @tparam _Align The alignment of the object.
+template <class _Tp, ::cuda::std::size_t _Align = alignof(_Tp)>
+class [[nodiscard]] static_shared : static_shared_storage<sizeof(_Tp), _Align>
+{
+  static_assert(!::cuda::std::is_void_v<_Tp>, "_Tp must not be void");
+  static_assert(::cuda::is_power_of_two(_Align), "_Align must be power of two");
+  static_assert(_Align >= alignof(_Tp), "_Align must be at least alignof(_Tp)");
+
+  static_assert(!::cuda::std::is_array_v<_Tp>, "Arrays are not supported yet");
+
+  using __base_type = static_shared_storage<sizeof(_Tp), _Align>;
+
+  enum class __state_type
+  {
+    __uninitialized, //!< The object is created but in uninitialized state.
+    __constructed, //!< The object is created and initialized.
+    __destroyed, //!< The object is destroyed.
+  };
+
+  __state_type __state_{__state_type::__uninitialized}; //!< The state of the object.
+
+  //! @brief Gets the pointer to the object stored in the static shared memory.
+  //!
+  //! @return The pointer to the object stored in the static shared memory.
+  [[nodiscard]] _CCCL_DEVICE_API shared_memory_ptr<_Tp> __ptr() const noexcept
+  {
+    return shared_memory_ptr<_Tp>{__base_type::get().__get_smem_addr()};
+  }
+
+  //! @brief Implements the destruction of the object. No assertions are performed.
+  //!
+  //! @param __chosen_thread The thread that will perform the destruction.
+  _CCCL_DEVICE_API void __destroy_by_impl(::uint3 __chosen_thread) noexcept(::cuda::std::is_nothrow_destructible_v<_Tp>)
+  {
+    if (__state_ == __state_type::__constructed)
+    {
+      if (__chosen_thread.x == threadIdx.x && __chosen_thread.y == threadIdx.y && __chosen_thread.z == threadIdx.z)
+      {
+        ::cuda::std::__destroy_at(__ptr().get());
+      }
+      __state_ = __state_type::__destroyed;
+    }
+  }
+
+public:
+  //! @brief The default 3D index of the thread used to construct/destroy the object.
+  static constexpr ::uint3 default_thread_index{0, 0, 0};
+
+  using value_type = _Tp; //!< The type of the object stored in the static shared memory.
+  using __base_type::alignment; //!< The alignment of the static shared memory.
+  using __base_type::size; //!< The size of the static shared memory.
+
+  //! @brief Allocates the static shared memory without constructing the object. The object is expected to be
+  //!        constructed later by calling construct(...)/construct_by(...) methods.
+  _CCCL_DEVICE_API _CCCL_FORCEINLINE static_shared(cuda::no_init_t) noexcept {}
+
+  //! @brief Allocates the static shared memory and constructs the object.
+  //!
+  //! @param __args The arguments to forward to the constructor of the object.
+  _CCCL_TEMPLATE(class _Tp2 = _Tp, class... _Args)
+  _CCCL_REQUIRES(::cuda::std::is_constructible_v<_Tp2, _Args...>)
+  _CCCL_DEVICE_API _CCCL_FORCEINLINE
+  static_shared(_Args&&... __args) noexcept(::cuda::std::is_nothrow_constructible_v<_Tp, _Args...>)
+  {
+    construct(::cuda::std::forward<_Args>(__args)...);
+  }
+
+  static_shared(const static_shared&) = delete;
+
+  static_shared(static_shared&&) = delete;
+
+  //! @brief Destroys the stored object.
+  _CCCL_DEVICE_API ~static_shared() noexcept(::cuda::std::is_nothrow_destructible_v<_Tp>)
+  {
+    __destroy_by_impl(default_thread_index);
+  }
+
+  static_shared& operator=(const static_shared&) = delete;
+
+  static_shared& operator=(static_shared&&) = delete;
+
+  //! @brief Constructs the stored object in-place by calling its constructor with the given arguments by the
+  //!        \c default_thread_index thread.
+  //!
+  //! @param __args The arguments to forward to the constructor of the stored object.
+  _CCCL_TEMPLATE(class _Tp2 = _Tp, class... _Args)
+  _CCCL_REQUIRES(::cuda::std::is_constructible_v<_Tp2, _Args...>)
+  _CCCL_DEVICE_API void construct(_Args&&... __args) noexcept(::cuda::std::is_nothrow_constructible_v<_Tp, _Args...>)
+  {
+    construct_by(default_thread_index, ::cuda::std::forward<_Args>(__args)...);
+  }
+
+  //! @brief Constructs the stored object in-place by calling its constructor with the given arguments by the thread at
+  //!        the given index.
+  //!
+  //! @param __chosen_thread The thread index of the thread that will construct the object.
+  //! @param __args The arguments to forward to the constructor of the stored object.
+  _CCCL_TEMPLATE(class _Tp2 = _Tp, class... _Args)
+  _CCCL_REQUIRES(::cuda::std::is_constructible_v<_Tp2, _Args...>)
+  _CCCL_DEVICE_API void construct_by(::uint3 __chosen_thread,
+                                     _Args&&... __args) noexcept(::cuda::std::is_nothrow_constructible_v<_Tp, _Args...>)
+  {
+    _CCCL_ASSERT(__state_ != __state_type::__constructed, "static shared memory object is already constructed");
+    _CCCL_ASSERT(__state_ != __state_type::__destroyed, "static shared memory object cannot be reconstructed");
+    if (__chosen_thread.x == threadIdx.x && __chosen_thread.y == threadIdx.y && __chosen_thread.z == threadIdx.z)
+    {
+      ::cuda::std::__construct_at(__ptr().get(), ::cuda::std::forward<_Args>(__args)...);
+    }
+    __state_ = __state_type::__constructed;
+  }
+
+  //! @brief Destroys the object stored in the static shared memory by the \c default_thread_index thread and
+  //!        invalidates this instance.
+  _CCCL_DEVICE_API void destroy() noexcept(::cuda::std::is_nothrow_destructible_v<_Tp>)
+  {
+    destroy_by(default_thread_index);
+  }
+
+  //! @brief Destroys the object stored in the static shared memory by the \c __chosen_thread thread and
+  //!        invalidates this instance.
+  //!
+  //! @param __chosen_thread The thread that destroys the object.
+  _CCCL_DEVICE_API void destroy_by(::uint3 __chosen_thread) noexcept(::cuda::std::is_nothrow_destructible_v<_Tp>)
+  {
+    _CCCL_ASSERT(__state_ != __state_type::__destroyed, "destroying already destroyed static shared memory object");
+    __destroy_by_impl(__chosen_thread);
+  }
+
+  //! @brief Gets a reference to the object stored in the static shared memory.
+  //!
+  //! @returns A reference to the object stored in the static shared memory.
+  [[nodiscard]] _CCCL_DEVICE_API _Tp& get() const noexcept
+  {
+    _CCCL_ASSERT(__state_ != __state_type::__uninitialized, "accessing uninitialized static shared memory object");
+    _CCCL_ASSERT(__state_ != __state_type::__destroyed, "accessing destroyed static shared memory object");
+    return *__ptr();
+  }
+
+  //! @brief Gets a pointer to the object stored in the static shared memory.
+  //!
+  //! @returns A pointer to the object stored in the static shared memory.
+  [[nodiscard]] _CCCL_DEVICE_API shared_memory_ptr<_Tp> operator&() const noexcept
+  {
+    return __ptr();
+  }
+
+  //! @brief Casts the static shared memory object to a reference to the object stored in the static shared memory.
+  _CCCL_DEVICE_API operator _Tp&() const noexcept
+  {
+    return get();
+  }
+};
+} // namespace cuda::experimental
+
+#endif // _CUDA_EXPERIMENTAL___MEMORY_STATIC_SHARED_H
diff --git a/cudax/include/cuda/experimental/__memory/static_shared_storage.cuh b/cudax/include/cuda/experimental/__memory/static_shared_storage.cuh
new file mode 100644
index 00000000000..cb6aa895b23
--- /dev/null
+++ b/cudax/include/cuda/experimental/__memory/static_shared_storage.cuh
@@ -0,0 +1,137 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_EXPERIMENTAL___MEMORY_STATIC_SHARED_STORAGE_H
+#define _CUDA_EXPERIMENTAL___MEMORY_STATIC_SHARED_STORAGE_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__cmath/pow2.h>
+#include <cuda/std/__charconv/to_chars.h>
+#include <cuda/std/__cstddef/types.h>
+#include <cuda/std/__utility/integer_sequence.h>
+#include <cuda/std/array>
+
+#include <cuda/experimental/__memory/shared_memory_ptr.cuh>
+
+namespace cuda::experimental
+{
+[[nodiscard]] _CCCL_CONSTEVAL _CCCL_DEVICE_API ::cuda::std::size_t __size_t_to_str_size(::cuda::std::size_t __n) noexcept
+{
+  ::cuda::std::size_t __ret = 0;
+  while (__n > 0)
+  {
+    __n /= 10;
+    ++__ret;
+  }
+  return __ret;
+}
+
+template <::cuda::std::size_t _StrSize>
+[[nodiscard]] _CCCL_CONSTEVAL _CCCL_DEVICE_API auto __size_t_to_str(::cuda::std::size_t __n) noexcept
+{
+  ::cuda::std::array<char, _StrSize> __ret{};
+  ::cuda::std::to_chars(__ret.data(), __ret.data() + _StrSize, __n);
+  return __ret;
+}
+
+template <char... _Cs>
+struct __static_cstr
+{
+  static constexpr char __value[]{_Cs..., '\0'};
+};
+
+template <::cuda::std::size_t _Size,
+          ::cuda::std::size_t _Align,
+          ::cuda::std::size_t... _SizeIdx,
+          ::cuda::std::size_t... _AlignIdx>
+[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE unsigned __cccl_alloc_static_shared_impl(
+  ::cuda::std::integer_sequence<::cuda::std::size_t, _SizeIdx...>,
+  ::cuda::std::integer_sequence<::cuda::std::size_t, _AlignIdx...>) noexcept
+{
+  constexpr auto __size_str  = __size_t_to_str<sizeof...(_SizeIdx)>(_Size);
+  constexpr auto __align_str = __size_t_to_str<sizeof...(_AlignIdx)>(_Align);
+
+  using _SizeCStr  = __static_cstr<__size_str[_SizeIdx]...>;
+  using _AlignCStr = __static_cstr<__align_str[_AlignIdx]...>;
+
+  unsigned __ret;
+  asm(R"({
+  .shared .align %2 .b8 _cccl_static_shared_storage[%1];
+  mov.b32 %0, _cccl_static_shared_storage;
+  })"
+      : "=r"(__ret)
+      : "C"(_SizeCStr::__value), "C"(_AlignCStr::__value));
+  return __ret;
+}
+
+template <::cuda::std::size_t _Size, ::cuda::std::size_t _Align>
+[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE unsigned __cccl_alloc_static_shared() noexcept
+{
+  return __cccl_alloc_static_shared_impl<_Size, _Align>(
+    ::cuda::std::make_index_sequence<__size_t_to_str_size(_Size)>{},
+    ::cuda::std::make_index_sequence<__size_t_to_str_size(_Align)>{});
+}
+
+//! @brief Allocates static shared memory with the given size and alignment.
+//!
+//! @tparam _Size  Size of the storage in bytes.
+//! @tparam _Align  Alignment of the storage in bytes.
+template <::cuda::std::size_t _Size, ::cuda::std::size_t _Align>
+class [[nodiscard]] static_shared_storage
+{
+  static_assert(::cuda::is_power_of_two(_Align), "_Align must be power of two");
+
+  unsigned __smem_addr_; //!< Shared memory address of the storage.
+
+public:
+  static constexpr ::cuda::std::size_t size      = _Size; //!< Size of the storage.
+  static constexpr ::cuda::std::size_t alignment = _Align; //!< Alignment of the storage.
+
+  //! @brief Allocates the static shared memory and constructs the handle.
+  _CCCL_DEVICE_API _CCCL_FORCEINLINE static_shared_storage() noexcept
+      : __smem_addr_{__cccl_alloc_static_shared<_Size, _Align>()}
+  {}
+
+  static_shared_storage(const static_shared_storage&) = delete;
+
+  static_shared_storage(static_shared_storage&&) = delete;
+
+  static_shared_storage& operator=(const static_shared_storage&) = delete;
+
+  static_shared_storage& operator=(static_shared_storage&&) = delete;
+
+  //! @brief Obtains the address of the storage.
+  //!
+  //! @return The address of the storage.
+  [[nodiscard]] _CCCL_DEVICE_API shared_memory_ptr<void> get() const noexcept
+  {
+    return shared_memory_ptr<void>{__smem_addr_t{__smem_addr_}};
+  }
+
+  //! @brief Obtains the address of the storage.
+  //!
+  //! @return The address of the storage.
+  [[nodiscard]] _CCCL_DEVICE_API shared_memory_ptr<void> operator&() const noexcept
+  {
+    return get();
+  }
+};
+} // namespace cuda::experimental
+
+#endif // _CUDA_EXPERIMENTAL___MEMORY_STATIC_SHARED_STORAGE_H
diff --git a/cudax/include/cuda/experimental/memory.cuh b/cudax/include/cuda/experimental/memory.cuh
new file mode 100644
index 00000000000..71368a554b4
--- /dev/null
+++ b/cudax/include/cuda/experimental/memory.cuh
@@ -0,0 +1,18 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_EXPERIMENTAL_MEMORY
+#define _CUDA_EXPERIMENTAL_MEMORY
+
+#include <cuda/experimental/__memory/shared_memory_ptr.cuh>
+#include <cuda/experimental/__memory/static_shared.cuh>
+#include <cuda/experimental/__memory/static_shared_storage.cuh>
+
+#endif // _CUDA_EXPERIMENTAL_MEMORY
diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt
index 845e5123d5b..174534fd354 100644
--- a/cudax/test/CMakeLists.txt
+++ b/cudax/test/CMakeLists.txt
@@ -106,6 +106,10 @@ cudax_add_catch2_test(test_target algorithm
     algorithm/copy.cu
 )
 
+cudax_add_catch2_test(test_target memory_static_shared
+    memory/static_shared.cu
+)
+
 if (cudax_ENABLE_CUFILE)
   cudax_add_catch2_test(test_target cufile.driver_attributes
       cufile/driver_attributes.cu
diff --git a/cudax/test/memory/static_shared.cu b/cudax/test/memory/static_shared.cu
new file mode 100644
index 00000000000..6eeb0258ade
--- /dev/null
+++ b/cudax/test/memory/static_shared.cu
@@ -0,0 +1,425 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#undef NDEBUG
+
+#include <cuda/memory>
+#include <cuda/std/cassert>
+#include <cuda/std/type_traits>
+#include <cuda/std/utility>
+
+#include <cuda/experimental/memory.cuh>
+
+#include <utility.cuh>
+
+__global__ void test_static_shared_storage()
+{
+  constexpr cuda::std::size_t size  = 32;
+  constexpr cuda::std::size_t align = 16;
+
+  using SharedStorage = cudax::static_shared_storage<size, align>;
+
+  // 0. Test static public members.
+  static_assert(SharedStorage::size == size);
+  static_assert(SharedStorage::alignment == align);
+
+  // 1. Test that the type is nothrow default constructible.
+  static_assert(cuda::std::is_nothrow_default_constructible_v<SharedStorage>);
+
+  // 2. Test that the type is not copyable.
+  static_assert(!cuda::std::is_copy_constructible_v<SharedStorage>);
+
+  // 3. Test that the type is not movable.
+  static_assert(!cuda::std::is_move_constructible_v<SharedStorage>);
+
+  // 4. Test that the type is trivially destructible.
+  static_assert(cuda::std::is_trivially_destructible_v<SharedStorage>);
+
+  // 5. Test that the type is not copy assignable.
+  static_assert(!cuda::std::is_copy_assignable_v<SharedStorage>);
+
+  // 6. Test that the type is not move assignable.
+  static_assert(!cuda::std::is_move_assignable_v<SharedStorage>);
+
+  // 7. Test that if the type is constructed multiple times, the actual address of the objects is not the same.
+  {
+    SharedStorage a;
+    SharedStorage b;
+    assert(a.get() != b.get());
+  }
+
+  // 8. Test get method.
+  static_assert(
+    cuda::std::is_same_v<cudax::shared_memory_ptr<void>, decltype(cuda::std::declval<const SharedStorage>().get())>);
+  static_assert(noexcept(cuda::std::declval<const SharedStorage>().get()));
+  {
+    SharedStorage a;
+    assert(&a
+           == cudax::shared_memory_ptr{::__cvta_shared_to_generic(cuda::std::to_underlying((&a).__get_smem_addr()))});
+  }
+
+  // 9. Test operator&.
+  static_assert(
+    cuda::std::is_same_v<cudax::shared_memory_ptr<void>, decltype(&cuda::std::declval<const SharedStorage>())>);
+  static_assert(noexcept(&cuda::std::declval<const SharedStorage>()));
+  {
+    SharedStorage a;
+    assert(a.get() == &a);
+  }
+
+  // 10. Test that the object really is in shared memory.
+  {
+    SharedStorage a;
+    assert(__isShared((&a).get()));
+  }
+}
+
+C2H_TEST("Static shared storage", "")
+{
+  test_static_shared_storage<<<1, 1>>>();
+  CUDAX_REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
+}
+
+enum class State
+{
+  uninitialized,
+  constructed,
+  destroyed,
+};
+
+__device__ State state{State::uninitialized};
+
+__device__ void reset_state()
+{
+  __syncthreads();
+  if (threadIdx.x == 0)
+  {
+    state = State::uninitialized;
+  }
+  __syncthreads();
+}
+
+__device__ void check_state(State state)
+{
+  __syncthreads();
+  assert(state == state);
+}
+
+struct TestType
+{
+  __device__ TestType(int value) noexcept
+      : value_{value}
+  {
+    state = State::constructed;
+  }
+
+  __device__ ~TestType()
+  {
+    state = State::destroyed;
+  }
+
+  int value_;
+};
+
+__global__ void test_static_shared()
+{
+  using SST = cudax::static_shared<TestType>;
+
+  // 0. Test static public members.
+  static_assert(SST::default_thread_index == uint3{0, 0, 0});
+  static_assert(SST::size == sizeof(TestType));
+  static_assert(SST::alignment == alignof(TestType));
+
+  // 1. Test that the type is not default constructible.
+  static_assert(!cuda::std::is_default_constructible_v<SST>);
+
+  // 2. Test that the type is nothrow constructible from cuda::no_init_t;
+  static_assert(cuda::std::is_nothrow_constructible_v<SST, cuda::no_init_t>);
+  reset_state();
+  {
+    SST a{cuda::no_init};
+    check_state(State::uninitialized);
+  }
+  check_state(State::destroyed);
+
+  // 3. Test that the type is nothrow constructible from int.
+  static_assert(cuda::std::is_nothrow_constructible_v<SST, int>);
+  reset_state();
+  {
+    SST a{10};
+    check_state(State::constructed);
+
+    assert(a.get().value_ == 10);
+  }
+  check_state(State::destroyed);
+
+  // 4. Test that the type is nothrow constructible from short.
+  static_assert(cuda::std::is_nothrow_constructible_v<SST, short>);
+
+  // 5. Test that the type is not copyable.
+  static_assert(!cuda::std::is_copy_constructible_v<SST>);
+
+  // 6. Test that the type is not movable.
+  static_assert(!cuda::std::is_move_constructible_v<SST>);
+
+  // 7. Test that the type is not copy assignable.
+  static_assert(!cuda::std::is_copy_assignable_v<SST>);
+
+  // 8. Test that the type is not move assignable.
+  static_assert(!cuda::std::is_move_assignable_v<SST>);
+
+  // 9. Test that if the type is constructed multiple times, the actual address of the objects is not the same.
+  {
+    SST a{cuda::no_init};
+    SST b{cuda::no_init};
+    assert(&a != &b);
+  }
+
+  // 10. Test the construct method.
+  static_assert(cuda::std::is_same_v<void, decltype(cuda::std::declval<SST>().construct(int{}))>);
+  reset_state();
+  {
+    SST a{cuda::no_init};
+    check_state(State::uninitialized);
+
+    a.construct(10);
+    check_state(State::constructed);
+
+    assert(a.get().value_ == 10);
+  }
+  check_state(State::destroyed);
+
+  // 11. Test the construct_by method.
+  static_assert(cuda::std::is_same_v<void, decltype(cuda::std::declval<SST>().construct_by(uint3{}, int{}))>);
+  reset_state();
+  {
+    SST a{cuda::no_init};
+    check_state(State::uninitialized);
+
+    // Construct the object by thread 1 instead of thread 0.
+    a.construct_by(uint3{1, 0, 0}, 10);
+    check_state(State::constructed);
+
+    assert(a.get().value_ == 10);
+  }
+  check_state(State::destroyed);
+
+  // 12. Test the destroy method.
+  static_assert(cuda::std::is_same_v<void, decltype(cuda::std::declval<SST>().destroy())>);
+  reset_state();
+  {
+    SST a{cuda::no_init};
+    check_state(State::uninitialized);
+
+    a.destroy();
+    check_state(State::destroyed);
+  }
+  check_state(State::destroyed);
+
+  reset_state();
+  {
+    SST a{10};
+    check_state(State::uninitialized);
+
+    a.destroy();
+    check_state(State::destroyed);
+  }
+  check_state(State::destroyed);
+
+  // 13. Test the destroy_by method.
+  static_assert(cuda::std::is_same_v<void, decltype(cuda::std::declval<SST>().destroy())>);
+  {
+    const uint3 tid{1, 0, 0};
+
+    reset_state();
+    {
+      SST a{cuda::no_init};
+      check_state(State::uninitialized);
+
+      a.destroy_by(tid);
+      check_state(State::destroyed);
+    }
+    check_state(State::destroyed);
+
+    reset_state();
+    {
+      SST a{10};
+      check_state(State::uninitialized);
+
+      a.destroy_by(tid);
+      check_state(State::destroyed);
+    }
+    check_state(State::destroyed);
+  }
+
+  // 14. Test get method.
+  static_assert(cuda::std::is_same_v<TestType&, decltype(cuda::std::declval<const SST>().get())>);
+  static_assert(noexcept(cuda::std::declval<const SST>().get()));
+  {
+    SST a{128};
+
+    __syncthreads();
+    assert(a.get().value_ == 128);
+  }
+
+  // 15. Test operator T&.
+  static_assert(cuda::std::is_nothrow_convertible_v<const SST, TestType&>);
+  {
+    SST a{128};
+
+    __syncthreads();
+    TestType& b = a;
+    assert(b.value_ == 128);
+  }
+
+  // 16. Test operator&.
+  static_assert(cuda::std::is_same_v<cudax::shared_memory_ptr<TestType>, decltype(&cuda::std::declval<const SST&>())>);
+  static_assert(noexcept(&cuda::std::declval<const SST&>()));
+  {
+    SST a{128};
+    assert((&a).get() != nullptr);
+  }
+
+  // 17. Test that the object really is in shared memory.
+  {
+    SST a{128};
+    assert(cuda::device::is_object_from(a.get(), cuda::device::address_space::shared));
+  }
+}
+
+C2H_TEST("Static shared", "")
+{
+  test_static_shared<<<1, 2>>>();
+  CUDAX_REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
+}
+
+__global__ void test_shared_memory_ptr()
+{
+  using T   = int;
+  using SMP = cudax::shared_memory_ptr<T>;
+
+  // 0. Test public type aliases.
+  static_assert(cuda::std::is_same_v<T, SMP::element_type>);
+  static_assert(cuda::std::is_same_v<T*, SMP::pointer>);
+
+  // 1. Test that the type is not default constructible.
+  static_assert(!cuda::std::is_default_constructible_v<SMP>);
+
+  // 2. Test that the type is not constructible from nullptr_t.
+  static_assert(!cuda::std::is_constructible_v<SMP, cuda::std::nullptr_t>);
+
+  // 3. Test that the type is nothrow constructible from cuda::no_init_t.
+  static_assert(cuda::std::is_nothrow_constructible_v<SMP, cuda::no_init_t>);
+
+  // 4. Test that the type is explicitly nothrow constructible from T*.
+  static_assert(cuda::std::is_nothrow_constructible_v<SMP, T*>);
+  static_assert(!cuda::std::is_convertible_v<T*, SMP>);
+  {
+    __shared__ T value;
+    SMP p{&value};
+    assert(p.get() == &value);
+  }
+
+  // 7. Test that the type is nothrow constructible from another instance of different type.
+  // static_assert(cuda::std::is_nothrow_constructible_v<cudax::static_shared<void>, SMP>);
+  // static_assert(cuda::std::is_convertible_v<SMP, cudax::static_shared<void>>);
+  // {
+  //   __shared__ T value;
+  //   SMP a{&value};
+  //   cudax::static_shared<void> p{a};
+  //   assert(a.get() == p.get());
+  // }
+
+  // 8. Test that the type is trivially copyable.
+  static_assert(cuda::std::is_trivially_copyable_v<SMP>);
+
+  // 9. Test that the type is trivially movable.
+  static_assert(cuda::std::is_trivially_move_constructible_v<SMP>);
+
+  // 10. Test that the type is trivially copy assignable.
+  static_assert(cuda::std::is_trivially_copy_assignable_v<SMP>);
+
+  // 11. Test that the type is trivially move assignable.
+  static_assert(cuda::std::is_trivially_move_assignable_v<SMP>);
+
+  // 12. Test reset method.
+  static_assert(cuda::std::is_same_v<T*, decltype(cuda::std::declval<SMP>().reset(cuda::std::declval<T*>()))>);
+  static_assert(noexcept(cuda::std::declval<SMP>().reset(cuda::std::declval<T*>())));
+  {
+    __shared__ T value;
+    SMP a{cuda::no_init};
+    a.reset(&value);
+    assert(a.get() == &value);
+  }
+
+  // 13. Test swap method.
+  static_assert(cuda::std::is_same_v<void, decltype(cuda::std::declval<SMP>().swap(cuda::std::declval<SMP&>()))>);
+  static_assert(noexcept(cuda::std::declval<SMP>().swap(cuda::std::declval<SMP&>())));
+  {
+    __shared__ T value1;
+    __shared__ T value2;
+    SMP a{&value1};
+    SMP b{&value2};
+    a.swap(b);
+    assert(a.get() == &value2);
+    assert(b.get() == &value1);
+  }
+
+  // 14. Test get method.
+  static_assert(cuda::std::is_same_v<T*, decltype(cuda::std::declval<const SMP>().get())>);
+  static_assert(noexcept(cuda::std::declval<const SMP>().get()));
+  {
+    __shared__ T value;
+    const SMP a{&value};
+    assert(a.get() == &value);
+  }
+
+  // 14. Test operator bool.
+  static_assert(!cuda::std::is_convertible_v<const SMP, bool>);
+  static_assert(noexcept(cuda::std::declval<const SMP>().operator bool()));
+  {
+    const SMP a{cuda::no_init};
+    assert(static_cast<bool>(a));
+  }
+
+  // 15. Test operator->.
+  static_assert(cuda::std::is_same_v<T*, decltype(cuda::std::declval<const SMP>().operator->())>);
+  static_assert(noexcept(cuda::std::declval<const SMP>().operator->()));
+  {
+    __shared__ T value;
+    const SMP a{&value};
+    assert(a.operator->() == &value);
+  }
+
+  // 16. Test operator*.
+  static_assert(cuda::std::is_same_v<T&, decltype(cuda::std::declval<const SMP>().operator*())>);
+  static_assert(noexcept(cuda::std::declval<const SMP>().operator*()));
+  {
+    __shared__ T value;
+    const SMP a{&value};
+    assert(&a.operator*() == &value);
+  }
+
+  // 17. Test operator T*.
+  static_assert(cuda::std::is_same_v<T*, decltype(cuda::std::declval<const SMP>().operator T*())>);
+  static_assert(!cuda::std::is_convertible_v<const SMP, T*>);
+  static_assert(noexcept(cuda::std::declval<const SMP>().operator T*()));
+  {
+    __shared__ T value;
+    const SMP a{&value};
+    assert(a.operator T*() == &value);
+  }
+}
+
+C2H_TEST("Shared memory pointer", "")
+{
+  test_shared_memory_ptr<<<1, 1>>>();
+  CUDAX_REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
+}