Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions libcudacxx/include/cuda/std/__cccl/builtin.h
Original file line number Diff line number Diff line change
Expand Up @@ -233,12 +233,6 @@
# undef _CCCL_BUILTIN_MEMMOVE
#endif // _CCCL_CUDA_COMPILER(NVCC)

#if _CCCL_CHECK_BUILTIN(builtin_operator_new) && _CCCL_CHECK_BUILTIN(builtin_operator_delete) \
&& _CCCL_CUDA_COMPILER(CLANG)
# define _CCCL_BUILTIN_OPERATOR_DELETE(...) __builtin_operator_delete(__VA_ARGS__)
# define _CCCL_BUILTIN_OPERATOR_NEW(...) __builtin_operator_new(__VA_ARGS__)
#endif // _CCCL_CHECK_BUILTIN(builtin_operator_new) && _CCCL_CHECK_BUILTIN(builtin_operator_delete)

#if _CCCL_CHECK_BUILTIN(builtin_prefetch) || _CCCL_COMPILER(GCC)
# define _CCCL_BUILTIN_PREFETCH(...) NV_IF_TARGET(NV_IS_HOST, __builtin_prefetch(__VA_ARGS__);)
#else
Expand Down
48 changes: 48 additions & 0 deletions libcudacxx/include/cuda/std/__fwd/new.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

// Forward declaration of align_val_t so headers can name the aligned
// operator new/delete overload type without pulling in all of <new>.

#ifndef _CUDA_STD___FWD_NEW_H
#define _CUDA_STD___FWD_NEW_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#include <cuda/std/__cstddef/types.h>

#include <cuda/std/__cccl/prologue.h>

// std:: forward declarations

#if _CCCL_HAS_HOST_STD_LIB()
_CCCL_BEGIN_NAMESPACE_STD

// Must match the host <new> declaration exactly ([new.syn]):
// a scoped enum whose underlying type is std::size_t.
enum class align_val_t : ::cuda::std::size_t;

_CCCL_END_NAMESPACE_STD
#endif // _CCCL_HAS_HOST_STD_LIB()

// cuda::std:: forward declarations

_CCCL_BEGIN_NAMESPACE_CUDA_STD

#if _CCCL_HAS_HOST_STD_LIB()
// Alias the host standard library's type so cuda::std::align_val_t is the
// same type as ::std::align_val_t and selects the host aligned
// operator new/delete overloads.
using ::std::align_val_t;
#else // ^^^ _CCCL_HAS_HOST_STD_LIB() ^^^ / vvv !_CCCL_HAS_HOST_STD_LIB() vvv
// No host standard library is available (e.g. NVRTC): ::std::align_val_t does
// not exist, so declare our own type with the standard's representation.
enum class align_val_t : size_t;
#endif // !_CCCL_HAS_HOST_STD_LIB()

_CCCL_END_NAMESPACE_CUDA_STD

#include <cuda/std/__cccl/epilogue.h>

#endif // _CUDA_STD___FWD_NEW_H
6 changes: 0 additions & 6 deletions libcudacxx/include/cuda/std/__internal/features.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,6 @@
#define _LIBCUDACXX_HAS_MONOTONIC_CLOCK() 0
#define _LIBCUDACXX_HAS_SPACESHIP_OPERATOR() 0

#if _CCCL_CUDA_COMPILATION() || __cpp_aligned_new < 201606
# define _LIBCUDACXX_HAS_ALIGNED_ALLOCATION() 0
#else
# define _LIBCUDACXX_HAS_ALIGNED_ALLOCATION() 1
#endif // !_CCCL_CUDA_COMPILATION() && __cpp_aligned_new >= 201606

// We need `is_constant_evaluated` for clang and gcc. MSVC also needs extensive rework
#if !defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED)
# define _LIBCUDACXX_HAS_CONSTEXPR_COMPLEX_OPERATIONS() 0
Expand Down
18 changes: 3 additions & 15 deletions libcudacxx/include/cuda/std/__memory/temporary_buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,26 +46,14 @@ template <class _Tp>
}
while (__n > 0)
{
#if _LIBCUDACXX_HAS_ALIGNED_ALLOCATION()
if (__is_overaligned_for_new(alignof(_Tp)))
if constexpr (alignof(_Tp) > __STDCPP_DEFAULT_NEW_ALIGNMENT__)
{
::cuda::std::align_val_t __al = ::cuda::std::align_val_t(::cuda::std::alignment_of<_Tp>::value);
__r.first = static_cast<_Tp*>(::operator new(__n * sizeof(_Tp), __al));
__r.first = static_cast<_Tp*>(::cuda::std::__cccl_operator_new(__n * sizeof(_Tp), align_val_t{alignof(_Tp)}));
}
else
{
__r.first = static_cast<_Tp*>(::operator new(__n * sizeof(_Tp)));
__r.first = static_cast<_Tp*>(::cuda::std::__cccl_operator_new(__n * sizeof(_Tp)));
}
#else // ^^^ _LIBCUDACXX_HAS_ALIGNED_ALLOCATION() ^^^ / vvv !_LIBCUDACXX_HAS_ALIGNED_ALLOCATION() vvv
if (__is_overaligned_for_new(alignof(_Tp)))
{
// Since aligned operator new is unavailable, return an empty
// buffer rather than one with invalid alignment.
return __r;
}

__r.first = static_cast<_Tp*>(::operator new(__n * sizeof(_Tp)));
#endif // !_LIBCUDACXX_HAS_ALIGNED_ALLOCATION()

if (__r.first)
{
Expand Down
101 changes: 52 additions & 49 deletions libcudacxx/include/cuda/std/__new/allocate.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,105 +22,108 @@
# pragma system_header
#endif // no system header

#include <cuda/std/__cstdlib/aligned_alloc.h>
#include <cuda/std/__fwd/new.h>
#include <cuda/std/__new/bad_alloc.h>
#include <cuda/std/__new/device_new.h>
#include <cuda/std/cstddef>

#if _LIBCUDACXX_HAS_ALIGNED_ALLOCATION() && !_CCCL_COMPILER(NVRTC)
# include <new> // for align_val_t
#endif // _LIBCUDACXX_HAS_ALIGNED_ALLOCATION() !_CCCL_COMPILER(NVRTC)

// clang-cuda only provides device flavors of operator new, so we need to pull in <new> here
#if _CCCL_CUDA_COMPILER(CLANG)
# include <new>
#endif // _CCCL_CUDA_COMPILER(CLANG)

#if !defined(__cpp_sized_deallocation) || __cpp_sized_deallocation < 201309L
# define _LIBCUDACXX_HAS_SIZED_DEALLOCATION() 0
#if __cpp_sized_deallocation >= 201309L
# define _CCCL_HAS_SIZED_DEALLOCATION() 1
#else
# define _LIBCUDACXX_HAS_SIZED_DEALLOCATION() 1
# define _CCCL_HAS_SIZED_DEALLOCATION() 0
#endif

#include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_STD

_CCCL_API constexpr bool __is_overaligned_for_new(size_t __align) noexcept
template <class... _Args>
[[nodiscard]] _CCCL_API void* __cccl_operator_new(_Args... __args)
{
#ifdef __STDCPP_DEFAULT_NEW_ALIGNMENT__
return __align > __STDCPP_DEFAULT_NEW_ALIGNMENT__;
#else // ^^^ __STDCPP_DEFAULT_NEW_ALIGNMENT__ ^^^ / vvv !__STDCPP_DEFAULT_NEW_ALIGNMENT__ vvv
return __align > alignof(max_align_t);
#endif // !__STDCPP_DEFAULT_NEW_ALIGNMENT__
return ::operator new(__args...);
}

template <class... _Args>
_CCCL_API inline void* __cccl_operator_new(_Args... __args)
[[nodiscard]] _CCCL_API void* __cccl_operator_new(size_t __size, align_val_t __align, [[maybe_unused]] _Args... __args)
{
// Those builtins are not usable on device and the tests crash when using them
#if defined(_CCCL_BUILTIN_OPERATOR_NEW)
return _CCCL_BUILTIN_OPERATOR_NEW(__args...);
#else // ^^^ _CCCL_BUILTIN_OPERATOR_NEW ^^^ / vvv !_CCCL_BUILTIN_OPERATOR_NEW vvv
return ::operator new(__args...);
#endif // !_CCCL_BUILTIN_OPERATOR_NEW
#if _CCCL_CUDA_COMPILER(CLANG) && _CCCL_DEVICE_COMPILATION()
void* __ret = ::cuda::std::aligned_alloc(__size, static_cast<size_t>(__align));
if (__ret == nullptr)
{
::cuda::std::__throw_bad_alloc(); // always terminates on device
}
return __ret;
#else // ^^^ clang-cuda in device mode ^^^ / vvv other vvv
return ::operator new(__size, __align, __args...);
#endif // ^^^ other ^^^
}

template <class... _Args>
_CCCL_API inline void __cccl_operator_delete(_Args... __args)
_CCCL_API void __cccl_operator_delete(_Args... __args)
{
// Those builtins are not usable on device and the tests crash when using them
#if defined(_CCCL_BUILTIN_OPERATOR_DELETE)
_CCCL_BUILTIN_OPERATOR_DELETE(__args...);
#else // ^^^ _CCCL_BUILTIN_OPERATOR_DELETE ^^^ / vvv !_CCCL_BUILTIN_OPERATOR_DELETE vvv
::operator delete(__args...);
#endif // !_CCCL_BUILTIN_OPERATOR_DELETE
}

Comment on lines +63 to 67
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am highly sceptical that this works across our support matrix. Did you try CTK 12.5 and CTK 12.0?

#if _LIBCUDACXX_HAS_ALIGNED_ALLOCATION()
using ::std::align_val_t;
#endif // _LIBCUDACXX_HAS_ALIGNED_ALLOCATION()
template <class... _Args>
_CCCL_API void __cccl_operator_delete(void* __ptr, align_val_t __align, _Args... __args)
{
#if _CCCL_CUDA_COMPILER(CLANG) && _CCCL_DEVICE_COMPILATION()
::cuda::std::free(__ptr);
#else // ^^^ clang-cuda in device mode ^^^ / vvv other vvv
return ::operator delete(__ptr, __align, __args...);
#endif // ^^^ other ^^^
}

#if _CCCL_HAS_SIZED_DEALLOCATION()
template <class... _Args>
_CCCL_API void __cccl_operator_delete(void* __ptr, size_t __size, align_val_t __align, _Args... __args)
{
# if _CCCL_CUDA_COMPILER(CLANG) && _CCCL_DEVICE_COMPILATION()
::cuda::std::free(__ptr);
# else // ^^^ clang-cuda in device mode ^^^ / vvv other vvv
return ::operator delete(__ptr, __size, __align, __args...);
# endif // ^^^ other ^^^
}
#endif // _CCCL_HAS_SIZED_DEALLOCATION()

_CCCL_API inline void* __cccl_allocate(size_t __size, [[maybe_unused]] size_t __align)
[[nodiscard]] _CCCL_API inline void* __cccl_allocate(size_t __size, size_t __align)
{
#if _LIBCUDACXX_HAS_ALIGNED_ALLOCATION()
if (::cuda::std::__is_overaligned_for_new(__align))
if (__align > __STDCPP_DEFAULT_NEW_ALIGNMENT__)
{
const align_val_t __align_val = static_cast<align_val_t>(__align);
return ::cuda::std::__cccl_operator_new(__size, __align_val);
}
#endif // _LIBCUDACXX_HAS_ALIGNED_ALLOCATION()
return ::cuda::std::__cccl_operator_new(__size);
}

template <class... _Args>
_CCCL_API inline void __do_deallocate_handle_size(void* __ptr, [[maybe_unused]] size_t __size, _Args... __args)
{
#if _LIBCUDACXX_HAS_SIZED_DEALLOCATION()
#if _CCCL_HAS_SIZED_DEALLOCATION()
return ::cuda::std::__cccl_operator_delete(__ptr, __size, __args...);
#else // ^^^ _LIBCUDACXX_HAS_SIZED_DEALLOCATION() ^^^ / vvv !_LIBCUDACXX_HAS_SIZED_DEALLOCATION() vvv
#else // ^^^ _CCCL_HAS_SIZED_DEALLOCATION() ^^^ / vvv !_CCCL_HAS_SIZED_DEALLOCATION() vvv
return ::cuda::std::__cccl_operator_delete(__ptr, __args...);
#endif // !_LIBCUDACXX_HAS_SIZED_DEALLOCATION()
#endif // !_CCCL_HAS_SIZED_DEALLOCATION()
}

_CCCL_API inline void __cccl_deallocate(void* __ptr, size_t __size, [[maybe_unused]] size_t __align)
_CCCL_API inline void __cccl_deallocate(void* __ptr, size_t __size, size_t __align)
{
#if _LIBCUDACXX_HAS_ALIGNED_ALLOCATION()
if (::cuda::std::__is_overaligned_for_new(__align))
if (__align > __STDCPP_DEFAULT_NEW_ALIGNMENT__)
{
const align_val_t __align_val = static_cast<align_val_t>(__align);
return ::cuda::std::__do_deallocate_handle_size(__ptr, __size, __align_val);
}
#endif // _LIBCUDACXX_HAS_ALIGNED_ALLOCATION()
return ::cuda::std::__do_deallocate_handle_size(__ptr, __size);
}

_CCCL_API inline void __cccl_deallocate_unsized(void* __ptr, [[maybe_unused]] size_t __align)
_CCCL_API inline void __cccl_deallocate_unsized(void* __ptr, size_t __align)
{
#if _LIBCUDACXX_HAS_ALIGNED_ALLOCATION()
if (::cuda::std::__is_overaligned_for_new(__align))
if (__align > __STDCPP_DEFAULT_NEW_ALIGNMENT__)
{
const align_val_t __align_val = static_cast<align_val_t>(__align);
return ::cuda::std::__cccl_operator_delete(__ptr, __align_val);
}
#endif // _LIBCUDACXX_HAS_ALIGNED_ALLOCATION()
return ::cuda::std::__cccl_operator_delete(__ptr);
}

Expand Down
1 change: 1 addition & 0 deletions libcudacxx/include/cuda/std/__new/bad_alloc.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#endif // no system header

#include <cuda/std/__exception/terminate.h>
#include <cuda/std/__fwd/new.h>

#if _CCCL_HAS_EXCEPTIONS()
# include <new>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,20 +24,6 @@

TEST_DIAG_SUPPRESS_MSVC(4324) // structure was padded due to alignment specifier

#if _LIBCUDACXX_HAS_ALIGNED_ALLOCATION()
static const bool UsingAlignedNew = true;
#else
static const bool UsingAlignedNew = false;
#endif

#ifdef __STDCPP_DEFAULT_NEW_ALIGNMENT__
TEST_GLOBAL_VARIABLE const cuda::std::size_t MaxAligned = __STDCPP_DEFAULT_NEW_ALIGNMENT__;
#else
TEST_GLOBAL_VARIABLE const cuda::std::size_t MaxAligned = cuda::std::alignment_of<cuda::std::max_align_t>::value;
#endif

TEST_GLOBAL_VARIABLE const cuda::std::size_t OverAligned = MaxAligned * 2;

TEST_GLOBAL_VARIABLE int AlignedType_constructed = 0;

template <cuda::std::size_t Align>
Expand Down Expand Up @@ -65,8 +51,8 @@ __host__ __device__ void test_aligned()
AlignedType_constructed = 0;
globalMemCounter.reset();
cuda::std::allocator<T> a;
const bool IsOverAlignedType = Align > MaxAligned;
const bool ExpectAligned = IsOverAlignedType && UsingAlignedNew;
const bool IsOverAlignedType = Align > __STDCPP_DEFAULT_NEW_ALIGNMENT__;
const bool ExpectAligned = IsOverAlignedType;
{
assert(globalMemCounter.checkOutstandingNewEq(0));
assert(AlignedType_constructed == 0);
Expand Down Expand Up @@ -114,19 +100,19 @@ int main(int, char**)
test_aligned<4>();
test_aligned<8>();
test_aligned<16>();
test_aligned<MaxAligned>();
test_aligned<OverAligned>();
test_aligned<OverAligned * 2>();
test_aligned<__STDCPP_DEFAULT_NEW_ALIGNMENT__>();
test_aligned<__STDCPP_DEFAULT_NEW_ALIGNMENT__ * 2>();
test_aligned<__STDCPP_DEFAULT_NEW_ALIGNMENT__ * 4>();

#if defined(_CCCL_HAS_CONSTEXPR_ALLOCATION)
static_assert(test_aligned_constexpr<1>());
static_assert(test_aligned_constexpr<2>());
static_assert(test_aligned_constexpr<4>());
static_assert(test_aligned_constexpr<8>());
static_assert(test_aligned_constexpr<16>());
static_assert(test_aligned_constexpr<MaxAligned>());
static_assert(test_aligned_constexpr<OverAligned>());
static_assert(test_aligned_constexpr<OverAligned * 2>());
static_assert(test_aligned_constexpr<__STDCPP_DEFAULT_NEW_ALIGNMENT__>());
static_assert(test_aligned_constexpr<__STDCPP_DEFAULT_NEW_ALIGNMENT__ * 2>());
static_assert(test_aligned_constexpr<__STDCPP_DEFAULT_NEW_ALIGNMENT__ * 4>());
#endif // _CCCL_HAS_CONSTEXPR_ALLOCATION

return 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,6 @@

#include "count_new.h"

#if _LIBCUDACXX_HAS_ALIGNED_ALLOCATION()
static const bool UsingAlignedNew = true;
#else
static const bool UsingAlignedNew = false;
#endif

#ifdef __STDCPP_DEFAULT_NEW_ALIGNMENT__
static const cuda::std::size_t MaxAligned = __STDCPP_DEFAULT_NEW_ALIGNMENT__;
#else
static const cuda::std::size_t MaxAligned = cuda::std::alignment_of<cuda::std::max_align_t>::value;
#endif

static const cuda::std::size_t OverAligned = MaxAligned * 2;

template <cuda::std::size_t Align>
struct alignas(Align) AlignedType
{
Expand Down Expand Up @@ -62,8 +48,8 @@ __host__ __device__ void test_aligned()
T::constructed = 0;
globalMemCounter.reset();
cuda::std::allocator<T> a;
const bool IsOverAlignedType = Align > MaxAligned;
const bool ExpectAligned = IsOverAlignedType && UsingAlignedNew;
const bool IsOverAlignedType = Align > __STDCPP_DEFAULT_NEW_ALIGNMENT__;
const bool ExpectAligned = IsOverAlignedType;
{
assert(globalMemCounter.checkOutstandingNewEq(0));
assert(T::constructed == 0);
Expand Down Expand Up @@ -107,17 +93,17 @@ int main(int, char**)
test_aligned<4>();
test_aligned<8>();
test_aligned<16>();
test_aligned<MaxAligned>();
test_aligned<OverAligned>();
test_aligned<OverAligned * 2>();
test_aligned<__STDCPP_DEFAULT_NEW_ALIGNMENT__>();
test_aligned<__STDCPP_DEFAULT_NEW_ALIGNMENT__ * 2>();
test_aligned<__STDCPP_DEFAULT_NEW_ALIGNMENT__ * 4>();

static_assert(test_aligned_constexpr<1>());
static_assert(test_aligned_constexpr<2>());
static_assert(test_aligned_constexpr<4>());
static_assert(test_aligned_constexpr<8>());
static_assert(test_aligned_constexpr<16>());
static_assert(test_aligned_constexpr<MaxAligned>());
static_assert(test_aligned_constexpr<OverAligned>());
static_assert(test_aligned_constexpr<OverAligned * 2>());
static_assert(test_aligned_constexpr<__STDCPP_DEFAULT_NEW_ALIGNMENT__>());
static_assert(test_aligned_constexpr<__STDCPP_DEFAULT_NEW_ALIGNMENT__ * 2>());
static_assert(test_aligned_constexpr<__STDCPP_DEFAULT_NEW_ALIGNMENT__ * 4>());
return 0;
}
Loading
Loading