6 changes: 0 additions & 6 deletions libcudacxx/include/cuda/std/__cccl/builtin.h
@@ -233,12 +233,6 @@
# undef _CCCL_BUILTIN_MEMMOVE
#endif // _CCCL_CUDA_COMPILER(NVCC)

#if _CCCL_CHECK_BUILTIN(builtin_operator_new) && _CCCL_CHECK_BUILTIN(builtin_operator_delete) \
&& _CCCL_CUDA_COMPILER(CLANG)
# define _CCCL_BUILTIN_OPERATOR_DELETE(...) __builtin_operator_delete(__VA_ARGS__)
# define _CCCL_BUILTIN_OPERATOR_NEW(...) __builtin_operator_new(__VA_ARGS__)
#endif // _CCCL_CHECK_BUILTIN(builtin_operator_new) && _CCCL_CHECK_BUILTIN(builtin_operator_delete)

#if _CCCL_CHECK_BUILTIN(builtin_prefetch) || _CCCL_COMPILER(GCC)
# define _CCCL_BUILTIN_PREFETCH(...) NV_IF_TARGET(NV_IS_HOST, __builtin_prefetch(__VA_ARGS__);)
#else
48 changes: 48 additions & 0 deletions libcudacxx/include/cuda/std/__fwd/new.h
@@ -0,0 +1,48 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_STD___FWD_NEW_H
#define _CUDA_STD___FWD_NEW_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/std/__cstddef/types.h>

#include <cuda/std/__cccl/prologue.h>

// std:: forward declarations

#if _CCCL_HAS_HOST_STD_LIB()
_CCCL_BEGIN_NAMESPACE_STD

enum class align_val_t : ::cuda::std::size_t;

_CCCL_END_NAMESPACE_STD
#endif // _CCCL_HAS_HOST_STD_LIB()

// cuda::std:: forward declarations

_CCCL_BEGIN_NAMESPACE_CUDA_STD

using ::std::align_val_t;

_CCCL_END_NAMESPACE_CUDA_STD

#include <cuda/std/__cccl/epilogue.h>

#endif // _CUDA_STD___FWD_NEW_H
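
For context (not part of the diff): forward-declaring std::align_val_t with its fixed underlying type lets headers name the type without pulling in all of <new>. An opaque enum declaration with a fixed underlying type is a complete type, so it can even appear in signatures and casts. A minimal sketch of the pattern, using a hypothetical consumer header:

// hypothetical_consumer.h -- illustrative only, not a file in this PR
#include <cuda/std/__fwd/new.h>

// align_val_t is usable here thanks to the opaque enum declaration above;
// no include of <new> is required by this header.
void* allocate_overaligned(::cuda::std::size_t bytes, ::cuda::std::align_val_t align);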
6 changes: 0 additions & 6 deletions libcudacxx/include/cuda/std/__internal/features.h
@@ -24,12 +24,6 @@
#define _LIBCUDACXX_HAS_MONOTONIC_CLOCK() 0
#define _LIBCUDACXX_HAS_SPACESHIP_OPERATOR() 0

#if _CCCL_CUDA_COMPILATION() || __cpp_aligned_new < 201606
# define _LIBCUDACXX_HAS_ALIGNED_ALLOCATION() 0
#else
# define _LIBCUDACXX_HAS_ALIGNED_ALLOCATION() 1
#endif // !_CCCL_CUDA_COMPILATION() && __cpp_aligned_new >= 201606

// We need `is_constant_evaluated` for clang and gcc. MSVC also needs extensive rework
#if !defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED)
# define _LIBCUDACXX_HAS_CONSTEXPR_COMPLEX_OPERATIONS() 0
13 changes: 1 addition & 12 deletions libcudacxx/include/cuda/std/__memory/temporary_buffer.h
@@ -46,8 +46,7 @@ template <class _Tp>
}
while (__n > 0)
{
#if _LIBCUDACXX_HAS_ALIGNED_ALLOCATION()
if (__is_overaligned_for_new(alignof(_Tp)))
if constexpr (alignof(_Tp) > __STDCPP_DEFAULT_NEW_ALIGNMENT__)
{
::cuda::std::align_val_t __al = ::cuda::std::align_val_t(::cuda::std::alignment_of<_Tp>::value);
__r.first = static_cast<_Tp*>(::operator new(__n * sizeof(_Tp), __al));
@@ -56,16 +55,6 @@
{
__r.first = static_cast<_Tp*>(::operator new(__n * sizeof(_Tp)));
}
#else // ^^^ _LIBCUDACXX_HAS_ALIGNED_ALLOCATION() ^^^ / vvv !_LIBCUDACXX_HAS_ALIGNED_ALLOCATION() vvv
if (__is_overaligned_for_new(alignof(_Tp)))
{
// Since aligned operator new is unavailable, return an empty
// buffer rather than one with invalid alignment.
return __r;
}

__r.first = static_cast<_Tp*>(::operator new(__n * sizeof(_Tp)));
#endif // !_LIBCUDACXX_HAS_ALIGNED_ALLOCATION()

if (__r.first)
{
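As an aside (hedged, not from the diff): the temporary_buffer change above relies on __STDCPP_DEFAULT_NEW_ALIGNMENT__ being a compile-time constant, so the over-aligned branch can be selected with if constexpr and discarded entirely for ordinarily aligned types. A standalone illustration of the same pattern in plain C++17:

#include <new>

template <class T>
void* allocate_one()
{
  // Both operands are compile-time constants, so only one branch is instantiated.
  if constexpr (alignof(T) > __STDCPP_DEFAULT_NEW_ALIGNMENT__)
  {
    return ::operator new(sizeof(T), std::align_val_t{alignof(T)});
  }
  else
  {
    return ::operator new(sizeof(T));
  }
}
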
101 changes: 52 additions & 49 deletions libcudacxx/include/cuda/std/__new/allocate.h
@@ -22,105 +22,108 @@
# pragma system_header
#endif // no system header

#include <cuda/std/__cstdlib/aligned_alloc.h>
#include <cuda/std/__fwd/new.h>
#include <cuda/std/__new/bad_alloc.h>
#include <cuda/std/__new/device_new.h>
#include <cuda/std/cstddef>

#if _LIBCUDACXX_HAS_ALIGNED_ALLOCATION() && !_CCCL_COMPILER(NVRTC)
# include <new> // for align_val_t
#endif // _LIBCUDACXX_HAS_ALIGNED_ALLOCATION() !_CCCL_COMPILER(NVRTC)

// clang-cuda only provides device flavors of operator new, so we need to pull in <new> here
#if _CCCL_CUDA_COMPILER(CLANG)
# include <new>
#endif // _CCCL_CUDA_COMPILER(CLANG)

#if !defined(__cpp_sized_deallocation) || __cpp_sized_deallocation < 201309L
# define _LIBCUDACXX_HAS_SIZED_DEALLOCATION() 0
#if __cpp_sized_deallocation >= 201309L
# define _CCCL_HAS_SIZED_DEALLOCATION() 1
#else
# define _LIBCUDACXX_HAS_SIZED_DEALLOCATION() 1
# define _CCCL_HAS_SIZED_DEALLOCATION() 0
#endif

#include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_STD

_CCCL_API constexpr bool __is_overaligned_for_new(size_t __align) noexcept
template <class... _Args>
[[nodiscard]] _CCCL_API void* __cccl_operator_new(_Args... __args)
{
#ifdef __STDCPP_DEFAULT_NEW_ALIGNMENT__
return __align > __STDCPP_DEFAULT_NEW_ALIGNMENT__;
#else // ^^^ __STDCPP_DEFAULT_NEW_ALIGNMENT__ ^^^ / vvv !__STDCPP_DEFAULT_NEW_ALIGNMENT__ vvv
return __align > alignof(max_align_t);
#endif // !__STDCPP_DEFAULT_NEW_ALIGNMENT__
return ::operator new(__args...);
}

template <class... _Args>
_CCCL_API inline void* __cccl_operator_new(_Args... __args)
[[nodiscard]] _CCCL_API void* __cccl_operator_new(size_t __size, align_val_t __align, [[maybe_unused]] _Args... __args)
{
// Those builtins are not usable on device and the tests crash when using them
#if defined(_CCCL_BUILTIN_OPERATOR_NEW)
return _CCCL_BUILTIN_OPERATOR_NEW(__args...);
#else // ^^^ _CCCL_BUILTIN_OPERATOR_NEW ^^^ / vvv !_CCCL_BUILTIN_OPERATOR_NEW vvv
return ::operator new(__args...);
#endif // !_CCCL_BUILTIN_OPERATOR_NEW
#if _CCCL_CUDA_COMPILER(CLANG) && _CCCL_DEVICE_COMPILATION()
void* __ret = ::cuda::std::aligned_alloc(__size, static_cast<size_t>(__align));
if (__ret == nullptr)
{
::cuda::std::__throw_bad_alloc(); // always terminates on device
}
return __ret;
#else // ^^^ clang-cuda in device mode ^^^ / vvv other vvv
return ::operator new(__size, __align, __args...);
#endif // ^^^ other ^^^
}

template <class... _Args>
_CCCL_API inline void __cccl_operator_delete(_Args... __args)
_CCCL_API void __cccl_operator_delete(_Args... __args)
{
// Those builtins are not usable on device and the tests crash when using them
#if defined(_CCCL_BUILTIN_OPERATOR_DELETE)
_CCCL_BUILTIN_OPERATOR_DELETE(__args...);
#else // ^^^ _CCCL_BUILTIN_OPERATOR_DELETE ^^^ / vvv !_CCCL_BUILTIN_OPERATOR_DELETE vvv
::operator delete(__args...);
#endif // !_CCCL_BUILTIN_OPERATOR_DELETE
}

Comment on lines +63 to 67
Contributor

I am highly sceptical that this works across our support matrix. Did you try CTK 12.5 and CTK 12.0?
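
For reference (not from the PR): the device branch above falls back to cuda::std::aligned_alloc / cuda::std::free, while the host branch calls the aligned and sized forms of the global operator new/delete. Those host overloads are only declared when the corresponding feature-test macros are set, which is presumably what the support-matrix concern is about; a minimal host-side probe, independent of this PR, would be:

#include <new>

// C++17 aligned allocation: ::operator new(size_t, std::align_val_t) and matching deletes
#if !defined(__cpp_aligned_new) || __cpp_aligned_new < 201606L
#  error "toolchain does not declare the align_val_t overloads of operator new/delete"
#endif

// C++14 sized deallocation: ::operator delete(void*, size_t[, std::align_val_t])
#if !defined(__cpp_sized_deallocation) || __cpp_sized_deallocation < 201309L
// sized delete not available; only the unsized overloads may be called
#endif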

#if _LIBCUDACXX_HAS_ALIGNED_ALLOCATION()
using ::std::align_val_t;
#endif // _LIBCUDACXX_HAS_ALIGNED_ALLOCATION()
#if _CCCL_HAS_SIZED_DEALLOCATION()
template <class... _Args>
_CCCL_API void __cccl_operator_delete(void* __ptr, size_t __size, align_val_t __align, _Args... __args)
{
# if _CCCL_CUDA_COMPILER(CLANG) && _CCCL_DEVICE_COMPILATION()
::cuda::std::free(__ptr);
# else // ^^^ clang-cuda in device mode ^^^ / vvv other vvv
return ::operator delete(__ptr, __size, __align, __args...);
# endif // ^^^ other ^^^
}
#else // ^^^ _CCCL_HAS_SIZED_DEALLOCATION() ^^^ / vvv !_CCCL_HAS_SIZED_DEALLOCATION() vvv
template <class... _Args>
_CCCL_API void __cccl_operator_delete(void* __ptr, align_val_t __align, _Args... __args)
{
# if _CCCL_CUDA_COMPILER(CLANG) && _CCCL_DEVICE_COMPILATION()
::cuda::std::free(__ptr);
# else // ^^^ clang-cuda in device mode ^^^ / vvv other vvv
return ::operator delete(__ptr, __align, __args...);
# endif // ^^^ other ^^^
}
#endif // ^^^ !_CCCL_HAS_SIZED_DEALLOCATION() ^^^

_CCCL_API inline void* __cccl_allocate(size_t __size, [[maybe_unused]] size_t __align)
[[nodiscard]] _CCCL_API inline void* __cccl_allocate(size_t __size, size_t __align)
{
#if _LIBCUDACXX_HAS_ALIGNED_ALLOCATION()
if (::cuda::std::__is_overaligned_for_new(__align))
if (__align > __STDCPP_DEFAULT_NEW_ALIGNMENT__)
{
const align_val_t __align_val = static_cast<align_val_t>(__align);
return ::cuda::std::__cccl_operator_new(__size, __align_val);
}
#endif // _LIBCUDACXX_HAS_ALIGNED_ALLOCATION()
return ::cuda::std::__cccl_operator_new(__size);
}

template <class... _Args>
_CCCL_API inline void __do_deallocate_handle_size(void* __ptr, [[maybe_unused]] size_t __size, _Args... __args)
{
#if _LIBCUDACXX_HAS_SIZED_DEALLOCATION()
#if _CCCL_HAS_SIZED_DEALLOCATION()
return ::cuda::std::__cccl_operator_delete(__ptr, __size, __args...);
#else // ^^^ _LIBCUDACXX_HAS_SIZED_DEALLOCATION() ^^^ / vvv !_LIBCUDACXX_HAS_SIZED_DEALLOCATION() vvv
#else // ^^^ _CCCL_HAS_SIZED_DEALLOCATION() ^^^ / vvv !_CCCL_HAS_SIZED_DEALLOCATION() vvv
return ::cuda::std::__cccl_operator_delete(__ptr, __args...);
#endif // !_LIBCUDACXX_HAS_SIZED_DEALLOCATION()
#endif // !_CCCL_HAS_SIZED_DEALLOCATION()
}

_CCCL_API inline void __cccl_deallocate(void* __ptr, size_t __size, [[maybe_unused]] size_t __align)
_CCCL_API inline void __cccl_deallocate(void* __ptr, size_t __size, size_t __align)
{
#if _LIBCUDACXX_HAS_ALIGNED_ALLOCATION()
if (::cuda::std::__is_overaligned_for_new(__align))
if (__align > __STDCPP_DEFAULT_NEW_ALIGNMENT__)
{
const align_val_t __align_val = static_cast<align_val_t>(__align);
return ::cuda::std::__do_deallocate_handle_size(__ptr, __size, __align_val);
}
#endif // _LIBCUDACXX_HAS_ALIGNED_ALLOCATION()
return ::cuda::std::__do_deallocate_handle_size(__ptr, __size);
}

_CCCL_API inline void __cccl_deallocate_unsized(void* __ptr, [[maybe_unused]] size_t __align)
_CCCL_API inline void __cccl_deallocate_unsized(void* __ptr, size_t __align)
{
#if _LIBCUDACXX_HAS_ALIGNED_ALLOCATION()
if (::cuda::std::__is_overaligned_for_new(__align))
if (__align > __STDCPP_DEFAULT_NEW_ALIGNMENT__)
{
const align_val_t __align_val = static_cast<align_val_t>(__align);
return ::cuda::std::__cccl_operator_delete(__ptr, __align_val);
}
#endif // _LIBCUDACXX_HAS_ALIGNED_ALLOCATION()
return ::cuda::std::__cccl_operator_delete(__ptr);
}
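
A usage sketch for the helpers above (hedged; Overaligned is a made-up type): the allocate and deallocate calls must agree on size and alignment so that both sides take the same over-aligned path.

#include <cuda/std/__new/allocate.h> // internal header shown in the hunk above

struct alignas(128) Overaligned
{
  float data[32];
};

void example()
{
  void* p = ::cuda::std::__cccl_allocate(sizeof(Overaligned), alignof(Overaligned));
  // ... placement-new an Overaligned object into p and use it ...
  ::cuda::std::__cccl_deallocate(p, sizeof(Overaligned), alignof(Overaligned));
}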

1 change: 1 addition & 0 deletions libcudacxx/include/cuda/std/__new/bad_alloc.h
@@ -23,6 +23,7 @@
#endif // no system header

#include <cuda/std/__exception/terminate.h>
#include <cuda/std/__fwd/new.h>

#if _CCCL_HAS_EXCEPTIONS()
# include <new>
@@ -24,20 +24,6 @@

TEST_DIAG_SUPPRESS_MSVC(4324) // structure was padded due to alignment specifier

#if _LIBCUDACXX_HAS_ALIGNED_ALLOCATION()
static const bool UsingAlignedNew = true;
#else
static const bool UsingAlignedNew = false;
#endif

#ifdef __STDCPP_DEFAULT_NEW_ALIGNMENT__
TEST_GLOBAL_VARIABLE const cuda::std::size_t MaxAligned = __STDCPP_DEFAULT_NEW_ALIGNMENT__;
#else
TEST_GLOBAL_VARIABLE const cuda::std::size_t MaxAligned = cuda::std::alignment_of<cuda::std::max_align_t>::value;
#endif

TEST_GLOBAL_VARIABLE const cuda::std::size_t OverAligned = MaxAligned * 2;

TEST_GLOBAL_VARIABLE int AlignedType_constructed = 0;

template <cuda::std::size_t Align>
@@ -65,8 +51,8 @@ __host__ __device__ void test_aligned()
AlignedType_constructed = 0;
globalMemCounter.reset();
cuda::std::allocator<T> a;
const bool IsOverAlignedType = Align > MaxAligned;
const bool ExpectAligned = IsOverAlignedType && UsingAlignedNew;
const bool IsOverAlignedType = Align > __STDCPP_DEFAULT_NEW_ALIGNMENT__;
const bool ExpectAligned = IsOverAlignedType;
{
assert(globalMemCounter.checkOutstandingNewEq(0));
assert(AlignedType_constructed == 0);
@@ -114,19 +100,19 @@ int main(int, char**)
test_aligned<4>();
test_aligned<8>();
test_aligned<16>();
test_aligned<MaxAligned>();
test_aligned<OverAligned>();
test_aligned<OverAligned * 2>();
test_aligned<__STDCPP_DEFAULT_NEW_ALIGNMENT__>();
test_aligned<__STDCPP_DEFAULT_NEW_ALIGNMENT__ * 2>();
test_aligned<__STDCPP_DEFAULT_NEW_ALIGNMENT__ * 4>();

#if defined(_CCCL_HAS_CONSTEXPR_ALLOCATION)
static_assert(test_aligned_constexpr<1>());
static_assert(test_aligned_constexpr<2>());
static_assert(test_aligned_constexpr<4>());
static_assert(test_aligned_constexpr<8>());
static_assert(test_aligned_constexpr<16>());
static_assert(test_aligned_constexpr<MaxAligned>());
static_assert(test_aligned_constexpr<OverAligned>());
static_assert(test_aligned_constexpr<OverAligned * 2>());
static_assert(test_aligned_constexpr<__STDCPP_DEFAULT_NEW_ALIGNMENT__>());
static_assert(test_aligned_constexpr<__STDCPP_DEFAULT_NEW_ALIGNMENT__ * 2>());
static_assert(test_aligned_constexpr<__STDCPP_DEFAULT_NEW_ALIGNMENT__ * 4>());
#endif // _CCCL_HAS_CONSTEXPR_ALLOCATION

return 0;
@@ -20,20 +20,6 @@

#include "count_new.h"

#if _LIBCUDACXX_HAS_ALIGNED_ALLOCATION()
static const bool UsingAlignedNew = true;
#else
static const bool UsingAlignedNew = false;
#endif

#ifdef __STDCPP_DEFAULT_NEW_ALIGNMENT__
static const cuda::std::size_t MaxAligned = __STDCPP_DEFAULT_NEW_ALIGNMENT__;
#else
static const cuda::std::size_t MaxAligned = cuda::std::alignment_of<cuda::std::max_align_t>::value;
#endif

static const cuda::std::size_t OverAligned = MaxAligned * 2;

template <cuda::std::size_t Align>
struct alignas(Align) AlignedType
{
@@ -62,8 +48,8 @@ __host__ __device__ void test_aligned()
T::constructed = 0;
globalMemCounter.reset();
cuda::std::allocator<T> a;
const bool IsOverAlignedType = Align > MaxAligned;
const bool ExpectAligned = IsOverAlignedType && UsingAlignedNew;
const bool IsOverAlignedType = Align > __STDCPP_DEFAULT_NEW_ALIGNMENT__;
const bool ExpectAligned = IsOverAlignedType;
{
assert(globalMemCounter.checkOutstandingNewEq(0));
assert(T::constructed == 0);
@@ -107,17 +93,17 @@ int main(int, char**)
test_aligned<4>();
test_aligned<8>();
test_aligned<16>();
test_aligned<MaxAligned>();
test_aligned<OverAligned>();
test_aligned<OverAligned * 2>();
test_aligned<__STDCPP_DEFAULT_NEW_ALIGNMENT__>();
test_aligned<__STDCPP_DEFAULT_NEW_ALIGNMENT__ * 2>();
test_aligned<__STDCPP_DEFAULT_NEW_ALIGNMENT__ * 4>();

static_assert(test_aligned_constexpr<1>());
static_assert(test_aligned_constexpr<2>());
static_assert(test_aligned_constexpr<4>());
static_assert(test_aligned_constexpr<8>());
static_assert(test_aligned_constexpr<16>());
static_assert(test_aligned_constexpr<MaxAligned>());
static_assert(test_aligned_constexpr<OverAligned>());
static_assert(test_aligned_constexpr<OverAligned * 2>());
static_assert(test_aligned_constexpr<__STDCPP_DEFAULT_NEW_ALIGNMENT__>());
static_assert(test_aligned_constexpr<__STDCPP_DEFAULT_NEW_ALIGNMENT__ * 2>());
static_assert(test_aligned_constexpr<__STDCPP_DEFAULT_NEW_ALIGNMENT__ * 4>());
return 0;
}