diff --git a/docs/libcudacxx/extended_api/mdspan.rst b/docs/libcudacxx/extended_api/mdspan.rst
index 028ac918aec..ca0582fa0d5 100644
--- a/docs/libcudacxx/extended_api/mdspan.rst
+++ b/docs/libcudacxx/extended_api/mdspan.rst
@@ -10,6 +10,7 @@ Mdspan
    mdspan/host_device_accessor
    mdspan/restrict_accessor
    mdspan/shared_memory_accessor
+   mdspan/mdspan_to_dlpack
 
 .. list-table::
    :widths: 25 45 30 30
@@ -34,3 +35,8 @@ Mdspan
      - ``mdspan`` and accessor for CUDA shared memory
      - CCCL 3.2.0
      - CUDA 13.2
+
+   * - :ref:`mdspan to dlpack <libcudacxx-extended-api-mdspan-mdspan-to-dlpack>`
+     - Convert an ``mdspan`` to a ``DLTensor``
+     - CCCL 3.2.0
+     - CUDA 13.2
diff --git a/docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst b/docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst
new file mode 100644
index 00000000000..1cf91c50ef8
--- /dev/null
+++ b/docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst
@@ -0,0 +1,140 @@
+.. _libcudacxx-extended-api-mdspan-mdspan-to-dlpack:
+
+``mdspan`` to DLPack
+====================
+
+This functionality converts a ``cuda::host_mdspan``, ``cuda::device_mdspan``, or ``cuda::managed_mdspan`` into a non-owning `DLPack <https://github.com/dmlc/dlpack>`__ ``DLTensor`` view.
+
+Defined in the ``<cuda/mdspan>`` header.
+
+Conversion functions
+--------------------
+
+.. code:: cuda
+
+   namespace cuda {
+
+   template <class T, class Extents, class Layout, class Accessor>
+   [[nodiscard]] __dlpack_tensor<Extents::rank()>
+   to_dlpack_tensor(const cuda::host_mdspan<T, Extents, Layout, Accessor>& mdspan);
+
+   template <class T, class Extents, class Layout, class Accessor>
+   [[nodiscard]] __dlpack_tensor<Extents::rank()>
+   to_dlpack_tensor(const cuda::device_mdspan<T, Extents, Layout, Accessor>& mdspan);
+
+   template <class T, class Extents, class Layout, class Accessor>
+   [[nodiscard]] __dlpack_tensor<Extents::rank()>
+   to_dlpack_tensor(const cuda::managed_mdspan<T, Extents, Layout, Accessor>& mdspan);
+
+   } // namespace cuda
+
+Types
+-----
+
+``__dlpack_tensor`` is an internal class that stores a ``DLTensor`` and owns the backing storage for its ``shape`` and ``strides`` pointers. The class does not use any heap allocation.
+
+.. code:: cuda
+
+   namespace cuda {
+
+   template <size_t Rank>
+   class __dlpack_tensor {
+   public:
+     __dlpack_tensor() noexcept;
+     __dlpack_tensor(const __dlpack_tensor&) noexcept;
+     __dlpack_tensor(__dlpack_tensor&&) noexcept;
+     __dlpack_tensor& operator=(const __dlpack_tensor&) noexcept;
+     __dlpack_tensor& operator=(__dlpack_tensor&&) noexcept;
+     ~__dlpack_tensor() noexcept = default;
+
+     DLTensor& get() & noexcept [[lifetimebound]];
+     const DLTensor& get() const & noexcept [[lifetimebound]];
+
+     DLTensor& get() && noexcept = delete;
+     const DLTensor& get() const && noexcept = delete;
+   };
+
+   } // namespace cuda
+
+.. note:: **Lifetime**
+
+   The ``DLTensor`` returned by ``get()`` must not outlive the ``cuda::__dlpack_tensor`` wrapper. If the wrapper is destroyed or moved from, the ``DLTensor::shape`` and ``DLTensor::strides`` pointers dangle.
+
+.. note:: **Const-correctness**
+
+   ``DLTensor::data`` points at ``mdspan.data_handle()`` (or is ``nullptr`` if ``mdspan.size() == 0``). If ``T`` is ``const``, the pointer is ``const_cast``'d because ``DLTensor::data`` is unqualified.
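+A minimal sketch of the lifetime rule above (``md`` is any of the supported ``mdspan`` types):
+
+.. code:: cuda
+
+   // The rvalue-qualified get() overloads are deleted, so a temporary wrapper
+   // cannot leak a DLTensor whose shape/strides would dangle:
+   // auto& t = cuda::to_dlpack_tensor(md).get(); // ill-formed by design
+
+   // Keep the wrapper alive for as long as the DLTensor is used:
+   auto wrapper = cuda::to_dlpack_tensor(md);
+   DLTensor& t = wrapper.get();
+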
+Semantics
+---------
+
+The conversion produces a non-owning DLPack view of the ``mdspan`` data and metadata:
+
+- ``DLTensor::ndim`` is ``mdspan.rank()``.
+- For rank > 0, ``DLTensor::shape[i]`` is ``mdspan.extent(i)``.
+- For rank > 0, ``DLTensor::strides[i]`` is ``mdspan.stride(i)``.
+- ``DLTensor::byte_offset`` is always ``0``.
+- ``DLTensor::device`` is:
+
+  - ``{kDLCPU, 0}`` for ``cuda::host_mdspan``
+  - ``{kDLCUDA, /*device_id*/}`` for ``cuda::device_mdspan``
+  - ``{kDLCUDAManaged, 0}`` for ``cuda::managed_mdspan``
+
+Element types are mapped to ``DLDataType`` according to the DLPack conventions, including:
+
+- ``bool``.
+- Signed and unsigned integers.
+- IEEE-754 floating-point and extended-precision floating-point, including ``__half``, ``__nv_bfloat16``, ``__float128``, and FP8, FP6, and FP4 when available.
+- Complex: ``cuda::std::complex<__half>``, ``cuda::std::complex<float>``, and ``cuda::std::complex<double>``.
+- `CUDA built-in vector types <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#built-in-vector-types>`__, such as ``int2``, ``float4``, etc.
+- Vector types for extended floating-point, such as ``__half2``, ``__nv_fp8x4_e4m3``, etc.
+
+Constraints
+-----------
+
+- The accessor ``data_handle_type`` must be a pointer type.
+
+Runtime errors
+--------------
+
+- If any ``extent(i)`` or ``stride(i)`` cannot be represented in ``int64_t``, the conversion throws a ``std::invalid_argument`` exception.
+
+Availability notes
+------------------
+
+- This API is available only when the DLPack header is present, namely when ``dlpack/dlpack.h`` is found in the include path.
+- This API can be disabled by defining ``CCCL_DISABLE_DLPACK`` before including any library headers. In this case, ``dlpack/dlpack.h`` will not be included.
+
+References
+----------
+
+- `DLPack C API <https://dmlc.github.io/dlpack/latest/c_api.html>`__ documentation.
+
+Example
+-------
+
+.. code:: cuda
+
+   #include <cassert>
+   #include <cuda/mdspan>
+   #include <cuda/std/mdspan>
+   #include <dlpack/dlpack.h>
+
+   int main() {
+     using extents_t = cuda::std::extents<size_t, 2, 3>;
+
+     int data[6] = {0, 1, 2, 3, 4, 5};
+     cuda::host_mdspan<int, extents_t> md{data, extents_t{}};
+
+     auto dl = cuda::to_dlpack_tensor(md);
+     const auto& dltensor = dl.get();
+     // Note: copying the result (`auto dltensor = dl.get();`) copies the
+     // DLTensor, but the copy's `shape`/`strides` still point into `dl`, so the
+     // wrapper must stay alive either way.
+
+     // `dl` owns the shape/stride storage; `dltensor.data` is a non-owning pointer to `data`.
+     assert(dltensor.device.device_type == kDLCPU);
+     assert(dltensor.ndim == 2);
+     assert(dltensor.shape[0] == 2 && dltensor.shape[1] == 3);
+     assert(dltensor.strides[0] == 3 && dltensor.strides[1] == 1);
+     assert(dltensor.data == data);
+   }
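+A second sketch, converting a ``cuda::device_mdspan``; the allocation size and extents are illustrative, and ``cudaMalloc`` error handling is omitted:
+
+.. code:: cuda
+
+   #include <cuda/mdspan>
+   #include <cuda/std/mdspan>
+   #include <dlpack/dlpack.h>
+
+   int main() {
+     using extents_t = cuda::std::dims<2>;
+
+     float* data = nullptr;
+     cudaMalloc(&data, 6 * sizeof(float));
+     cuda::device_mdspan<float, extents_t> md{data, extents_t{2, 3}};
+
+     auto dl = cuda::to_dlpack_tensor(md);
+     // The device ordinal is queried from the pointer itself.
+     return dl.get().device.device_type == kDLCUDA ? 0 : 1;
+   }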
diff --git a/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h b/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h
new file mode 100644
index 00000000000..b6ed93f4b53
--- /dev/null
+++ b/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h
@@ -0,0 +1,311 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___MDSPAN_MDSPAN_TO_DLPACK_H
+#define _CUDA___MDSPAN_MDSPAN_TO_DLPACK_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_HAS_DLPACK()
+
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+
+#  include <dlpack/dlpack.h>
+
+#  include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA
+
+template <class _ElementType>
+[[nodiscard]] _CCCL_HOST_API inline ::DLDataType __data_type_to_dlpack() noexcept
+{
+  if constexpr (::cuda::std::is_same_v<_ElementType, bool>)
+  {
+    return ::DLDataType{::kDLBool, 8, 1};
+  }
+  //--------------------------------------------------------------------------------------------------------------------
+  // Integer types (signed and unsigned)
+  else if constexpr (::cuda::std::__cccl_is_integer_v<_ElementType>)
+  {
+    return ::DLDataType{
+      (::cuda::std::is_signed_v<_ElementType>) ? ::kDLInt : ::kDLUInt, ::cuda::std::__num_bits_v<_ElementType>, 1};
+  }
+  //--------------------------------------------------------------------------------------------------------------------
+  // bfloat16 (must come before general floating-point)
+#  if _CCCL_HAS_NVBF16()
+  else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_bfloat16>)
+  {
+    return ::DLDataType{::kDLBfloat, 16, 1};
+  }
+#  endif // _CCCL_HAS_NVBF16()
+  //--------------------------------------------------------------------------------------------------------------------
+  // Low-precision floating-point types (must come before general floating-point)
+#  if _CCCL_HAS_NVFP8_E4M3()
+  else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_fp8_e4m3>)
+  {
+    return ::DLDataType{::kDLFloat8_e4m3fn, 8, 1};
+  }
+#  endif // _CCCL_HAS_NVFP8_E4M3()
+#  if _CCCL_HAS_NVFP8_E5M2()
+  else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_fp8_e5m2>)
+  {
+    return ::DLDataType{::kDLFloat8_e5m2, 8, 1};
+  }
+#  endif // _CCCL_HAS_NVFP8_E5M2()
+#  if _CCCL_HAS_NVFP8_E8M0()
+  else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_fp8_e8m0>)
+  {
+    return ::DLDataType{::kDLFloat8_e8m0fnu, 8, 1};
+  }
+#  endif // _CCCL_HAS_NVFP8_E8M0()
+#  if _CCCL_HAS_NVFP6_E2M3()
+  else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_fp6_e2m3>)
+  {
+    return ::DLDataType{::kDLFloat6_e2m3fn, 6, 1};
+  }
+#  endif // _CCCL_HAS_NVFP6_E2M3()
+#  if _CCCL_HAS_NVFP6_E3M2()
+  else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_fp6_e3m2>)
+  {
+    return ::DLDataType{::kDLFloat6_e3m2fn, 6, 1};
+  }
+#  endif // _CCCL_HAS_NVFP6_E3M2()
+#  if _CCCL_HAS_NVFP4_E2M1()
+  else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_fp4_e2m1>)
+  {
+    return ::DLDataType{::kDLFloat4_e2m1fn, 4, 1};
+  }
+#  endif // _CCCL_HAS_NVFP4_E2M1()
+  //--------------------------------------------------------------------------------------------------------------------
+  // Floating-point types (after the more specific types above)
+  else if constexpr (::cuda::is_floating_point_v<_ElementType>)
+  {
+    return ::DLDataType{::kDLFloat, ::cuda::std::__num_bits_v<_ElementType>, 1};
+  }
+  //--------------------------------------------------------------------------------------------------------------------
+  // Complex types
+  // 256-bit data types are not supported in DLPack, e.g. cuda::std::complex<__float128>
+  else if constexpr (::cuda::std::__is_cuda_std_complex_v<_ElementType> && sizeof(_ElementType) <= sizeof(double) * 2)
+  {
+    // DLPack encodes complex numbers as a compact struct of two scalar values, and `bits` stores
+    // the size of the full complex number (e.g. std::complex<float> => bits=64).
+    return ::DLDataType{::kDLComplex, sizeof(_ElementType) * CHAR_BIT, 1};
+  }
+  //--------------------------------------------------------------------------------------------------------------------
+  // CUDA built-in vector types
+#  if _CCCL_HAS_CTK()
+  else if constexpr (::cuda::__is_vector_type_v<_ElementType> || ::cuda::__is_extended_fp_vector_type_v<_ElementType>)
+  {
+    constexpr ::cuda::std::uint16_t __lanes = ::cuda::std::tuple_size_v<_ElementType>;
+    if constexpr (__lanes == 2 || __lanes == 4)
+    {
+      using __scalar_t = ::cuda::std::remove_cv_t<::cuda::std::tuple_element_t<0, _ElementType>>;
+      auto __scalar    = ::cuda::__data_type_to_dlpack<__scalar_t>();
+      __scalar.lanes   = __lanes;
+      return __scalar;
+    }
+    else
+    {
+      static_assert(::cuda::std::__always_false_v<_ElementType>, "Unsupported vector type");
+      return ::DLDataType{};
+    }
+  }
+#  endif // _CCCL_HAS_CTK()
+  //--------------------------------------------------------------------------------------------------------------------
+  // Unsupported types
+  else
+  {
+    static_assert(::cuda::std::__always_false_v<_ElementType>, "Unsupported type");
+    return ::DLDataType{};
+  }
+}
+
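+// Illustration (not part of the original header): mappings produced by the
+// helper above, assuming 32-bit int and the CUDA built-in float4:
+//   __data_type_to_dlpack<int>()    -> DLDataType{kDLInt,     32, 1}
+//   __data_type_to_dlpack<float4>() -> DLDataType{kDLFloat,   32, 4}
+//   __data_type_to_dlpack<::cuda::std::complex<float>>()
+//                                   -> DLDataType{kDLComplex, 64, 1}
+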
+template <::cuda::std::size_t _Rank>
+class __dlpack_tensor
+{
+  ::cuda::std::array<::cuda::std::int64_t, _Rank> __shape{};
+  ::cuda::std::array<::cuda::std::int64_t, _Rank> __strides{};
+  ::DLTensor __tensor{};
+
+  _CCCL_HOST_API void __update_tensor() noexcept
+  {
+    __tensor.shape   = _Rank > 0 ? __shape.data() : nullptr;
+    __tensor.strides = _Rank > 0 ? __strides.data() : nullptr;
+  }
+
+public:
+  _CCCL_HOST_API explicit __dlpack_tensor() noexcept
+  {
+    __update_tensor();
+  }
+
+  _CCCL_HOST_API __dlpack_tensor(const __dlpack_tensor& __other) noexcept
+      : __shape{__other.__shape}
+      , __strides{__other.__strides}
+      , __tensor{__other.__tensor}
+  {
+    __update_tensor();
+  }
+
+  _CCCL_HOST_API __dlpack_tensor(__dlpack_tensor&& __other) noexcept
+      : __shape{::cuda::std::move(__other.__shape)}
+      , __strides{::cuda::std::move(__other.__strides)}
+      , __tensor{__other.__tensor}
+  {
+    __other.__tensor = ::DLTensor{};
+    __update_tensor();
+  }
+
+  _CCCL_HOST_API __dlpack_tensor& operator=(const __dlpack_tensor& __other) noexcept
+  {
+    if (this == &__other)
+    {
+      return *this;
+    }
+    __shape   = __other.__shape;
+    __strides = __other.__strides;
+    __tensor  = __other.__tensor;
+    __update_tensor();
+    return *this;
+  }
+
+  _CCCL_HOST_API __dlpack_tensor& operator=(__dlpack_tensor&& __other) noexcept
+  {
+    if (this == &__other)
+    {
+      return *this;
+    }
+    __shape          = ::cuda::std::move(__other.__shape);
+    __strides        = ::cuda::std::move(__other.__strides);
+    __tensor         = __other.__tensor;
+    __other.__tensor = ::DLTensor{};
+    __update_tensor();
+    return *this;
+  }
+
+  _CCCL_HIDE_FROM_ABI ~__dlpack_tensor() noexcept = default;
+
+  [[nodiscard]] _CCCL_HOST_API ::DLTensor& get() & noexcept _CCCL_LIFETIMEBOUND
+  {
+    return __tensor;
+  }
+
+  [[nodiscard]] _CCCL_HOST_API ::DLTensor& get() && noexcept = delete;
+
+  [[nodiscard]] _CCCL_HOST_API const ::DLTensor& get() const& noexcept _CCCL_LIFETIMEBOUND
+  {
+    return __tensor;
+  }
+
+  [[nodiscard]] _CCCL_HOST_API const ::DLTensor& get() const&& noexcept = delete;
+};
+
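+// `__tensor.shape`/`__tensor.strides` point into the wrapper's own member
+// arrays, which is why every constructor and assignment operator above ends by
+// re-running `__update_tensor()`: after `__dlpack_tensor<2> __b = __a;`, the
+// copy `__b` refers to its own storage and never aliases `__a`'s.
+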
+template <class _ElementType, class _Extents, class _Layout, class _Accessor>
+[[nodiscard]] _CCCL_HOST_API __dlpack_tensor<_Extents::rank()>
+__to_dlpack(const ::cuda::std::mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan,
+            ::DLDeviceType __device_type,
+            int __device_id)
+{
+  static_assert(::cuda::std::is_pointer_v<typename _Accessor::data_handle_type>, "data_handle_type must be a pointer");
+  using __element_type = ::cuda::std::remove_cv_t<_ElementType>;
+  __dlpack_tensor<_Extents::rank()> __wrapper{};
+  auto& __tensor  = __wrapper.get();
+  __tensor.data   = __mdspan.size() > 0 ? const_cast<__element_type*>(__mdspan.data_handle()) : nullptr;
+  __tensor.device = ::DLDevice{__device_type, __device_id};
+  __tensor.ndim   = static_cast<::cuda::std::int32_t>(__mdspan.rank());
+  __tensor.dtype  = ::cuda::__data_type_to_dlpack<::cuda::std::remove_cv_t<_ElementType>>();
+  if constexpr (_Extents::rank() > 0)
+  {
+    constexpr auto __max_extent = ::cuda::std::numeric_limits<::cuda::std::int64_t>::max();
+    for (::cuda::std::size_t __i = 0; __i < __mdspan.rank(); ++__i)
+    {
+      if (::cuda::std::cmp_greater(__mdspan.extent(__i), __max_extent))
+      {
+        _CCCL_THROW(::std::invalid_argument{"Extent is too large"});
+      }
+      if (::cuda::std::cmp_greater(__mdspan.stride(__i), __max_extent))
+      {
+        _CCCL_THROW(::std::invalid_argument{"Stride is too large"});
+      }
+      __tensor.shape[__i]   = static_cast<::cuda::std::int64_t>(__mdspan.extent(__i));
+      __tensor.strides[__i] = static_cast<::cuda::std::int64_t>(__mdspan.stride(__i));
+    }
+  }
+  __tensor.byte_offset = 0;
+  return __wrapper;
+}
+
+/***********************************************************************************************************************
+ * Public API
+ **********************************************************************************************************************/
+
+template <class _ElementType, class _Extents, class _Layout, class _Accessor>
+[[nodiscard]] _CCCL_HOST_API __dlpack_tensor<_Extents::rank()>
+to_dlpack_tensor(const ::cuda::host_mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan)
+{
+  using __mdspan_type = ::cuda::std::mdspan<_ElementType, _Extents, _Layout, _Accessor>;
+  return ::cuda::__to_dlpack(__mdspan_type{__mdspan}, ::kDLCPU, 0);
+}
+
+template <class _ElementType, class _Extents, class _Layout, class _Accessor>
+[[nodiscard]] _CCCL_HOST_API __dlpack_tensor<_Extents::rank()>
+to_dlpack_tensor(const ::cuda::device_mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan)
+{
+  using __mdspan_type = ::cuda::std::mdspan<_ElementType, _Extents, _Layout, _Accessor>;
+  ::CUpointer_attribute __attrs[1] = {::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL};
+  int __ptr_dev_id                 = 0;
+  void* __results[1]               = {&__ptr_dev_id};
+  const auto __status = ::cuda::__driver::__pointerGetAttributesNoThrow(__attrs, __results, __mdspan.data_handle());
+  if (__status != ::cudaSuccess)
+  {
+    _CCCL_THROW(::std::invalid_argument{"Failed to get device ordinal of a pointer"});
+  }
+  return ::cuda::__to_dlpack(__mdspan_type{__mdspan}, ::kDLCUDA, __ptr_dev_id);
+}
+
+template <class _ElementType, class _Extents, class _Layout, class _Accessor>
+[[nodiscard]] _CCCL_HOST_API __dlpack_tensor<_Extents::rank()>
+to_dlpack_tensor(const ::cuda::managed_mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan)
+{
+  using __mdspan_type = ::cuda::std::mdspan<_ElementType, _Extents, _Layout, _Accessor>;
+  return ::cuda::__to_dlpack(__mdspan_type{__mdspan}, ::kDLCUDAManaged, 0);
+}
+
+_CCCL_END_NAMESPACE_CUDA
+
+#  include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CCCL_HAS_DLPACK()
+#endif // _CUDA___MDSPAN_MDSPAN_TO_DLPACK_H
diff --git a/libcudacxx/include/cuda/__type_traits/vector_type.h b/libcudacxx/include/cuda/__type_traits/vector_type.h
index 7e21b8a806e..59badd8162c 100644
--- a/libcudacxx/include/cuda/__type_traits/vector_type.h
+++ b/libcudacxx/include/cuda/__type_traits/vector_type.h
@@ -24,7 +24,6 @@
 #if _CCCL_HAS_CTK()
 #  include
-#  include
 #  include
 
 #  if !_CCCL_CUDA_COMPILATION()
@@ -340,16 +339,204 @@ template
 }
 }
 
+_CCCL_SUPPRESS_DEPRECATED_PUSH
+
 template <class _Tp, ::cuda::std::size_t _Size>
 using __vector_type_t = decltype(::cuda::__cccl_vector_type_t_impl<_Tp, _Size>());
 
 template <class _Tp, ::cuda::std::size_t _Size>
 inline constexpr bool __has_vector_type_v = !::cuda::std::is_same_v<__vector_type_t<_Tp, _Size>, void>;
 
+template <class _Tp>
+inline constexpr bool __is_vector_type_v = false;
+
+template <>
+inline constexpr bool __is_vector_type_v<::char1> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::char2> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::char3> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::char4> = true;
+
+template <>
+inline constexpr bool __is_vector_type_v<::uchar1> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::uchar2> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::uchar3> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::uchar4> = true;
+
+template <>
+inline constexpr bool __is_vector_type_v<::short1> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::short2> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::short3> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::short4> = true;
+
+template <>
+inline constexpr bool __is_vector_type_v<::ushort1> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::ushort2> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::ushort3> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::ushort4> = true;
+
+template <>
+inline constexpr bool __is_vector_type_v<::int1> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::int2> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::int3> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::int4> = true;
+
+template <>
+inline constexpr bool __is_vector_type_v<::uint1> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::uint2> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::uint3> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::uint4> = true;
+
+template <>
+inline constexpr bool __is_vector_type_v<::long1> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::long2> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::long3> = true;
+#  if _CCCL_CTK_AT_LEAST(13, 0)
+template <>
+inline constexpr bool __is_vector_type_v<::long4_16a> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::long4_32a> = true;
+#  endif // ^^^ _CCCL_CTK_AT_LEAST(13, 0)
+template <>
+inline constexpr bool __is_vector_type_v<::long4> = true;
+
+template <>
+inline constexpr bool __is_vector_type_v<::ulong1> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::ulong2> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::ulong3> = true;
+#  if _CCCL_CTK_AT_LEAST(13, 0)
+template <>
+inline constexpr bool __is_vector_type_v<::ulong4_16a> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::ulong4_32a> = true;
+#  endif // ^^^ _CCCL_CTK_AT_LEAST(13, 0)
+template <>
+inline constexpr bool __is_vector_type_v<::ulong4> = true;
+
+template <>
+inline constexpr bool __is_vector_type_v<::longlong1> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::longlong2> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::longlong3> = true;
+#  if _CCCL_CTK_AT_LEAST(13, 0)
+template <>
+inline constexpr bool __is_vector_type_v<::longlong4_16a> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::longlong4_32a> = true;
+#  endif // ^^^ _CCCL_CTK_AT_LEAST(13, 0)
+template <>
+inline constexpr bool __is_vector_type_v<::longlong4> = true;
+
+template <>
+inline constexpr bool __is_vector_type_v<::ulonglong1> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::ulonglong2> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::ulonglong3> = true;
+#  if _CCCL_CTK_AT_LEAST(13, 0)
+template <>
+inline constexpr
bool __is_vector_type_v<::ulonglong4_16a> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::ulonglong4_32a> = true;
+#  endif // ^^^ _CCCL_CTK_AT_LEAST(13, 0)
+template <>
+inline constexpr bool __is_vector_type_v<::ulonglong4> = true;
+
+template <>
+inline constexpr bool __is_vector_type_v<::float1> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::float2> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::float3> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::float4> = true;
+
+template <>
+inline constexpr bool __is_vector_type_v<::double1> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::double2> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::double3> = true;
+#  if _CCCL_CTK_AT_LEAST(13, 0)
+template <>
+inline constexpr bool __is_vector_type_v<::double4_16a> = true;
+template <>
+inline constexpr bool __is_vector_type_v<::double4_32a> = true;
+#  endif // ^^^ _CCCL_CTK_AT_LEAST(13, 0)
+template <>
+inline constexpr bool __is_vector_type_v<::double4> = true;
+
+template <>
+inline constexpr bool __is_vector_type_v<::dim3> = true;
+
+template <class _Tp>
+inline constexpr bool __is_extended_fp_vector_type_v = false;
+
+#  if _CCCL_HAS_NVFP16()
+template <>
+inline constexpr bool __is_extended_fp_vector_type_v<::__half2> = true;
+#  endif // _CCCL_HAS_NVFP16()
+
+#  if _CCCL_HAS_NVBF16()
+template <>
+inline constexpr bool __is_extended_fp_vector_type_v<::__nv_bfloat162> = true;
+#  endif // _CCCL_HAS_NVBF16()
+
+#  if _CCCL_HAS_NVFP8()
+template <>
+inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp8x2_e4m3> = true;
+template <>
+inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp8x2_e5m2> = true;
+template <>
+inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp8x4_e4m3> = true;
+template <>
+inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp8x4_e5m2> = true;
+#    if _CCCL_CTK_AT_LEAST(12, 8)
+template <>
+inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp8x2_e8m0> = true;
+template <>
+inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp8x4_e8m0> = true;
+#    endif // _CCCL_CTK_AT_LEAST(12, 8)
+#  endif // _CCCL_HAS_NVFP8()
+
+#  if _CCCL_HAS_NVFP6()
+template <>
+inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp6x2_e2m3> = true;
+template <>
+inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp6x2_e3m2> = true;
+template <>
+inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp6x4_e2m3> = true;
+template <>
+inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp6x4_e3m2> = true;
+#  endif // _CCCL_HAS_NVFP6()
+
+#  if _CCCL_HAS_NVFP4()
+template <>
+inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp4x2_e2m1> = true;
+template <>
+inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp4x4_e2m1> = true;
+#  endif // _CCCL_HAS_NVFP4()
+
+_CCCL_SUPPRESS_DEPRECATED_POP
+
 _CCCL_END_NAMESPACE_CUDA
 
 #  include
 
 #endif // !_CCCL_HAS_CTK()
-
 #endif // _CUDA__TYPE_TRAITS_VECTOR_TYPE_H
diff --git a/libcudacxx/include/cuda/mdspan b/libcudacxx/include/cuda/mdspan
index ae81a30219a..3129198d02a 100644
--- a/libcudacxx/include/cuda/mdspan
+++ b/libcudacxx/include/cuda/mdspan
@@ -22,6 +22,7 @@
 #endif // no system header
 
 #include
+#include <cuda/__mdspan/mdspan_to_dlpack.h>
 #include
 #include
 #include
diff --git a/libcudacxx/include/cuda/std/__tuple_dir/vector_types.h b/libcudacxx/include/cuda/std/__tuple_dir/vector_types.h
index a44443ed091..125b6b87492 100644
--- a/libcudacxx/include/cuda/std/__tuple_dir/vector_types.h
+++ b/libcudacxx/include/cuda/std/__tuple_dir/vector_types.h
@@ -121,6 +121,26 @@ _LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__half, __half, 2)
 #  if
 _CCCL_HAS_NVBF16()
 _LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_bfloat16, __nv_bfloat16, 2)
 #  endif // _CCCL_HAS_NVBF16()
+#  if _CCCL_HAS_NVFP8()
+_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp8x, __nv_fp8_e5m2, 2, _e5m2)
+_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp8x, __nv_fp8_e5m2, 4, _e5m2)
+_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp8x, __nv_fp8_e4m3, 2, _e4m3)
+_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp8x, __nv_fp8_e4m3, 4, _e4m3)
+#    if _CCCL_CTK_AT_LEAST(12, 8)
+_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp8x, __nv_fp8_e8m0, 2, _e8m0)
+_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp8x, __nv_fp8_e8m0, 4, _e8m0)
+#    endif // _CCCL_CTK_AT_LEAST(12, 8)
+#  endif // _CCCL_HAS_NVFP8()
+#  if _CCCL_HAS_NVFP6()
+_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp6x, __nv_fp6_e3m2, 2, _e3m2)
+_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp6x, __nv_fp6_e3m2, 4, _e3m2)
+_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp6x, __nv_fp6_e2m3, 2, _e2m3)
+_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp6x, __nv_fp6_e2m3, 4, _e2m3)
+#  endif // _CCCL_HAS_NVFP6()
+#  if _CCCL_HAS_NVFP4()
+_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp4x, __nv_fp4_e2m1, 2, _e2m1)
+_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp4x, __nv_fp4_e2m1, 4, _e2m1)
+#  endif // _CCCL_HAS_NVFP4()
 
 template
 struct __get_element;
@@ -273,6 +293,26 @@ _LIBCUDACXX_SPECIALIZE_GET(__half2, __half)
 #  if _CCCL_HAS_NVBF16()
 _LIBCUDACXX_SPECIALIZE_GET(__nv_bfloat162, __nv_bfloat16)
 #  endif // _CCCL_HAS_NVBF16()
+#  if _CCCL_HAS_NVFP8()
+_LIBCUDACXX_SPECIALIZE_GET(__nv_fp8x2_e5m2, __nv_fp8_e5m2)
+_LIBCUDACXX_SPECIALIZE_GET(__nv_fp8x4_e5m2, __nv_fp8_e5m2)
+_LIBCUDACXX_SPECIALIZE_GET(__nv_fp8x2_e4m3, __nv_fp8_e4m3)
+_LIBCUDACXX_SPECIALIZE_GET(__nv_fp8x4_e4m3, __nv_fp8_e4m3)
+#    if _CCCL_CTK_AT_LEAST(12, 8)
+_LIBCUDACXX_SPECIALIZE_GET(__nv_fp8x2_e8m0, __nv_fp8_e8m0)
+_LIBCUDACXX_SPECIALIZE_GET(__nv_fp8x4_e8m0, __nv_fp8_e8m0)
+#    endif // _CCCL_CTK_AT_LEAST(12, 8)
+#  endif // _CCCL_HAS_NVFP8()
+#  if _CCCL_HAS_NVFP6()
+_LIBCUDACXX_SPECIALIZE_GET(__nv_fp6x2_e3m2, __nv_fp6_e3m2)
+_LIBCUDACXX_SPECIALIZE_GET(__nv_fp6x4_e3m2, __nv_fp6_e3m2)
+_LIBCUDACXX_SPECIALIZE_GET(__nv_fp6x2_e2m3, __nv_fp6_e2m3)
+_LIBCUDACXX_SPECIALIZE_GET(__nv_fp6x4_e2m3, __nv_fp6_e2m3)
+#  endif // _CCCL_HAS_NVFP6()
+#  if _CCCL_HAS_NVFP4()
+_LIBCUDACXX_SPECIALIZE_GET(__nv_fp4x2_e2m1, __nv_fp4_e2m1)
+_LIBCUDACXX_SPECIALIZE_GET(__nv_fp4x4_e2m1, __nv_fp4_e2m1)
+#  endif // _CCCL_HAS_NVFP4()
 
 _CCCL_END_NAMESPACE_CUDA_STD
diff --git a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.pass.cpp b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.pass.cpp
new file mode 100644
index 00000000000..955a75c85ab
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.pass.cpp
@@ -0,0 +1,527 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: nvrtc
+
+#include <cstdint>
+#include <cuda/mdspan>
+#include <cuda/std/array>
+#include <cuda/std/cassert>
+#include <cuda/std/complex>
+#include <cuda/std/utility>
+
+#include "test_macros.h"
+
+void check_datatype(const DLDataType& dt, uint8_t code, uint8_t bits, uint16_t lanes)
+{
+  assert(dt.code == code);
+  assert(dt.bits == bits);
+  assert(dt.lanes == lanes);
+}
+
+bool test_mdspan_to_dlpack_host_layout_right()
+{
+  using extents_t = cuda::std::extents<cuda::std::size_t, 2, 3>;
+  int data[6]     = {0, 1, 2, 3, 4, 5};
+  cuda::host_mdspan<int, extents_t> md{data, extents_t{}};
+  auto dlpack_wrapper  = cuda::to_dlpack_tensor(md);
+  const auto& dltensor = dlpack_wrapper.get();
+
+  assert(dltensor.device.device_type == kDLCPU);
+  assert(dltensor.device.device_id == 0);
+  assert(dltensor.ndim == 2);
+  check_datatype(dltensor.dtype, kDLInt, 32, 1);
+  assert(dltensor.shape != nullptr);
+  assert(dltensor.strides != nullptr);
+  assert(dltensor.shape[0] == 2);
+  assert(dltensor.shape[1] == 3);
+  assert(dltensor.strides[0] == 3);
+  assert(dltensor.strides[1] == 1);
+  assert(dltensor.byte_offset == 0);
+  assert(dltensor.data == data);
+  return true;
+}
+
+bool test_mdspan_to_dlpack_host_layout_left()
+{
+  using extents_t = cuda::std::extents<cuda::std::size_t, 2, 3>;
+  int data[6]     = {0, 1, 2, 3, 4, 5};
+  cuda::host_mdspan<int, extents_t, cuda::std::layout_left> md{data, extents_t{}};
+  auto dlpack_wrapper  = cuda::to_dlpack_tensor(md);
+  const auto& dltensor = dlpack_wrapper.get();
+
+  assert(dltensor.device.device_type == kDLCPU);
+  assert(dltensor.device.device_id == 0);
+  check_datatype(dltensor.dtype, kDLInt, 32, 1);
+  assert(dltensor.ndim == 2);
+  assert(dltensor.shape != nullptr);
+  assert(dltensor.strides != nullptr);
+  assert(dltensor.shape[0] == 2);
+  assert(dltensor.shape[1] == 3);
+  assert(dltensor.strides[0] == 1);
+  assert(dltensor.strides[1] == 2);
+  assert(dltensor.byte_offset == 0);
+  assert(dltensor.data == data);
+  return true;
+}
+
+bool test_mdspan_to_dlpack_empty_size()
+{
+  using extents_t = cuda::std::dims<2>;
+  int data[1] = {42};
+  cuda::host_mdspan<int, extents_t> m{data, extents_t{0, 3}};
+  auto dlpack_wrapper  = cuda::to_dlpack_tensor(m);
+  const auto& dltensor = dlpack_wrapper.get();
+
+  assert(dltensor.device.device_type == kDLCPU);
+  assert(dltensor.device.device_id == 0);
+  check_datatype(dltensor.dtype, kDLInt, 32, 1);
+  assert(dltensor.ndim == 2);
+  assert(dltensor.shape[0] == 0);
+  assert(dltensor.shape[1] == 3);
+  assert(dltensor.strides[0] == 3);
+  assert(dltensor.strides[1] == 1);
+  assert(dltensor.byte_offset == 0);
+  assert(dltensor.data == nullptr); // size() == 0 => nullptr
+  return true;
+}
+
+bool test_mdspan_to_dlpack_rank_0()
+{
+  using extents_t = cuda::std::extents<cuda::std::size_t>;
+  int data[1] = {7};
+  cuda::host_mdspan<int, extents_t> md{data, extents_t{}};
+  auto dlpack_wrapper  = cuda::to_dlpack_tensor(md);
+  const auto& dltensor = dlpack_wrapper.get();
+
+  assert(dltensor.device.device_type == kDLCPU);
+  assert(dltensor.device.device_id == 0);
+  check_datatype(dltensor.dtype, kDLInt, 32, 1);
+  assert(dltensor.ndim == 0);
+  assert(dltensor.shape == nullptr);
+  assert(dltensor.strides == nullptr);
+  assert(dltensor.byte_offset == 0);
+  assert(dltensor.data == data); // rank-0 mdspan has size() == 1
+  return true;
+}
+
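+// Illustration (not part of the original test set): how a DLPack consumer might
+// recover the element count from a converted tensor; this also holds for the
+// rank-0 case above, matching mdspan::size().
+inline int64_t dltensor_num_elements(const DLTensor& t)
+{
+  int64_t n = 1;
+  for (int32_t i = 0; i < t.ndim; ++i)
+  {
+    n *= t.shape[i]; // shape is non-null whenever ndim > 0
+  }
+  return n;
+}
+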
+bool test_mdspan_to_dlpack_const_pointer()
+{
+  using extents_t = cuda::std::dims<3>;
+  // 2 * 3 * 4 = 24 elements so the view does not exceed the backing array.
+  const int data[24] = {0, 1, 2, 3, 4, 5};
+  cuda::host_mdspan<const int, extents_t> md{data, extents_t{2, 3, 4}};
+  auto dlpack_wrapper  = cuda::to_dlpack_tensor(md);
+  const auto& dltensor = dlpack_wrapper.get();
+
+  assert(dltensor.device.device_type == kDLCPU);
+  assert(dltensor.device.device_id == 0);
+  check_datatype(dltensor.dtype, kDLInt, 32, 1);
+  assert(dltensor.ndim == 3);
+  assert(dltensor.shape[0] == 2);
+  assert(dltensor.shape[1] == 3);
+  assert(dltensor.shape[2] == 4);
+  assert(dltensor.strides[0] == 12);
+  assert(dltensor.strides[1] == 4);
+  assert(dltensor.strides[2] == 1);
+  assert(dltensor.byte_offset == 0);
+  assert(dltensor.data == data); // const element type: DLTensor::data is const_cast'd
+  return true;
+}
+
+bool test_mdspan_to_dlpack_device()
+{
+  using extents_t = cuda::std::extents<cuda::std::size_t, 2, 3>;
+  float* data = nullptr;
+  assert(cudaMalloc(&data, 6 * sizeof(float)) == cudaSuccess);
+  cuda::device_mdspan<float, extents_t> md{data, extents_t{}};
+  auto dlpack_wrapper  = cuda::to_dlpack_tensor(md);
+  const auto& dltensor = dlpack_wrapper.get();
+
+  assert(dltensor.device.device_type == kDLCUDA);
+  assert(dltensor.device.device_id == 0);
+  assert(dltensor.ndim == 2);
+  check_datatype(dltensor.dtype, kDLFloat, 32, 1);
+  assert(dltensor.shape[0] == 2);
+  assert(dltensor.shape[1] == 3);
+  assert(dltensor.strides[0] == 3);
+  assert(dltensor.strides[1] == 1);
+  assert(dltensor.byte_offset == 0);
+  assert(dltensor.data == data);
+  return true;
+}
+
+bool test_mdspan_to_dlpack_managed()
+{
+  using extents_t = cuda::std::extents<cuda::std::size_t, 2, 3>;
+  float* data = nullptr;
+  assert(cudaMallocManaged(&data, 6 * sizeof(float)) == cudaSuccess);
+  cuda::managed_mdspan<float, extents_t> md{data, extents_t{}};
+  auto dlpack_wrapper  = cuda::to_dlpack_tensor(md);
+  const auto& dltensor = dlpack_wrapper.get();
+
+  assert(dltensor.device.device_type == kDLCUDAManaged);
+  assert(dltensor.device.device_id == 0);
+  assert(dltensor.ndim == 2);
+  check_datatype(dltensor.dtype, kDLFloat, 32, 1);
+  assert(dltensor.shape[0] == 2);
+  assert(dltensor.shape[1] == 3);
+  assert(dltensor.strides[0] == 3);
+  assert(dltensor.strides[1] == 1);
+  assert(dltensor.byte_offset == 0);
+  assert(dltensor.data == data);
+  return true;
+}
+
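+// Sketch (not part of the original patch): exercising the std::invalid_argument
+// path for extents that do not fit in int64_t. Assumes exceptions are enabled
+// (TEST_HAS_NO_EXCEPTIONS comes from test_macros.h) and that <stdexcept> is
+// reachable; it would be invoked from main() alongside the tests above.
+#ifndef TEST_HAS_NO_EXCEPTIONS
+bool test_mdspan_to_dlpack_extent_overflow()
+{
+  using extents_t = cuda::std::extents<cuda::std::uint64_t, cuda::std::dynamic_extent>;
+  int dummy[1] = {};
+  // An extent of 2^63 is representable in uint64_t but not in DLTensor::shape's int64_t.
+  cuda::host_mdspan<int, extents_t> md{dummy, extents_t{cuda::std::uint64_t{1} << 63}};
+  try
+  {
+    auto dl = cuda::to_dlpack_tensor(md);
+    (void) dl;
+    assert(false);
+  }
+  catch (const std::invalid_argument&)
+  {
+  }
+  return true;
+}
+#endif // TEST_HAS_NO_EXCEPTIONS
+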
+template <class ListT>
+struct test_mdspan_to_dlpack_types_fn;
+
+// The expected-type array is sized by the type list so that each list entry has
+// a matching expected DLDataType.
+template <class... Ts>
+struct test_mdspan_to_dlpack_types_fn<cuda::std::__type_list<Ts...>>
+{
+  using list_t = cuda::std::__type_list<Ts...>;
+
+  static constexpr cuda::std::size_t num_types = sizeof...(Ts);
+
+  cuda::std::array<DLDataType, num_types> expected_types;
+
+  template <cuda::std::size_t index>
+  void call_impl() const
+  {
+    using T         = cuda::std::__type_at_c<index, list_t>;
+    using extents_t = cuda::std::extents<cuda::std::size_t, 2, 3>;
+    T data[6] = {};
+    cuda::host_mdspan<T, extents_t> md{data, extents_t{}};
+    auto dlpack_wrapper  = cuda::to_dlpack_tensor(md);
+    const auto& dltensor = dlpack_wrapper.get();
+
+    auto type = expected_types[index];
+    check_datatype(dltensor.dtype, type.code, type.bits, type.lanes);
+  }
+
+  template <cuda::std::size_t... indices>
+  void call(cuda::std::index_sequence<indices...>) const
+  {
+    (call_impl<indices>(), ...);
+  }
+};
+
+bool test_mdspan_to_dlpack_basic_types()
+{
+  using list_t = cuda::std::__type_list<
+    bool,
+    // Signed integer types
+    signed char,
+    short,
+    int,
+    long,
+    long long,
+#if _CCCL_HAS_INT128()
+    __int128_t,
+#endif
+    // Unsigned integer types
+    unsigned char,
+    unsigned short,
+    unsigned int,
+    unsigned long,
+    unsigned long long,
+#if _CCCL_HAS_INT128()
+    __uint128_t,
+#endif
+    // Floating-point types
+    float,
+    double
+#if _CCCL_HAS_FLOAT128()
+    ,
+    __float128
+#endif
+    >;
+  cuda::std::array expected_types = {
+    DLDataType{kDLBool, 8, 1},
+    // Signed integer types
+    DLDataType{kDLInt, 8, 1},
+    DLDataType{kDLInt, 16, 1},
+    DLDataType{kDLInt, 32, 1},
+    DLDataType{kDLInt, sizeof(long) * 8, 1},
+    DLDataType{kDLInt, 64, 1},
+#if _CCCL_HAS_INT128()
+    DLDataType{kDLInt, 128, 1},
+#endif
+    // Unsigned integer types
+    DLDataType{kDLUInt, 8, 1},
+    DLDataType{kDLUInt, 16, 1},
+    DLDataType{kDLUInt, 32, 1},
+    DLDataType{kDLUInt, sizeof(unsigned long) * 8, 1},
+    DLDataType{kDLUInt, 64, 1},
+#if _CCCL_HAS_INT128()
+    DLDataType{kDLUInt, 128, 1},
+#endif
+    // Floating-point types
+    DLDataType{kDLFloat, 32, 1},
+    DLDataType{kDLFloat, 64, 1},
+#if _CCCL_HAS_FLOAT128()
+    DLDataType{kDLFloat, 128, 1},
+#endif
+  };
+  test_mdspan_to_dlpack_types_fn<list_t> test_fn{expected_types};
+  test_fn.call(cuda::std::make_index_sequence<decltype(test_fn)::num_types>{});
+  return true;
+}
+
+bool test_mdspan_to_dlpack_extended_fp_and_complex_types()
+{
+  using list_t = cuda::std::__type_list<
+#if _CCCL_HAS_NVFP16()
+    ::__half,
+#endif
+#if _CCCL_HAS_NVBF16()
+    ::__nv_bfloat16,
+#endif
+#if _CCCL_HAS_FLOAT128()
+    __float128,
+#endif
+    // Low-precision floating-point types
+#if _CCCL_HAS_NVFP8_E4M3()
+    ::__nv_fp8_e4m3,
+#endif
+#if _CCCL_HAS_NVFP8_E5M2()
+    ::__nv_fp8_e5m2,
+#endif
+#if _CCCL_HAS_NVFP8_E8M0()
+    ::__nv_fp8_e8m0,
+#endif
+#if _CCCL_HAS_NVFP6_E2M3()
+    ::__nv_fp6_e2m3,
+#endif
+#if _CCCL_HAS_NVFP6_E3M2()
+    ::__nv_fp6_e3m2,
+#endif
+#if _CCCL_HAS_NVFP4_E2M1()
+    ::__nv_fp4_e2m1,
+#endif
+    // Complex types
+#if _CCCL_HAS_NVFP16()
+    cuda::std::complex<::__half>,
+#endif
+    cuda::std::complex<float>,
+    cuda::std::complex<double>>;
+  cuda::std::array expected_types = {
+#if _CCCL_HAS_NVFP16()
+    DLDataType{kDLFloat, 16, 1},
+#endif
+#if _CCCL_HAS_NVBF16()
+    DLDataType{kDLBfloat, 16, 1},
+#endif
+#if _CCCL_HAS_FLOAT128()
+    DLDataType{kDLFloat, 128, 1},
+#endif
+    // Low-precision floating-point types
+#if _CCCL_HAS_NVFP8_E4M3()
+    DLDataType{kDLFloat8_e4m3fn, 8, 1},
+#endif
+#if _CCCL_HAS_NVFP8_E5M2()
+    DLDataType{kDLFloat8_e5m2, 8, 1},
+#endif
+#if _CCCL_HAS_NVFP8_E8M0()
+    DLDataType{kDLFloat8_e8m0fnu, 8, 1},
+#endif
+#if _CCCL_HAS_NVFP6_E2M3()
+    DLDataType{kDLFloat6_e2m3fn, 6, 1},
+#endif
+#if _CCCL_HAS_NVFP6_E3M2()
+    DLDataType{kDLFloat6_e3m2fn, 6, 1},
+#endif
+#if _CCCL_HAS_NVFP4_E2M1()
+    DLDataType{kDLFloat4_e2m1fn, 4, 1},
+#endif
+    // Complex types
+#if _CCCL_HAS_NVFP16()
+    DLDataType{kDLComplex, 32, 1},
+#endif
+    DLDataType{kDLComplex, 64, 1},
+    DLDataType{kDLComplex, 128, 1}};
+  test_mdspan_to_dlpack_types_fn<list_t> test_fn{expected_types};
+  test_fn.call(cuda::std::make_index_sequence<decltype(test_fn)::num_types>{});
+  return true;
+}
+
+#if _CCCL_HAS_CTK()
+bool test_mdspan_to_dlpack_vector_types()
+{
+  using list_t = cuda::std::__type_list<
+    ::char2,
+    ::char4,
+    ::uchar2,
+    ::uchar4,
+    ::short2,
+    ::short4,
+    ::ushort2,
+    ::ushort4,
+    ::int2,
+    ::int4,
+    ::uint2,
+    ::uint4,
+    ::long2
+#  if _CCCL_CTK_AT_LEAST(13, 0)
+    ,
+    ::long4_32a
+#  else
+    ,
+    ::long4
+#  endif
+    ,
+    ::ulong2
+#  if _CCCL_CTK_AT_LEAST(13, 0)
+    ,
+    ::ulong4_32a
+#  else
+    ,
+    ::ulong4
+#  endif
+    ,
+    ::longlong2
+#  if _CCCL_CTK_AT_LEAST(13, 0)
+    ,
+    ::longlong4_32a
+#  else
+    ,
+    ::longlong4
+#  endif
+    ,
+    ::ulonglong2
+#  if _CCCL_CTK_AT_LEAST(13, 0)
+    ,
+    ::ulonglong4_32a
+#  else
+    ,
+    ::ulonglong4
+#  endif
+    ,
+    ::float2,
+    ::float4,
+    ::double2
+#  if _CCCL_CTK_AT_LEAST(13, 0)
+    ,
+    ::double4_32a
+#  else
+    ,
+    ::double4
+#  endif
+    >;
+  cuda::std::array expected_types = {
+    DLDataType{kDLInt, 8, 2},
+    DLDataType{kDLInt, 8, 4},
+    DLDataType{kDLUInt, 8, 2},
+    DLDataType{kDLUInt, 8, 4},
+    DLDataType{kDLInt, 16, 2},
+    DLDataType{kDLInt, 16, 4},
+    DLDataType{kDLUInt, 16, 2},
+    DLDataType{kDLUInt, 16, 4},
+    DLDataType{kDLInt, 32, 2},
+    DLDataType{kDLInt, 32, 4},
+    DLDataType{kDLUInt, 32, 2},
+    DLDataType{kDLUInt, 32, 4},
+    DLDataType{kDLInt, sizeof(long) * 8, 2},
+    DLDataType{kDLInt, sizeof(long) * 8, 4},
+    DLDataType{kDLUInt, sizeof(unsigned long) * 8, 2},
+    DLDataType{kDLUInt, sizeof(unsigned long) * 8, 4},
+    DLDataType{kDLInt, 64, 2},
+    DLDataType{kDLInt, 64, 4},
+    DLDataType{kDLUInt, 64, 2},
+    DLDataType{kDLUInt, 64, 4},
+    DLDataType{kDLFloat, 32, 2},
+    DLDataType{kDLFloat, 32, 4},
+    DLDataType{kDLFloat, 64, 2},
+    DLDataType{kDLFloat, 64, 4}};
+  test_mdspan_to_dlpack_types_fn<list_t> test_fn{expected_types};
+  test_fn.call(cuda::std::make_index_sequence<decltype(test_fn)::num_types>{});
+  return true;
+}
+#endif // _CCCL_HAS_CTK()
+
+bool test_mdspan_to_dlpack_extended_fp_vector_types()
+{
+  using list_t = cuda::std::__type_list<
+#if _CCCL_HAS_NVFP16()
+    ::__half2,
+#endif
+#if _CCCL_HAS_NVBF16()
+    ::__nv_bfloat162,
+#endif
+#if _CCCL_HAS_NVFP8_E4M3()
+    ::__nv_fp8x2_e4m3,
+    ::__nv_fp8x4_e4m3,
+#endif
+#if _CCCL_HAS_NVFP8_E5M2()
+    ::__nv_fp8x2_e5m2,
+    ::__nv_fp8x4_e5m2,
+#endif
+#if _CCCL_HAS_NVFP8_E8M0()
+    ::__nv_fp8x2_e8m0,
+    ::__nv_fp8x4_e8m0,
+#endif
+#if _CCCL_HAS_NVFP6_E2M3()
+    ::__nv_fp6x2_e2m3,
+    ::__nv_fp6x4_e2m3,
+#endif
+#if _CCCL_HAS_NVFP6_E3M2()
+    ::__nv_fp6x2_e3m2,
+    ::__nv_fp6x4_e3m2,
+#endif
+#if _CCCL_HAS_NVFP4_E2M1()
+    ::__nv_fp4x2_e2m1,
+    ::__nv_fp4x4_e2m1,
+#endif
+    void* /* dummy to allow trailing commas */>;
+  cuda::std::array expected_types = {
+#if _CCCL_HAS_NVFP16()
+    DLDataType{kDLFloat, 16, 2},
+#endif
+#if _CCCL_HAS_NVBF16()
+    DLDataType{kDLBfloat, 16, 2},
+#endif
+#if _CCCL_HAS_NVFP8_E4M3()
+    DLDataType{kDLFloat8_e4m3fn, 8, 2},
+    DLDataType{kDLFloat8_e4m3fn, 8, 4},
+#endif
+#if _CCCL_HAS_NVFP8_E5M2()
+    DLDataType{kDLFloat8_e5m2, 8, 2},
+    DLDataType{kDLFloat8_e5m2, 8, 4},
+#endif
+#if _CCCL_HAS_NVFP8_E8M0()
+    DLDataType{kDLFloat8_e8m0fnu, 8, 2},
+    DLDataType{kDLFloat8_e8m0fnu, 8, 4},
+#endif
+#if _CCCL_HAS_NVFP6_E2M3()
+    DLDataType{kDLFloat6_e2m3fn, 6, 2},
+    DLDataType{kDLFloat6_e2m3fn, 6, 4},
+#endif
+#if _CCCL_HAS_NVFP6_E3M2()
+    DLDataType{kDLFloat6_e3m2fn, 6, 2},
+    DLDataType{kDLFloat6_e3m2fn, 6, 4},
+#endif
+#if _CCCL_HAS_NVFP4_E2M1()
+    DLDataType{kDLFloat4_e2m1fn, 4, 2},
+    DLDataType{kDLFloat4_e2m1fn, 4, 4},
+#endif
+    // Placeholder entry matching the trailing void* dummy; never checked below.
+    DLDataType{}};
+  test_mdspan_to_dlpack_types_fn<list_t> test_fn{expected_types};
+  // Skip the trailing void* dummy entry.
+  test_fn.call(cuda::std::make_index_sequence<decltype(test_fn)::num_types - 1>{});
+  return true;
+}
+
+int main(int, char**)
+{
+  NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_host_layout_right());))
+  NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_host_layout_left());))
+  NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_empty_size());))
+  NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_rank_0());))
+  NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_const_pointer());))
+  NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_device());))
+  NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_managed());))
+  NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_basic_types());))
+  NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_extended_fp_and_complex_types());))
+#if _CCCL_HAS_CTK()
+  NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_vector_types());))
+#endif // _CCCL_HAS_CTK()
+  NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_extended_fp_vector_types());))
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.wrapper.fail.cpp b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.wrapper.fail.cpp
new file mode 100644
index 00000000000..562d3757d07
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.wrapper.fail.cpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+#include <cuda/mdspan>
+#include <cuda/std/cassert>
+
+bool test_mdspan_to_dlpack_wrapper_get_lvalue()
+{
+  auto tensor = cuda::__dlpack_tensor<3>{}.get(); // expected to fail: rvalue get() is deleted
+  return true;
+}
+
+int main(int, char**)
+{
+  assert(test_mdspan_to_dlpack_wrapper_get_lvalue());
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.wrapper.pass.cpp b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.wrapper.pass.cpp
new file mode 100644
index 00000000000..ee7016f9556
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.wrapper.pass.cpp
@@ -0,0 +1,214 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: nvrtc
+
+#include <cstdint>
+#include <cuda/mdspan>
+#include <cuda/std/cassert>
+#include <cuda/std/mdspan>
+#include <cuda/std/type_traits>
+#include <cuda/std/utility>
+#include <dlpack/dlpack.h>
+
+#include "test_macros.h"
+
+void check_datatype(const DLDataType& dt, uint8_t code, uint8_t bits, uint16_t lanes)
+{
+  assert(dt.code == code);
+  assert(dt.bits == bits);
+  assert(dt.lanes == lanes);
+}
+
+bool test_mdspan_to_dlpack_wrapper_default_ctor()
+{
+  cuda::__dlpack_tensor<3> dlpack_wrapper{};
+  DLDataType default_dtype = {};
+  DLDevice default_device  = {};
+  auto& tensor             = dlpack_wrapper.get();
+  assert(tensor.device.device_type == default_device.device_type);
+  assert(tensor.device.device_id == default_device.device_id);
+  check_datatype(tensor.dtype, default_dtype.code, default_dtype.bits, default_dtype.lanes);
+  assert(tensor.shape != nullptr);
+  assert(tensor.strides != nullptr);
+  return true;
+}
+
+bool test_dlpack_wrapper_copy_ctor()
+{
+  using extents_t = cuda::std::extents<cuda::std::size_t, 2, 3>;
+  int data[6]     = {0, 1, 2, 3, 4, 5};
+  cuda::host_mdspan<int, extents_t> md{data, extents_t{}};
+  auto w           = cuda::to_dlpack_tensor(md);
+  auto& t          = w.get();
+  auto* shape_ptr   = t.shape;
+  auto* strides_ptr = t.strides;
+
+  auto w2 = w; // copy construct
+  // Copy must not alias the source wrapper's shape/stride storage.
+  auto& t2 = w2.get();
+  assert(t2.shape != nullptr);
+  assert(t2.strides != nullptr);
+  assert(t2.shape != shape_ptr);
+  assert(t2.strides != strides_ptr);
+
+  // Source wrapper must remain intact.
+  assert(t.shape == shape_ptr);
+  assert(t.strides == strides_ptr);
+
+  // Sanity-check copied tensor metadata and values.
+  assert(t2.device.device_type == kDLCPU);
+  assert(t2.device.device_id == 0);
+  assert(t2.ndim == 2);
+  check_datatype(t2.dtype, kDLInt, 32, 1);
+  assert(t2.shape[0] == 2);
+  assert(t2.shape[1] == 3);
+  assert(t2.strides[0] == 3);
+  assert(t2.strides[1] == 1);
+  assert(t2.byte_offset == 0);
+  assert(t2.data == data);
+  return true;
+}
+
+bool test_dlpack_wrapper_move_ctor()
+{
+  using extents_t = cuda::std::extents<cuda::std::size_t, 2, 3>;
+  int data[6]     = {0, 1, 2, 3, 4, 5};
+  cuda::host_mdspan<int, extents_t> md{data, extents_t{}};
+  auto w            = cuda::to_dlpack_tensor(md);
+  auto& t           = w.get();
+  auto* shape_ptr   = t.shape;
+  auto* strides_ptr = t.strides;
+  auto moved        = cuda::std::move(w); // move construct
+
+  // Moved-to wrapper must not keep pointers to moved-from storage.
+  auto& tm = moved.get();
+  assert(tm.shape != nullptr);
+  assert(tm.strides != nullptr);
+  assert(tm.shape != shape_ptr);
+  assert(tm.strides != strides_ptr);
+
+  // Moved-from wrapper is explicitly reset to a default/empty DLTensor.
+  assert(t.shape == nullptr);
+  assert(t.strides == nullptr);
+  assert(t.data == nullptr);
+  assert(t.ndim == 0);
+
+  // Sanity-check moved-to tensor metadata and values.
+  assert(tm.device.device_type == kDLCPU);
+  assert(tm.device.device_id == 0);
+  assert(tm.ndim == 2);
+  check_datatype(tm.dtype, kDLInt, 32, 1);
+  assert(tm.shape[0] == 2);
+  assert(tm.shape[1] == 3);
+  assert(tm.strides[0] == 3);
+  assert(tm.strides[1] == 1);
+  assert(tm.byte_offset == 0);
+  assert(tm.data == data);
+  return true;
+}
+
+bool test_dlpack_wrapper_copy_assignment()
+{
+  using extents_t = cuda::std::extents<cuda::std::size_t, 2, 3>;
+  int data_a[6]   = {0, 1, 2, 3, 4, 5};
+  int data_b[6]   = {6, 7, 8, 9, 10, 11};
+  cuda::host_mdspan<int, extents_t> md_a{data_a, extents_t{}};
+  cuda::host_mdspan<int, extents_t> md_b{data_b, extents_t{}};
+  auto a              = cuda::to_dlpack_tensor(md_a);
+  auto b              = cuda::to_dlpack_tensor(md_b);
+  auto& ta            = a.get();
+  auto& tb            = b.get();
+  auto* b_shape_ptr   = tb.shape;
+  auto* b_strides_ptr = tb.strides;
+
+  b = a; // copy assign
+  // Destination must keep pointing to its own member arrays (not to `a`'s).
+  assert(tb.shape == b_shape_ptr);
+  assert(tb.strides == b_strides_ptr);
+  assert(tb.shape != ta.shape);
+  assert(tb.strides != ta.strides);
+
+  // Values must be copied correctly.
+  assert(tb.data == data_a);
+  assert(tb.ndim == 2);
+  assert(tb.shape[0] == 2);
+  assert(tb.shape[1] == 3);
+  assert(tb.strides[0] == 3);
+  assert(tb.strides[1] == 1);
+  return true;
+}
+
+bool test_dlpack_wrapper_move_assignment()
+{
+  using extents_t = cuda::std::extents<cuda::std::size_t, 2, 3>;
+  int data_a[6]   = {0, 1, 2, 3, 4, 5};
+  int data_b[6]   = {6, 7, 8, 9, 10, 11};
+  cuda::host_mdspan<int, extents_t> md_a{data_a, extents_t{}};
+  cuda::host_mdspan<int, extents_t> md_b{data_b, extents_t{}};
+  auto a              = cuda::to_dlpack_tensor(md_a);
+  auto b              = cuda::to_dlpack_tensor(md_b);
+  auto& ta            = a.get();
+  auto& tb            = b.get();
+  auto* a_shape_ptr   = ta.shape;
+  auto* a_strides_ptr = ta.strides;
+  auto* b_shape_ptr   = tb.shape;
+  auto* b_strides_ptr = tb.strides;
+
+  b = cuda::std::move(a); // move assign
+  // Destination must keep pointing to its own member arrays, not the source's.
+  assert(tb.shape == b_shape_ptr);
+  assert(tb.strides == b_strides_ptr);
+  assert(tb.shape != a_shape_ptr);
+  assert(tb.strides != a_strides_ptr);
+
+  // Source must be reset.
+  assert(ta.shape == nullptr);
+  assert(ta.strides == nullptr);
+  assert(ta.data == nullptr);
+  assert(ta.ndim == 0);
+
+  // Values must be moved correctly.
+  assert(tb.data == data_a);
+  assert(tb.ndim == 2);
+  assert(tb.shape[0] == 2);
+  assert(tb.shape[1] == 3);
+  assert(tb.strides[0] == 3);
+  assert(tb.strides[1] == 1);
+  return true;
+}
+
+bool test_dlpack_wrapper_get()
+{
+  using wrapper_t = cuda::__dlpack_tensor<2>;
+  static_assert(cuda::std::is_same_v<decltype(cuda::std::declval<wrapper_t&>().get()), ::DLTensor&>);
+  static_assert(cuda::std::is_same_v<decltype(cuda::std::declval<const wrapper_t&>().get()), const ::DLTensor&>);
+
+  wrapper_t w{};
+  // Mutating through the reference returned by `get()` must be observable.
+  auto& t = w.get();
+  t.ndim  = 123;
+  assert(w.get().ndim == 123);
+
+  // Const overload should also alias the same underlying object.
+  const wrapper_t& cw = w;
+  assert(&cw.get() == &w.get());
+  return true;
+}
+
+int main(int, char**)
+{
+  NV_IF_TARGET(
+    NV_IS_HOST,
+    (assert(test_mdspan_to_dlpack_wrapper_default_ctor()); assert(test_dlpack_wrapper_copy_ctor());
+     assert(test_dlpack_wrapper_move_ctor());
+     assert(test_dlpack_wrapper_copy_assignment());
+     assert(test_dlpack_wrapper_move_assignment());
+     assert(test_dlpack_wrapper_get());))
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/libcxx/type_traits/vector_type.compile.pass.cpp b/libcudacxx/test/libcudacxx/libcxx/type_traits/vector_type.compile.pass.cpp
index 406b6157cbf..1561857be02 100644
--- a/libcudacxx/test/libcudacxx/libcxx/type_traits/vector_type.compile.pass.cpp
+++ b/libcudacxx/test/libcudacxx/libcxx/type_traits/vector_type.compile.pass.cpp
@@ -29,31 +29,61 @@ __host__ __device__ void test()
   test();
   test();
 
+  static_assert(cuda::__is_vector_type_v<char1>);
+  static_assert(cuda::__is_vector_type_v<char2>);
+  static_assert(cuda::__is_vector_type_v<char3>);
+  static_assert(cuda::__is_vector_type_v<char4>);
+
   test();
   test();
   test();
   test();
 
+  static_assert(cuda::__is_vector_type_v<uchar1>);
+  static_assert(cuda::__is_vector_type_v<uchar2>);
+  static_assert(cuda::__is_vector_type_v<uchar3>);
+  static_assert(cuda::__is_vector_type_v<uchar4>);
+
   test();
   test();
   test();
   test();
 
+  static_assert(cuda::__is_vector_type_v<short1>);
+  static_assert(cuda::__is_vector_type_v<short2>);
+  static_assert(cuda::__is_vector_type_v<short3>);
+  static_assert(cuda::__is_vector_type_v<short4>);
+
   test();
   test();
   test();
   test();
 
+  static_assert(cuda::__is_vector_type_v<ushort1>);
+  static_assert(cuda::__is_vector_type_v<ushort2>);
+  static_assert(cuda::__is_vector_type_v<ushort3>);
+  static_assert(cuda::__is_vector_type_v<ushort4>);
+
   test();
   test();
   test();
   test();
 
+  static_assert(cuda::__is_vector_type_v<int1>);
+  static_assert(cuda::__is_vector_type_v<int2>);
+  static_assert(cuda::__is_vector_type_v<int3>);
+  static_assert(cuda::__is_vector_type_v<int4>);
+
   test();
   test();
   test();
   test();
 
+  static_assert(cuda::__is_vector_type_v<uint1>);
+  static_assert(cuda::__is_vector_type_v<uint2>);
+  static_assert(cuda::__is_vector_type_v<uint3>);
+  static_assert(cuda::__is_vector_type_v<uint4>);
+
   test();
   test();
   test();
@@ -63,6 +93,16 @@ __host__ __device__ void test()
   test();
 #endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^
 
+  static_assert(cuda::__is_vector_type_v<long1>);
+  static_assert(cuda::__is_vector_type_v<long2>);
+  static_assert(cuda::__is_vector_type_v<long3>);
+#if _CCCL_CTK_AT_LEAST(13, 0)
+  static_assert(cuda::__is_vector_type_v<long4_16a>);
+  static_assert(cuda::__is_vector_type_v<long4_32a>);
+#else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv
+  static_assert(cuda::__is_vector_type_v<long4>);
+#endif // _CCCL_CTK_AT_LEAST(13, 0)
+
   test();
   test();
   test();
@@ -72,6 +112,16 @@ __host__ __device__ void test()
   test();
 #endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^
 
+  static_assert(cuda::__is_vector_type_v<ulong1>);
+  static_assert(cuda::__is_vector_type_v<ulong2>);
+  static_assert(cuda::__is_vector_type_v<ulong3>);
+#if _CCCL_CTK_AT_LEAST(13, 0)
+  static_assert(cuda::__is_vector_type_v<ulong4_16a>);
+  static_assert(cuda::__is_vector_type_v<ulong4_32a>);
+#else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv
+  static_assert(cuda::__is_vector_type_v<ulong4>);
+#endif // _CCCL_CTK_AT_LEAST(13, 0)
+
   test();
   test();
   test();
@@ -81,6 +131,16 @@ __host__ __device__ void test()
   test();
 #endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^
 
+  static_assert(cuda::__is_vector_type_v<longlong1>);
+  static_assert(cuda::__is_vector_type_v<longlong2>);
+  static_assert(cuda::__is_vector_type_v<longlong3>);
+#if _CCCL_CTK_AT_LEAST(13, 0)
+  static_assert(cuda::__is_vector_type_v<longlong4_16a>);
+  static_assert(cuda::__is_vector_type_v<longlong4_32a>);
+#else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv
+  static_assert(cuda::__is_vector_type_v<longlong4>);
+#endif // _CCCL_CTK_AT_LEAST(13, 0)
+
   test();
   test();
   test();
@@ -90,11 +150,26 @@ __host__ __device__ void test()
   test();
 #endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^
 
+  static_assert(cuda::__is_vector_type_v<ulonglong1>);
+  static_assert(cuda::__is_vector_type_v<ulonglong2>);
+  static_assert(cuda::__is_vector_type_v<ulonglong3>);
+#if _CCCL_CTK_AT_LEAST(13, 0)
+  static_assert(cuda::__is_vector_type_v<ulonglong4_16a>);
+  static_assert(cuda::__is_vector_type_v<ulonglong4_32a>);
+#else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv
+  static_assert(cuda::__is_vector_type_v<ulonglong4>);
+#endif // _CCCL_CTK_AT_LEAST(13, 0)
+
   test();
   test();
   test();
   test();
 
+  static_assert(cuda::__is_vector_type_v<float1>);
+  static_assert(cuda::__is_vector_type_v<float2>);
+  static_assert(cuda::__is_vector_type_v<float3>);
+  static_assert(cuda::__is_vector_type_v<float4>);
+
   test();
   test();
   test();
@@ -104,11 +179,26 @@ __host__ __device__ void test()
   test();
 #endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^
 
+  static_assert(cuda::__is_vector_type_v<double1>);
+  static_assert(cuda::__is_vector_type_v<double2>);
+  static_assert(cuda::__is_vector_type_v<double3>);
+#if _CCCL_CTK_AT_LEAST(13, 0)
+  static_assert(cuda::__is_vector_type_v<double4_16a>);
+  static_assert(cuda::__is_vector_type_v<double4_32a>);
+#else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv
+  static_assert(cuda::__is_vector_type_v<double4>);
+#endif // _CCCL_CTK_AT_LEAST(13, 0)
+
+  static_assert(cuda::__is_vector_type_v<dim3>);
+
   // 2. Test invalid combinations
   test();
   test();
   test();
+
+  static_assert(!cuda::__is_vector_type_v<int>);
+  static_assert(!cuda::__is_vector_type_v<bool>);
 }
 
 int main(int, char**)