diff --git a/docs/libcudacxx/extended_api/mdspan.rst b/docs/libcudacxx/extended_api/mdspan.rst
index 028ac918aec..f0b29f3d1d0 100644
--- a/docs/libcudacxx/extended_api/mdspan.rst
+++ b/docs/libcudacxx/extended_api/mdspan.rst
@@ -10,6 +10,8 @@ Mdspan
    mdspan/host_device_accessor
    mdspan/restrict_accessor
    mdspan/shared_memory_accessor
+   mdspan/mdspan_to_dlpack
+   mdspan/dlpack_to_mdspan

 .. list-table::
    :widths: 25 45 30 30
@@ -34,3 +36,13 @@ Mdspan
      - ``mdspan`` and accessor for CUDA shared memory
      - CCCL 3.2.0
      - CUDA 13.2
+
+   * - :ref:`mdspan to dlpack <libcudacxx-extended-api-mdspan-mdspan-to-dlpack>`
+     - Convert an ``mdspan`` to a ``DLTensor``
+     - CCCL 3.2.0
+     - CUDA 13.2
+
+   * - :ref:`dlpack to mdspan <libcudacxx-extended-api-mdspan-dlpack-to-mdspan>`
+     - Convert a ``DLTensor`` to an ``mdspan``
+     - CCCL 3.2.0
+     - CUDA 13.2
diff --git a/docs/libcudacxx/extended_api/mdspan/dlpack_to_mdspan.rst b/docs/libcudacxx/extended_api/mdspan/dlpack_to_mdspan.rst
new file mode 100644
index 00000000000..61497f6621d
--- /dev/null
+++ b/docs/libcudacxx/extended_api/mdspan/dlpack_to_mdspan.rst
@@ -0,0 +1,129 @@
+.. _libcudacxx-extended-api-mdspan-dlpack-to-mdspan:
+
+DLPack to ``mdspan``
+====================
+
+This functionality provides a conversion from a `DLPack `__ ``DLTensor`` to ``cuda::host_mdspan``, ``cuda::device_mdspan``, and ``cuda::managed_mdspan``.
+
+Defined in the ``<cuda/mdspan>`` header.
+
+Conversion functions
+--------------------
+
+.. code:: cuda
+
+   namespace cuda {
+
+   template <typename ElementType, cuda::std::size_t Rank, typename LayoutPolicy = cuda::std::layout_stride>
+   [[nodiscard]] cuda::host_mdspan<ElementType, cuda::std::dims<Rank, cuda::std::int64_t>, LayoutPolicy>
+   to_host_mdspan(const DLTensor& tensor);
+
+   template <typename ElementType, cuda::std::size_t Rank, typename LayoutPolicy = cuda::std::layout_stride>
+   [[nodiscard]] cuda::device_mdspan<ElementType, cuda::std::dims<Rank, cuda::std::int64_t>, LayoutPolicy>
+   to_device_mdspan(const DLTensor& tensor);
+
+   template <typename ElementType, cuda::std::size_t Rank, typename LayoutPolicy = cuda::std::layout_stride>
+   [[nodiscard]] cuda::managed_mdspan<ElementType, cuda::std::dims<Rank, cuda::std::int64_t>, LayoutPolicy>
+   to_managed_mdspan(const DLTensor& tensor);
+
+   } // namespace cuda
+
+Template parameters
+-------------------
+
+- ``ElementType``: The element type of the resulting ``mdspan``. Must match ``DLTensor::dtype``.
+- ``Rank``: The number of dimensions. Must match ``DLTensor::ndim``.
+- ``LayoutPolicy``: The layout policy of the resulting ``mdspan``. Defaults to ``cuda::std::layout_stride``. Supported layouts are:
+
+  - ``cuda::std::layout_right`` (C-contiguous, row-major)
+  - ``cuda::std::layout_left`` (Fortran-contiguous, column-major)
+  - ``cuda::std::layout_stride`` (general strided layout)
+
+Semantics
+---------
+
+The conversion produces a non-owning ``mdspan`` view of the ``DLTensor`` data:
+
+- The ``mdspan`` data pointer is ``tensor.data`` advanced by ``tensor.byte_offset`` bytes.
+- For ``rank > 0``, ``mdspan.extent(i)`` is ``tensor.shape[i]``.
+- For ``layout_stride``, ``mdspan.stride(i)`` is ``tensor.strides[i]`` (or computed as row-major if ``strides`` is ``nullptr``, DLPack < v1.2).
+- The device type is validated:
+
+  - ``kDLCPU`` for ``to_host_mdspan``
+  - ``kDLCUDA`` for ``to_device_mdspan``
+  - ``kDLCUDAManaged`` for ``to_managed_mdspan``
+
+Constraints
+-----------
+
+- ``LayoutPolicy`` must be one of ``cuda::std::layout_right``, ``cuda::std::layout_left``, or ``cuda::std::layout_stride``.
+- For ``layout_right`` and ``layout_left``, the ``DLTensor`` strides must be compatible with the layout.
+
+Runtime errors
+--------------
+
+The conversion throws ``std::invalid_argument`` in the following cases:
+
+- ``DLTensor::ndim`` does not match the specified ``Rank``.
+- ``DLTensor::dtype`` does not match ``ElementType``.
+- ``DLTensor::data`` is ``nullptr``.
+- ``DLTensor::shape`` is ``nullptr`` (for rank > 0).
+- Any ``DLTensor::shape[i]`` is negative.
+- ``DLTensor::strides`` is ``nullptr`` for DLPack v1.2 or later.
+- ``DLTensor::strides`` is ``nullptr`` for ``layout_left`` with rank > 1 (DLPack < v1.2).
+- ``DLTensor::strides[i]`` is not positive for ``layout_stride``.
+- ``DLTensor::strides`` are not compatible with the requested ``layout_right`` or ``layout_left``.
+- ``DLTensor::device.device_type`` does not match the target ``mdspan`` type.
+- The data pointer is not properly aligned for the element type.
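+
+For illustration, a rank mismatch might be handled as follows (a minimal sketch; the exact error
+message is an implementation detail, and ``tensor`` is assumed to be the rank-2 ``DLTensor`` built
+in the example below):
+
+.. code:: cuda
+
+   try {
+     // tensor.ndim == 2, but Rank == 1 is requested
+     auto md = cuda::to_host_mdspan<int, 1>(tensor);
+   } catch (const std::invalid_argument& e) {
+     // e.what() describes which check failed
+   }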
+
+Availability notes
+------------------
+
+- This API is available only when the DLPack header is present, namely ``<dlpack/dlpack.h>`` is found in the include path.
+- Requires DLPack major version 1.
+
+References
+----------
+
+- `DLPack C API `__ documentation.
+
+Example
+-------
+
+.. code:: cuda
+
+   #include <cuda/mdspan>
+
+   #include <dlpack/dlpack.h>
+
+   #include <cassert>
+   #include <cstdint>
+
+   int main() {
+     int data[6] = {0, 1, 2, 3, 4, 5};
+
+     // Create a DLTensor manually for demonstration
+     int64_t shape[2]   = {2, 3};
+     int64_t strides[2] = {3, 1}; // row-major strides
+
+     DLTensor tensor{};
+     tensor.data        = data;
+     tensor.device      = {kDLCPU, 0};
+     tensor.ndim        = 2;
+     tensor.dtype       = {kDLInt, 32, 1};
+     tensor.shape       = shape;
+     tensor.strides     = strides;
+     tensor.byte_offset = 0;
+
+     // Convert to host_mdspan
+     auto md = cuda::to_host_mdspan<int, 2>(tensor);
+
+     assert(md.rank() == 2);
+     assert(md.extent(0) == 2 && md.extent(1) == 3);
+     assert(md.stride(0) == 3 && md.stride(1) == 1);
+     assert(md.data_handle() == data);
+     assert(md(0, 0) == 0 && md(1, 2) == 5);
+   }
+
+See also
+--------
+
+- :ref:`libcudacxx-extended-api-mdspan-mdspan-to-dlpack` for the reverse conversion.
diff --git a/docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst b/docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst
new file mode 100644
index 00000000000..bb04aae5002
--- /dev/null
+++ b/docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst
@@ -0,0 +1,137 @@
+.. _libcudacxx-extended-api-mdspan-mdspan-to-dlpack:
+
+``mdspan`` to DLPack
+====================
+
+This functionality provides a conversion from ``cuda::host_mdspan``, ``cuda::device_mdspan``, and ``cuda::managed_mdspan`` to a `DLPack `__ ``DLTensor`` view.
+
+Defined in the ``<cuda/mdspan>`` header.
+
+Conversion functions
+--------------------
+
+.. code:: cuda
+
+   namespace cuda {
+
+   template <typename ElementType, typename Extents, typename Layout, typename Accessor>
+   [[nodiscard]] __dlpack_tensor<Extents::rank()>
+   to_dlpack(const cuda::host_mdspan<ElementType, Extents, Layout, Accessor>& mdspan);
+
+   template <typename ElementType, typename Extents, typename Layout, typename Accessor>
+   [[nodiscard]] __dlpack_tensor<Extents::rank()>
+   to_dlpack(const cuda::device_mdspan<ElementType, Extents, Layout, Accessor>& mdspan,
+             cuda::device_ref device = cuda::device_ref{0});
+
+   template <typename ElementType, typename Extents, typename Layout, typename Accessor>
+   [[nodiscard]] __dlpack_tensor<Extents::rank()>
+   to_dlpack(const cuda::managed_mdspan<ElementType, Extents, Layout, Accessor>& mdspan);
+
+   } // namespace cuda
+
+Types
+-----
+
+``cuda::__dlpack_tensor`` is an internal class that stores a ``DLTensor`` and owns the backing storage for its ``shape`` and ``strides`` pointers. The class does not use any heap allocation.
+
+.. code:: cuda
+
+   namespace cuda {
+
+   template <cuda::std::size_t Rank>
+   class __dlpack_tensor {
+   public:
+     __dlpack_tensor();
+     __dlpack_tensor(const __dlpack_tensor&) noexcept;
+     __dlpack_tensor(__dlpack_tensor&&) noexcept;
+     __dlpack_tensor& operator=(const __dlpack_tensor&) noexcept;
+     __dlpack_tensor& operator=(__dlpack_tensor&&) noexcept;
+     ~__dlpack_tensor() noexcept = default;
+
+     DLTensor& get() noexcept;
+     const DLTensor& get() const noexcept;
+   };
+
+   } // namespace cuda
+
+.. note:: **Lifetime**
+
+   The ``DLTensor`` returned by ``get()`` must not be used after the ``cuda::__dlpack_tensor`` wrapper is destroyed: once the wrapper goes away, the ``DLTensor::shape`` and ``DLTensor::strides`` pointers dangle.
+
+.. note:: **Const-correctness**
+
+   ``DLTensor::data`` points at ``mdspan.data_handle()`` (or is ``nullptr`` if ``mdspan.size() == 0``). If the element type is ``const``, the ``const`` is cast away because ``DLTensor::data`` is not const-qualified.
+
+Semantics
+---------
+
+The conversion produces a non-owning DLPack view of the ``mdspan`` data and metadata:
+
+- ``DLTensor::ndim`` is ``mdspan.rank()``.
+- For rank > 0, ``DLTensor::shape[i]`` is ``mdspan.extent(i)``.
+- For rank > 0, ``DLTensor::strides[i]`` is ``mdspan.stride(i)``.
+- ``DLTensor::byte_offset`` is always ``0``.
+- ``DLTensor::device`` is:
+
+  - ``{kDLCPU, 0}`` for ``cuda::host_mdspan``
+  - ``{kDLCUDA, device.get()}`` for ``cuda::device_mdspan``
+  - ``{kDLCUDAManaged, 0}`` for ``cuda::managed_mdspan``
+
+Element types are mapped to ``DLDataType`` according to the DLPack conventions (see the examples after this list), including:
+
+- ``bool``.
+- Signed and unsigned integers.
+- IEEE-754 floating-point and extended floating-point types, including ``__half``, ``__nv_bfloat16``, ``__float128``, and FP8, FP6, FP4 when available.
+- Complex: ``cuda::std::complex<__half>``, ``cuda::std::complex<float>``, and ``cuda::std::complex<double>``.
+- `CUDA built-in vector types `__, such as ``int2``, ``float4``, etc.
+- Vector types for extended floating-point, such as ``__half2``, ``__nv_fp8x4_e4m3``, etc.
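+
+A few representative mappings (a minimal sketch derived from the rules above, not an exhaustive list):
+
+.. code:: cuda
+
+   // float                      -> DLDataType{kDLFloat,   32, 1}
+   // cuda::std::complex<float>  -> DLDataType{kDLComplex, 64, 1}   // bits span the full complex value
+   // float4                     -> DLDataType{kDLFloat,   32, 4}   // 4 lanes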
+
+Constraints
+-----------
+
+- The accessor ``data_handle_type`` must be a pointer type.
+
+Runtime errors
+--------------
+
+- If any ``extent(i)`` or ``stride(i)`` cannot be represented in ``int64_t``, the conversion throws ``std::invalid_argument``.
+
+Availability notes
+------------------
+
+- This API is available only when the DLPack header is present, namely ``<dlpack/dlpack.h>`` is found in the include path.
+
+References
+----------
+
+- `DLPack C API `__ documentation.
+
+Example
+-------
+
+.. code:: cuda
+
+   #include <cuda/mdspan>
+
+   #include <dlpack/dlpack.h>
+
+   #include <cassert>
+   #include <cstddef>
+
+   int main() {
+     using extents_t = cuda::std::extents<size_t, 2, 3>;
+
+     int data[6] = {0, 1, 2, 3, 4, 5};
+     cuda::host_mdspan<int, extents_t> md{data, extents_t{}};
+
+     auto dl = cuda::to_dlpack(md);
+     const auto& dltensor = dl.get();
+     // Do not call get() on a temporary, e.g. `const auto& t = cuda::to_dlpack(md).get();`:
+     // the wrapper would be destroyed at the end of the statement and `t` would dangle.
+
+     // `dl` owns the shape/stride storage; `dltensor.data` is a non-owning pointer to `data`.
+     assert(dltensor.device.device_type == kDLCPU);
+     assert(dltensor.ndim == 2);
+     assert(dltensor.shape[0] == 2 && dltensor.shape[1] == 3);
+     assert(dltensor.strides[0] == 3 && dltensor.strides[1] == 1);
+     assert(dltensor.data == data);
+   }
diff --git a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h
new file mode 100644
index 00000000000..b2dc3640c6b
--- /dev/null
+++ b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h
@@ -0,0 +1,261 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___MDSPAN_DLPACK_TO_MDSPAN_H +#define _CUDA___MDSPAN_DLPACK_TO_MDSPAN_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_HAS_DLPACK() + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include + +# include +// +# include + +_CCCL_BEGIN_NAMESPACE_CUDA + +template +[[nodiscard]] _CCCL_HOST_API inline bool __validate_dlpack_data_type(const ::DLDataType& __dtype) noexcept +{ + const auto __expected = ::cuda::__data_type_to_dlpack<_ElementType>(); + return __dtype.code == __expected.code && __dtype.bits == __expected.bits && __dtype.lanes == __expected.lanes; +} + +[[nodiscard]] +_CCCL_HOST_API inline ::cuda::std::int64_t __layout_right_stride( + const ::cuda::std::int64_t* __shapes, ::cuda::std::size_t __pos, ::cuda::std::size_t __rank) noexcept +{ + ::cuda::std::int64_t __stride = 1; + for (auto __i = __pos + 1; __i < __rank; ++__i) + { + __stride *= __shapes[__i]; // TODO: check for overflow + } + return __stride; +} + +[[nodiscard]] +_CCCL_HOST_API inline ::cuda::std::int64_t +__layout_left_stride(const ::cuda::std::int64_t* __shapes, ::cuda::std::size_t __pos) noexcept +{ + ::cuda::std::int64_t __stride = 1; + for (::cuda::std::size_t __i = 0; __i < __pos; ++__i) + { + __stride *= __shapes[__i]; // TODO: check for overflow + } + return __stride; +} + +template +_CCCL_HOST_API void __validate_dlpack_strides(const ::DLTensor& __tensor, [[maybe_unused]] ::cuda::std::size_t __rank) +{ + [[maybe_unused]] constexpr bool __is_layout_right = ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_right>; + [[maybe_unused]] constexpr bool __is_layout_left = ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_left>; + [[maybe_unused]] constexpr bool __is_layout_stride = + ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_stride>; + const auto __strides_ptr = __tensor.strides; + if (__strides_ptr == nullptr) + { +# if DLPACK_MAJOR_VERSION > 1 || (DLPACK_MAJOR_VERSION == 1 && DLPACK_MINOR_VERSION >= 2) + _CCCL_THROW(::std::invalid_argument{"strides=nullptr is not supported for DLPack v1.2 and later"}); +# else + // strides == nullptr means row-major (C-contiguous) layout + if (__is_layout_left && __rank > 1) + { + _CCCL_THROW(::std::invalid_argument{"strides must be non-null for layout_left"}); + } + else + { + return; + } +# endif // DLPACK_MAJOR_VERSION > 1 || (DLPACK_MAJOR_VERSION == 1 && DLPACK_MINOR_VERSION >= 2) + } + for (::cuda::std::size_t __pos = 0; __pos < __rank; ++__pos) + { + if constexpr (__is_layout_right) + { + if (__strides_ptr[__pos] != ::cuda::__layout_right_stride(__tensor.shape, __pos, __rank)) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor strides are not compatible with layout_right"}); + } + } + else if constexpr (__is_layout_left) + { + if (__strides_ptr[__pos] != ::cuda::__layout_left_stride(__tensor.shape, __pos)) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor strides are not compatible with layout_left"}); + } + } + else if constexpr (__is_layout_stride) + { + if (__strides_ptr[__pos] <= 0) + { + _CCCL_THROW(::std::invalid_argument{"mdspan strides must be positive"}); + } + } + } +} + +template +[[nodiscard]] +_CCCL_HOST_API 
::cuda::std::mdspan<_ElementType, ::cuda::std::dims<_Rank, ::cuda::std::int64_t>, _LayoutPolicy> +__to_mdspan(const ::DLTensor& __tensor) +{ + using __extents_type = ::cuda::std::dims<_Rank, ::cuda::std::int64_t>; + using __mdspan_type = ::cuda::std::mdspan<_ElementType, __extents_type, _LayoutPolicy>; + using __mapping_type = typename _LayoutPolicy::template mapping<__extents_type>; + using __element_type = typename __mdspan_type::element_type; + constexpr bool __is_layout_right = ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_right>; + constexpr bool __is_layout_left = ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_left>; + constexpr bool __is_layout_stride = ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_stride>; + // TODO: add support for layout_right_padded and layout_left_padded + if constexpr (!__is_layout_right && !__is_layout_left && !__is_layout_stride) + { + static_assert(::cuda::std::__always_false_v<_LayoutPolicy>, "Unsupported layout policy"); + _CCCL_UNREACHABLE(); + return __mdspan_type{}; + } + else + { + if (cuda::std::cmp_not_equal(__tensor.ndim, _Rank)) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor rank does not match expected rank"}); + } + if (!::cuda::__validate_dlpack_data_type<__element_type>(__tensor.dtype)) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor data type does not match expected type"}); + } + if (__tensor.data == nullptr) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor data must be non-null"}); + } + auto __base_data = static_cast(__tensor.data) + __tensor.byte_offset; + auto __data = reinterpret_cast<__element_type*>(__base_data); + const auto __datatype_size = __tensor.dtype.bits * __tensor.dtype.lanes / 8; + // this is not the exact solution because data type size != data type alignment. + // However, it always works for the supported data types. + if (__datatype_size > 0 && !::cuda::is_aligned(__data, __datatype_size)) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor data must be aligned to the data type"}); + } + if constexpr (_Rank == 0) + { + return __mdspan_type{__data, __mapping_type{}}; + } + else // Rank > 0 + { + if (__tensor.shape == nullptr) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor shape must be non-null"}); + } + using ::cuda::std::int64_t; + using ::cuda::std::size_t; + ::cuda::std::array __extents_array{}; + for (size_t __i = 0; __i < _Rank; ++__i) + { + if (__tensor.shape[__i] < 0) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor shape must be positive"}); + } + __extents_array[__i] = __tensor.shape[__i]; + } + ::cuda::__validate_dlpack_strides<_LayoutPolicy>(__tensor, _Rank); + if constexpr (__is_layout_stride) + { + ::cuda::std::array __strides_array{}; + for (size_t __i = 0; __i < _Rank; ++__i) + { + const bool __has_strides = __tensor.strides != nullptr; + __strides_array[__i] = + __has_strides ? 
__tensor.strides[__i] : ::cuda::__layout_right_stride(__tensor.shape, __i, _Rank); + } + return __mdspan_type{__data, __mapping_type{__extents_array, __strides_array}}; + } + else + { + __extents_type __extents{__extents_array}; + return __mdspan_type{__data, __extents}; + } + } + } +} + +/*********************************************************************************************************************** + * Public API + **********************************************************************************************************************/ + +template +[[nodiscard]] +_CCCL_HOST_API ::cuda::host_mdspan<_ElementType, ::cuda::std::dims<_Rank, ::cuda::std::int64_t>, _LayoutPolicy> +to_host_mdspan(const ::DLTensor& __tensor) +{ + if (__tensor.device.device_type != ::kDLCPU) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor device type must be kDLCPU for host_mdspan"}); + } + using __extents_type = ::cuda::std::dims<_Rank, ::cuda::std::int64_t>; + using __mdspan_type = ::cuda::host_mdspan<_ElementType, __extents_type, _LayoutPolicy>; + return __mdspan_type{::cuda::__to_mdspan<_ElementType, _Rank, _LayoutPolicy>(__tensor)}; +} + +template +[[nodiscard]] +_CCCL_HOST_API ::cuda::device_mdspan<_ElementType, ::cuda::std::dims<_Rank, ::cuda::std::int64_t>, _LayoutPolicy> +to_device_mdspan(const ::DLTensor& __tensor) +{ + if (__tensor.device.device_type != ::kDLCUDA) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor device type must be kDLCUDA for device_mdspan"}); + } + using __extents_type = ::cuda::std::dims<_Rank, ::cuda::std::int64_t>; + using __mdspan_type = ::cuda::device_mdspan<_ElementType, __extents_type, _LayoutPolicy>; + return __mdspan_type{::cuda::__to_mdspan<_ElementType, _Rank, _LayoutPolicy>(__tensor)}; +} + +template +[[nodiscard]] +_CCCL_HOST_API ::cuda::managed_mdspan<_ElementType, ::cuda::std::dims<_Rank, ::cuda::std::int64_t>, _LayoutPolicy> +to_managed_mdspan(const ::DLTensor& __tensor) +{ + if (__tensor.device.device_type != ::kDLCUDAManaged) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor device type must be kDLCUDAManaged for managed_mdspan"}); + } + using __extents_type = ::cuda::std::dims<_Rank, ::cuda::std::int64_t>; + using __mdspan_type = ::cuda::managed_mdspan<_ElementType, __extents_type, _LayoutPolicy>; + return __mdspan_type{::cuda::__to_mdspan<_ElementType, _Rank, _LayoutPolicy>(__tensor)}; +} + +_CCCL_END_NAMESPACE_CUDA + +# include + +#endif // __CCCL_HAS_DLPACK() +#endif // _CUDA___MDSPAN_DLPACK_TO_MDSPAN_H diff --git a/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h b/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h new file mode 100644 index 00000000000..b5bdba1c80a --- /dev/null +++ b/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h @@ -0,0 +1,303 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___MDSPAN_MDSPAN_TO_DLPACK_H +#define _CUDA___MDSPAN_MDSPAN_TO_DLPACK_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if !_CCCL_COMPILER(NVRTC) && _CCCL_HAS_INCLUDE() + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include + +# include +// +# include + +_CCCL_BEGIN_NAMESPACE_CUDA + +static_assert(DLPACK_MAJOR_VERSION == 1, "DLPACK_MAJOR_VERSION must be 1"); + +template +[[nodiscard]] _CCCL_HOST_API inline ::DLDataType __data_type_to_dlpack() noexcept +{ + if constexpr (::cuda::std::is_same_v<_ElementType, bool>) + { + return ::DLDataType{::kDLBool, 8, 1}; + } + //-------------------------------------------------------------------------------------------------------------------- + // Signed integer types + else if constexpr (::cuda::std::__cccl_is_integer_v<_ElementType>) + { + return ::DLDataType{ + (::cuda::std::is_signed_v<_ElementType>) ? ::kDLInt : ::kDLUInt, ::cuda::std::__num_bits_v<_ElementType>, 1}; + } + //-------------------------------------------------------------------------------------------------------------------- + // bfloat16 (must come before general floating-point) +# if _CCCL_HAS_NVBF16() + else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_bfloat16>) + { + return ::DLDataType{::kDLBfloat, 16, 1}; + } +# endif // _CCCL_HAS_NVBF16() + //-------------------------------------------------------------------------------------------------------------------- + // Low-precision Floating-point types (must come before general floating-point) +# if _CCCL_HAS_NVFP8_E4M3() + else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_fp8_e4m3>) + { + return ::DLDataType{::kDLFloat8_e4m3fn, 8, 1}; + } +# endif // _CCCL_HAS_NVFP8_E4M3() +# if _CCCL_HAS_NVFP8_E5M2() + else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_fp8_e5m2>) + { + return ::DLDataType{::kDLFloat8_e5m2, 8, 1}; + } +# endif // _CCCL_HAS_NVFP8_E5M2() +# if _CCCL_HAS_NVFP8_E8M0() + else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_fp8_e8m0>) + { + return ::DLDataType{::kDLFloat8_e8m0fnu, 8, 1}; + } +# endif // _CCCL_HAS_NVFP8_E8M0() +# if _CCCL_HAS_NVFP6_E2M3() + else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_fp6_e2m3>) + { + return ::DLDataType{::kDLFloat6_e2m3fn, 6, 1}; + } +# endif // _CCCL_HAS_NVFP6_E2M3() +# if _CCCL_HAS_NVFP6_E3M2() + else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_fp6_e3m2>) + { + return ::DLDataType{::kDLFloat6_e3m2fn, 6, 1}; + } +# endif // _CCCL_HAS_NVFP6_E3M2() +# if _CCCL_HAS_NVFP4_E2M1() + else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_fp4_e2m1>) + { + return ::DLDataType{::kDLFloat4_e2m1fn, 4, 1}; + } +# endif // _CCCL_HAS_NVFP4_E2M1() + //-------------------------------------------------------------------------------------------------------------------- + // Floating-point types (after specific types) + else if constexpr (::cuda::is_floating_point_v<_ElementType>) + { + return ::DLDataType{::kDLFloat, ::cuda::std::__num_bits_v<_ElementType>, 1}; + } + 
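  // For example, float maps to {kDLFloat, 32, 1} and double to {kDLFloat, 64, 1}.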
//-------------------------------------------------------------------------------------------------------------------- + // Complex types + // 256-bit data types are not supported in DLPack, e.g. cuda::std::complex<__float128> + else if constexpr (::cuda::std::__is_cuda_std_complex_v<_ElementType> && sizeof(_ElementType) <= sizeof(double) * 2) + { + // DLPack encodes complex numbers as a compact struct of two scalar values, and `bits` stores + // the size of the full complex number (e.g. std::complex => bits=64). + return ::DLDataType{::kDLComplex, sizeof(_ElementType) * CHAR_BIT, 1}; + } + //-------------------------------------------------------------------------------------------------------------------- + // CUDA built-in vector types +# if _CCCL_HAS_CTK() + else if constexpr (::cuda::__is_vector_type_v<_ElementType> || ::cuda::__is_extended_fp_vector_type_v<_ElementType>) + { + constexpr ::cuda::std::uint16_t __lanes = ::cuda::std::tuple_size_v<_ElementType>; + if constexpr (__lanes == 2 || __lanes == 4) + { + using __scalar_t = ::cuda::std::remove_cv_t<::cuda::std::tuple_element_t<0, _ElementType>>; + auto __scalar = ::cuda::__data_type_to_dlpack<__scalar_t>(); + __scalar.lanes = __lanes; + return __scalar; + } + else + { + static_assert(::cuda::std::__always_false_v<_ElementType>, "Unsupported vector type"); + return ::DLDataType{}; + } + } +# endif // _CCCL_HAS_CTK() + //-------------------------------------------------------------------------------------------------------------------- + // Unsupported types + else + { + static_assert(::cuda::std::__always_false_v<_ElementType>, "Unsupported type"); + return ::DLDataType{}; + } +} + +template <::cuda::std::size_t _Rank> +class __dlpack_tensor +{ + ::cuda::std::array<::cuda::std::int64_t, _Rank> __shape{}; + ::cuda::std::array<::cuda::std::int64_t, _Rank> __strides{}; + ::DLTensor __tensor{}; + + _CCCL_HOST_API void __update_tensor() noexcept + { + __tensor.shape = _Rank > 0 ? __shape.data() : nullptr; + __tensor.strides = _Rank > 0 ? 
__strides.data() : nullptr; + } + +public: + _CCCL_HOST_API explicit __dlpack_tensor() noexcept + { + __update_tensor(); + } + + _CCCL_HOST_API __dlpack_tensor(const __dlpack_tensor& __other) noexcept + : __shape{__other.__shape} + , __strides{__other.__strides} + , __tensor{__other.__tensor} + { + __update_tensor(); + } + + _CCCL_HOST_API __dlpack_tensor(__dlpack_tensor&& __other) noexcept + : __shape{::cuda::std::move(__other.__shape)} + , __strides{::cuda::std::move(__other.__strides)} + , __tensor{__other.__tensor} + { + __other.__tensor = ::DLTensor{}; + __update_tensor(); + } + + _CCCL_HOST_API __dlpack_tensor& operator=(const __dlpack_tensor& __other) noexcept + { + if (this == &__other) + { + return *this; + } + __shape = __other.__shape; + __strides = __other.__strides; + __tensor = __other.__tensor; + __update_tensor(); + return *this; + } + + _CCCL_HOST_API __dlpack_tensor& operator=(__dlpack_tensor&& __other) noexcept + { + if (this == &__other) + { + return *this; + } + __shape = ::cuda::std::move(__other.__shape); + __strides = ::cuda::std::move(__other.__strides); + __tensor = __other.__tensor; + __other.__tensor = ::DLTensor{}; + __update_tensor(); + return *this; + } + + _CCCL_HIDE_FROM_ABI ~__dlpack_tensor() noexcept = default; + + [[nodiscard]] _CCCL_HOST_API ::DLTensor& get() noexcept + { + return __tensor; + } + + [[nodiscard]] _CCCL_HOST_API const ::DLTensor& get() const noexcept + { + return __tensor; + } +}; + +template +[[nodiscard]] _CCCL_HOST_API __dlpack_tensor<_Extents::rank()> +__to_dlpack(const ::cuda::std::mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan, + ::DLDeviceType __device_type, + int __device_id) +{ + static_assert(::cuda::std::is_pointer_v, "data_handle_type must be a pointer"); + using __element_type = ::cuda::std::remove_cv_t<_ElementType>; + __dlpack_tensor<_Extents::rank()> __wrapper{}; + auto& __tensor = __wrapper.get(); + __tensor.data = __mdspan.size() > 0 ? 
const_cast<__element_type*>(__mdspan.data_handle()) : nullptr; + __tensor.device = ::DLDevice{__device_type, __device_id}; + __tensor.ndim = static_cast(__mdspan.rank()); + __tensor.dtype = ::cuda::__data_type_to_dlpack<::cuda::std::remove_cv_t<_ElementType>>(); + if constexpr (_Extents::rank() > 0) + { + constexpr auto __max_extent = ::cuda::std::numeric_limits<::cuda::std::int64_t>::max(); + for (::cuda::std::size_t __i = 0; __i < __mdspan.rank(); ++__i) + { + if (::cuda::std::cmp_greater(__mdspan.extent(__i), __max_extent)) + { + _CCCL_THROW(::std::invalid_argument{"Extent is too large"}); + } + if (::cuda::std::cmp_greater(__mdspan.stride(__i), __max_extent)) + { + _CCCL_THROW(::std::invalid_argument{"Stride is too large"}); + } + __tensor.shape[__i] = static_cast<::cuda::std::int64_t>(__mdspan.extent(__i)); + __tensor.strides[__i] = static_cast<::cuda::std::int64_t>(__mdspan.stride(__i)); + } + } + __tensor.byte_offset = 0; + return __wrapper; +} + +/*********************************************************************************************************************** + * Public API + **********************************************************************************************************************/ + +template +[[nodiscard]] _CCCL_HOST_API __dlpack_tensor<_Extents::rank()> +to_dlpack(const ::cuda::host_mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan) +{ + using __mdspan_type = ::cuda::std::mdspan<_ElementType, _Extents, _Layout, _Accessor>; + return ::cuda::__to_dlpack(__mdspan_type{__mdspan}, ::kDLCPU, 0); +} + +template +[[nodiscard]] _CCCL_HOST_API __dlpack_tensor<_Extents::rank()> +to_dlpack(const ::cuda::device_mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan, + ::cuda::device_ref __device = ::cuda::device_ref{0}) +{ + using __mdspan_type = ::cuda::std::mdspan<_ElementType, _Extents, _Layout, _Accessor>; + return ::cuda::__to_dlpack(__mdspan_type{__mdspan}, ::kDLCUDA, __device.get()); +} + +template +[[nodiscard]] _CCCL_HOST_API __dlpack_tensor<_Extents::rank()> +to_dlpack(const ::cuda::managed_mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan) +{ + using __mdspan_type = ::cuda::std::mdspan<_ElementType, _Extents, _Layout, _Accessor>; + return ::cuda::__to_dlpack(__mdspan_type{__mdspan}, ::kDLCUDAManaged, 0); +} + +_CCCL_END_NAMESPACE_CUDA + +# include + +#endif // !_CCCL_COMPILER(NVRTC) && _CCCL_HAS_INCLUDE() +#endif // _CUDA___MDSPAN_MDSPAN_TO_DLPACK_H diff --git a/libcudacxx/include/cuda/__type_traits/vector_type.h b/libcudacxx/include/cuda/__type_traits/vector_type.h index 7e21b8a806e..4e88c9efa65 100644 --- a/libcudacxx/include/cuda/__type_traits/vector_type.h +++ b/libcudacxx/include/cuda/__type_traits/vector_type.h @@ -24,7 +24,6 @@ #if _CCCL_HAS_CTK() # include -# include # include # if !_CCCL_CUDA_COMPILATION() @@ -346,10 +345,195 @@ using __vector_type_t = decltype(::cuda::__cccl_vector_type_t_impl<_Tp, _Size>() template inline constexpr bool __has_vector_type_v = !::cuda::std::is_same_v<__vector_type_t<_Tp, _Size>, void>; +template +inline constexpr bool __is_vector_type_v = false; + +template <> +inline constexpr bool __is_vector_type_v<::char1> = true; +template <> +inline constexpr bool __is_vector_type_v<::char2> = true; +template <> +inline constexpr bool __is_vector_type_v<::char3> = true; +template <> +inline constexpr bool __is_vector_type_v<::char4> = true; + +template <> +inline constexpr bool __is_vector_type_v<::uchar1> = true; +template <> +inline constexpr bool __is_vector_type_v<::uchar2> = true; +template <> 
+inline constexpr bool __is_vector_type_v<::uchar3> = true; +template <> +inline constexpr bool __is_vector_type_v<::uchar4> = true; + +template <> +inline constexpr bool __is_vector_type_v<::short1> = true; +template <> +inline constexpr bool __is_vector_type_v<::short2> = true; +template <> +inline constexpr bool __is_vector_type_v<::short3> = true; +template <> +inline constexpr bool __is_vector_type_v<::short4> = true; + +template <> +inline constexpr bool __is_vector_type_v<::ushort1> = true; +template <> +inline constexpr bool __is_vector_type_v<::ushort2> = true; +template <> +inline constexpr bool __is_vector_type_v<::ushort3> = true; +template <> +inline constexpr bool __is_vector_type_v<::ushort4> = true; + +template <> +inline constexpr bool __is_vector_type_v<::int1> = true; +template <> +inline constexpr bool __is_vector_type_v<::int2> = true; +template <> +inline constexpr bool __is_vector_type_v<::int3> = true; +template <> +inline constexpr bool __is_vector_type_v<::int4> = true; + +template <> +inline constexpr bool __is_vector_type_v<::uint1> = true; +template <> +inline constexpr bool __is_vector_type_v<::uint2> = true; +template <> +inline constexpr bool __is_vector_type_v<::uint3> = true; +template <> +inline constexpr bool __is_vector_type_v<::uint4> = true; + +template <> +inline constexpr bool __is_vector_type_v<::long1> = true; +template <> +inline constexpr bool __is_vector_type_v<::long2> = true; +template <> +inline constexpr bool __is_vector_type_v<::long3> = true; +# if _CCCL_CTK_AT_LEAST(13, 0) +template <> +inline constexpr bool __is_vector_type_v<::long4_16a> = true; +template <> +inline constexpr bool __is_vector_type_v<::long4_32a> = true; +# endif // ^^^ _CCCL_CTK_AT_LEAST(13, 0) +template <> +inline constexpr bool __is_vector_type_v<::long4> = true; + +template <> +inline constexpr bool __is_vector_type_v<::ulong1> = true; +template <> +inline constexpr bool __is_vector_type_v<::ulong2> = true; +template <> +inline constexpr bool __is_vector_type_v<::ulong3> = true; +# if _CCCL_CTK_AT_LEAST(13, 0) +template <> +inline constexpr bool __is_vector_type_v<::ulong4_16a> = true; +template <> +inline constexpr bool __is_vector_type_v<::ulong4_32a> = true; +# endif // ^^^ _CCCL_CTK_AT_LEAST(13, 0) +template <> +inline constexpr bool __is_vector_type_v<::ulong4> = true; + +template <> +inline constexpr bool __is_vector_type_v<::longlong1> = true; +template <> +inline constexpr bool __is_vector_type_v<::longlong2> = true; +template <> +inline constexpr bool __is_vector_type_v<::longlong3> = true; +# if _CCCL_CTK_AT_LEAST(13, 0) +template <> +inline constexpr bool __is_vector_type_v<::longlong4_16a> = true; +template <> +inline constexpr bool __is_vector_type_v<::longlong4_32a> = true; +# endif // ^^^ _CCCL_CTK_AT_LEAST(13, 0) +template <> +inline constexpr bool __is_vector_type_v<::longlong4> = true; + +template <> +inline constexpr bool __is_vector_type_v<::ulonglong1> = true; +template <> +inline constexpr bool __is_vector_type_v<::ulonglong2> = true; +template <> +inline constexpr bool __is_vector_type_v<::ulonglong3> = true; +# if _CCCL_CTK_AT_LEAST(13, 0) +template <> +inline constexpr bool __is_vector_type_v<::ulonglong4_16a> = true; +template <> +inline constexpr bool __is_vector_type_v<::ulonglong4_32a> = true; +# endif // ^^^ _CCCL_CTK_AT_LEAST(13, 0) +template <> +inline constexpr bool __is_vector_type_v<::ulonglong4> = true; + +template <> +inline constexpr bool __is_vector_type_v<::float1> = true; +template <> +inline constexpr bool 
__is_vector_type_v<::float2> = true; +template <> +inline constexpr bool __is_vector_type_v<::float3> = true; +template <> +inline constexpr bool __is_vector_type_v<::float4> = true; + +template <> +inline constexpr bool __is_vector_type_v<::double1> = true; +template <> +inline constexpr bool __is_vector_type_v<::double2> = true; +template <> +inline constexpr bool __is_vector_type_v<::double3> = true; +# if _CCCL_CTK_AT_LEAST(13, 0) +template <> +inline constexpr bool __is_vector_type_v<::double4_16a> = true; +template <> +inline constexpr bool __is_vector_type_v<::double4_32a> = true; +# endif // ^^^ _CCCL_CTK_AT_LEAST(13, 0) +template <> +inline constexpr bool __is_vector_type_v<::double4> = true; + +template <> +inline constexpr bool __is_vector_type_v<::dim3> = true; + +template +inline constexpr bool __is_extended_fp_vector_type_v = false; + +# if _CCCL_HAS_NVFP8() +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__nv_bfloat162> = true; +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__half2> = true; +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp8x2_e4m3> = true; +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp8x2_e5m2> = true; +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp8x4_e4m3> = true; +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp8x4_e5m2> = true; +# if _CCCL_CTK_AT_LEAST(12, 8) +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp8x2_e8m0> = true; +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp8x4_e8m0> = true; +# endif // _CCCL_CTK_AT_LEAST(12, 8) +# endif // _CCCL_HAS_NVFP8() + +# if _CCCL_HAS_NVFP6() +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp6x2_e2m3> = true; +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp6x2_e3m2> = true; +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp6x4_e2m3> = true; +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp6x4_e3m2> = true; +# endif // _CCCL_HAS_NVFP6() + +# if _CCCL_HAS_NVFP4() +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp4x2_e2m1> = true; +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp4x4_e2m1> = true; +# endif // _CCCL_HAS_NVFP4() + _CCCL_END_NAMESPACE_CUDA # include #endif // !_CCCL_HAS_CTK() - #endif // _CUDA__TYPE_TRAITS_VECTOR_TYPE_H diff --git a/libcudacxx/include/cuda/mdspan b/libcudacxx/include/cuda/mdspan index ae81a30219a..f8e36e75e43 100644 --- a/libcudacxx/include/cuda/mdspan +++ b/libcudacxx/include/cuda/mdspan @@ -21,7 +21,9 @@ # pragma system_header #endif // no system header +#include #include +#include #include #include #include diff --git a/libcudacxx/include/cuda/std/__tuple_dir/vector_types.h b/libcudacxx/include/cuda/std/__tuple_dir/vector_types.h index a44443ed091..125b6b87492 100644 --- a/libcudacxx/include/cuda/std/__tuple_dir/vector_types.h +++ b/libcudacxx/include/cuda/std/__tuple_dir/vector_types.h @@ -121,6 +121,26 @@ _LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__half, __half, 2) # if _CCCL_HAS_NVBF16() _LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_bfloat16, __nv_bfloat16, 2) # endif // _CCCL_HAS_NVBF16() +# if _CCCL_HAS_NVFP8() +_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp8x, __nv_fp8_e5m2, 2, _e5m2) +_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp8x, __nv_fp8_e5m2, 4, _e5m2) +_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp8x, 
__nv_fp8_e4m3, 2, _e4m3) +_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp8x, __nv_fp8_e4m3, 4, _e4m3) +# if _CCCL_CTK_AT_LEAST(12, 8) +_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp8x, __nv_fp8_e8m0, 2, _e8m0) +_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp8x, __nv_fp8_e8m0, 4, _e8m0) +# endif // _CCCL_CTK_AT_LEAST(12, 8) +# endif // _CCCL_HAS_NVFP8() +# if _CCCL_HAS_NVFP6() +_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp6x, __nv_fp6_e3m2, 2, _e3m2) +_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp6x, __nv_fp6_e3m2, 4, _e3m2) +_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp6x, __nv_fp6_e2m3, 2, _e2m3) +_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp6x, __nv_fp6_e2m3, 4, _e2m3) +# endif // _CCCL_HAS_NVFP6() +# if _CCCL_HAS_NVFP4() +_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp4x, __nv_fp4_e2m1, 2, _e2m1) +_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp4x, __nv_fp4_e2m1, 4, _e2m1) +# endif // _CCCL_HAS_NVFP4() template struct __get_element; @@ -273,6 +293,26 @@ _LIBCUDACXX_SPECIALIZE_GET(__half2, __half) # if _CCCL_HAS_NVBF16() _LIBCUDACXX_SPECIALIZE_GET(__nv_bfloat162, __nv_bfloat16) # endif // _CCCL_HAS_NVBF16() +# if _CCCL_HAS_NVFP8() +_LIBCUDACXX_SPECIALIZE_GET(__nv_fp8x2_e5m2, __nv_fp8_e5m2) +_LIBCUDACXX_SPECIALIZE_GET(__nv_fp8x4_e5m2, __nv_fp8_e5m2) +_LIBCUDACXX_SPECIALIZE_GET(__nv_fp8x2_e4m3, __nv_fp8_e4m3) +_LIBCUDACXX_SPECIALIZE_GET(__nv_fp8x4_e4m3, __nv_fp8_e4m3) +# if _CCCL_CTK_AT_LEAST(12, 8) +_LIBCUDACXX_SPECIALIZE_GET(__nv_fp8x2_e8m0, __nv_fp8_e8m0) +_LIBCUDACXX_SPECIALIZE_GET(__nv_fp8x4_e8m0, __nv_fp8_e8m0) +# endif // _CCCL_CTK_AT_LEAST(12, 8) +# endif // _CCCL_HAS_NVFP8() +# if _CCCL_HAS_NVFP6() +_LIBCUDACXX_SPECIALIZE_GET(__nv_fp6x2_e3m2, __nv_fp6_e3m2) +_LIBCUDACXX_SPECIALIZE_GET(__nv_fp6x4_e3m2, __nv_fp6_e3m2) +_LIBCUDACXX_SPECIALIZE_GET(__nv_fp6x2_e2m3, __nv_fp6_e2m3) +_LIBCUDACXX_SPECIALIZE_GET(__nv_fp6x4_e2m3, __nv_fp6_e2m3) +# endif // _CCCL_HAS_NVFP6() +# if _CCCL_HAS_NVFP4() +_LIBCUDACXX_SPECIALIZE_GET(__nv_fp4x2_e2m1, __nv_fp4_e2m1) +_LIBCUDACXX_SPECIALIZE_GET(__nv_fp4x4_e2m1, __nv_fp4_e2m1) +# endif // _CCCL_HAS_NVFP4() _CCCL_END_NAMESPACE_CUDA_STD diff --git a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp new file mode 100644 index 00000000000..c393b99fcab --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp @@ -0,0 +1,679 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: nvrtc + +#include +#include +#include +#include +#include + +#include + +#include "test_macros.h" +#include + +template +using dlpack_array = cuda::std::array; + +//============================================================================== +// Test: Rank-0 mdspan conversion +//============================================================================== + +bool test_rank0() +{ + float data = 42.0f; + DLTensor tensor{}; + tensor.data = &data; + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 0; + tensor.dtype = DLDataType{DLDataTypeCode::kDLFloat, 32, 1}; + + auto host_mdspan = cuda::to_host_mdspan(tensor); + + assert(host_mdspan.rank() == 0); + assert(host_mdspan.size() == 1); + assert(host_mdspan.data_handle() == &data); + assert(host_mdspan() == 42.0f); + return true; +} + +//============================================================================== +// Test: Empty tensor (zero in one dimension) +//============================================================================== + +bool test_empty_tensor() +{ + int dummy = 0; // Non-null but won't be accessed + dlpack_array<2> shape = {0, 5}; + dlpack_array<2> strides = {5, 1}; // row-major + DLTensor tensor{}; + tensor.data = &dummy; + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + auto host_mdspan = cuda::to_host_mdspan(tensor); + + assert(host_mdspan.extent(0) == 0); + assert(host_mdspan.extent(1) == 5); + assert(host_mdspan.size() == 0); + assert(host_mdspan.empty()); + return true; +} + +//============================================================================== +// Test: Rank-1 mdspan with layout_right (row-major) +//============================================================================== + +bool test_rank1() +{ + cuda::std::array data = {1, 2, 3, 4, 5}; + dlpack_array<1> shape = {5}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = ::DLDataType{::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + auto host_mdspan_right = cuda::to_host_mdspan(tensor); + auto host_mdspan_left = cuda::to_host_mdspan(tensor); + auto host_mdspan_stride = cuda::to_host_mdspan(tensor); + + assert(host_mdspan_right.rank() == 1); + assert(host_mdspan_right.extent(0) == 5); + assert(host_mdspan_right.stride(0) == 1); + for (int i = 0; i < 5; ++i) + { + assert(host_mdspan_right(i) == data[i]); + } + assert(host_mdspan_left.rank() == 1); + assert(host_mdspan_left.extent(0) == 5); + assert(host_mdspan_left.stride(0) == 1); + for (int i = 0; i < 5; ++i) + { + assert(host_mdspan_left(i) == data[i]); + } + assert(host_mdspan_stride.rank() == 1); + assert(host_mdspan_stride.extent(0) == 5); + assert(host_mdspan_stride.stride(0) == 1); + for (int i = 0; i < 5; ++i) + { + assert(host_mdspan_stride(i) == data[i]); + } + return true; +} + +//============================================================================== +// Test: Rank-2 mdspan with layout_right (row-major) +//============================================================================== + +bool test_rank2_layout_right() +{ + // 2x3 matrix in row-major order + cuda::std::array data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + dlpack_array<2> shape = {2, 3}; + dlpack_array<2> strides = {3, 1}; // row-major 
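+  // For shape {2, 3} in row-major order the strides are {3, 1}: advancing one row skips 3 elements.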
+ DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = cuda::__data_type_to_dlpack(); + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + auto host_mdspan = cuda::to_host_mdspan(tensor); + + assert(host_mdspan.rank() == 2); + assert(host_mdspan.extent(0) == 2); + assert(host_mdspan.extent(1) == 3); + assert(host_mdspan.stride(0) == 3); // row stride + assert(host_mdspan.stride(1) == 1); // column stride + + // Check values: row-major layout + assert(host_mdspan(0, 0) == 1.0f); + assert(host_mdspan(0, 1) == 2.0f); + assert(host_mdspan(0, 2) == 3.0f); + assert(host_mdspan(1, 0) == 4.0f); + assert(host_mdspan(1, 1) == 5.0f); + assert(host_mdspan(1, 2) == 6.0f); + return true; +} + +//============================================================================== +// Test: Rank-2 mdspan with layout_left (column-major) +//============================================================================== + +bool test_rank2_layout_left() +{ + // 2x3 matrix in column-major order + cuda::std::array data = {1.0f, 4.0f, 2.0f, 5.0f, 3.0f, 6.0f}; + dlpack_array<2> shape = {2, 3}; + dlpack_array<2> strides = {1, 2}; // column-major + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = cuda::__data_type_to_dlpack(); + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + auto host_mdspan = cuda::to_host_mdspan(tensor); + + assert(host_mdspan.rank() == 2); + assert(host_mdspan.extent(0) == 2); + assert(host_mdspan.extent(1) == 3); + assert(host_mdspan.stride(0) == 1); // row stride + assert(host_mdspan.stride(1) == 2); // column stride + + // Check values: column-major layout + assert(host_mdspan(0, 0) == 1.0f); + assert(host_mdspan(0, 1) == 2.0f); + assert(host_mdspan(0, 2) == 3.0f); + assert(host_mdspan(1, 0) == 4.0f); + assert(host_mdspan(1, 1) == 5.0f); + assert(host_mdspan(1, 2) == 6.0f); + return true; +} + +//============================================================================== +// Test: Rank-2 mdspan with layout_stride (arbitrary strides) +//============================================================================== + +bool test_rank2_layout_stride() +{ + // 2x3 matrix with custom strides (e.g., padded) + cuda::std::array data = {1, 2, 3, 0, 4, 5, 6, 0}; // Each row padded to 4 elements + dlpack_array<2> shape = {2, 3}; + dlpack_array<2> strides = {4, 1}; // Row stride = 4 (padded), col stride = 1 + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = cuda::__data_type_to_dlpack(); + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + auto host_mdspan = cuda::to_host_mdspan(tensor); + + assert(host_mdspan.rank() == 2); + assert(host_mdspan.extent(0) == 2); + assert(host_mdspan.extent(1) == 3); + assert(host_mdspan.stride(0) == 4); + assert(host_mdspan.stride(1) == 1); + + assert(host_mdspan(0, 0) == 1); + assert(host_mdspan(0, 1) == 2); + assert(host_mdspan(0, 2) == 3); + assert(host_mdspan(1, 0) == 4); + assert(host_mdspan(1, 1) == 5); + assert(host_mdspan(1, 2) == 6); + return true; +} + +//============================================================================== +// Test: layout_stride with default (layout_right) strides when strides is nullptr +// Note: This tests the fallback behavior for DLPack < 1.2 +//============================================================================== + +#if !(DLPACK_MAJOR_VERSION == 1 && 
DLPACK_MINOR_VERSION >= 2) + +bool test_layout_stride_null_strides() +{ + cuda::std::array data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + dlpack_array<2> shape = {2, 3}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = cuda::__data_type_to_dlpack(); + tensor.shape = shape.data(); + tensor.strides = nullptr; // null strides + + auto host_mdspan = cuda::to_host_mdspan(tensor); + + // Should use row-major strides by default + assert(host_mdspan.stride(0) == 3); + assert(host_mdspan.stride(1) == 1); + return true; +} + +#endif // !(DLPACK_MAJOR_VERSION == 1 && DLPACK_MINOR_VERSION >= 2) + +//============================================================================== +// Test: byte_offset support +//============================================================================== + +bool test_byte_offset() +{ + cuda::std::array data = {0, 0, 1, 2, 3, 4, 5, 6}; + // Skip first 2 ints (8 bytes) + dlpack_array<1> shape = {6}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + tensor.byte_offset = sizeof(int) * 2; + + auto host_mdspan = cuda::to_host_mdspan(tensor); + + assert(host_mdspan.extent(0) == 6); + assert(host_mdspan(0) == 1); + assert(host_mdspan(5) == 6); + return true; +} + +//============================================================================== +// Exception tests +//============================================================================== + +void test_exception_wrong_rank() +{ + cuda::std::array data{}; + dlpack_array<2> shape = {2, 3}; + dlpack_array<2> strides = {3, 1}; // row-major + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + // Try to convert rank-2 tensor to rank-1 mdspan + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_wrong_dtype() +{ + cuda::std::array data{}; + dlpack_array<1> shape = {4}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; // dtype is int + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + // Try to convert int tensor to float mdspan + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_null_data() +{ + dlpack_array<1> shape = {4}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = nullptr; + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_null_shape() +{ + cuda::std::array data{}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = 
DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = nullptr; // null shape + + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_negative_shape() +{ + cuda::std::array data{}; + dlpack_array<1> shape = {-3}; // negative shape + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_wrong_device_type_host() +{ + cuda::std::array data{}; + dlpack_array<1> shape = {4}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{::kDLCUDA, 0}; // CUDA device, not CPU + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_wrong_device_type_device() +{ + cuda::std::array data{}; + dlpack_array<1> shape = {4}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; // CPU device, not CUDA + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + unused(cuda::to_device_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_wrong_device_type_managed() +{ + cuda::std::array data{}; + dlpack_array<1> shape = {4}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; // CPU device, not CUDA managed + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + unused(cuda::to_managed_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_stride_mismatch_layout_right() +{ + cuda::std::array data{}; + dlpack_array<2> shape = {2, 3}; + dlpack_array<2> strides = {1, 2}; // Column-major, not row-major + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLFloat, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_stride_mismatch_layout_left() +{ + cuda::std::array data{}; + dlpack_array<2> shape = {2, 3}; + dlpack_array<2> strides = {3, 1}; // Row-major, not column-major + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLFloat, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + 
unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_zero_stride_layout_stride() +{ + cuda::std::array data{}; + dlpack_array<2> shape = {2, 3}; + dlpack_array<2> strides = {0, 1}; // Zero stride is invalid + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_null_strides_dlpack_v12() +{ + cuda::std::array data{}; + dlpack_array<2> shape = {2, 3}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLFloat, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = nullptr; // null strides not allowed in DLPack v1.2+ + + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_misaligned_data() +{ + // Create a buffer that allows us to get a misaligned pointer + alignas(16) cuda::std::array buffer{}; + // Get a pointer that's 1 byte into the buffer (misaligned for int) + auto misaligned_ptr = reinterpret_cast(buffer.data() + 1); + dlpack_array<1> shape = {3}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = misaligned_ptr; + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +bool test_exceptions() +{ + test_exception_wrong_rank(); + test_exception_wrong_dtype(); + test_exception_null_data(); + test_exception_null_shape(); + test_exception_negative_shape(); + test_exception_wrong_device_type_host(); + test_exception_wrong_device_type_device(); + test_exception_wrong_device_type_managed(); + test_exception_stride_mismatch_layout_right(); + test_exception_stride_mismatch_layout_left(); + test_exception_zero_stride_layout_stride(); +#if DLPACK_MAJOR_VERSION > 1 || (DLPACK_MAJOR_VERSION == 1 && DLPACK_MINOR_VERSION >= 2) + test_exception_null_strides_dlpack_v12(); +#endif + test_exception_misaligned_data(); + return true; +} + +//============================================================================== +// Test: Return type checking +//============================================================================== + +bool test_return_types() +{ + cuda::std::array data{}; + dlpack_array<1> shape = {4}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = cuda::__data_type_to_dlpack(); + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + // Check return type of to_host_mdspan + auto host_ms = cuda::to_host_mdspan(tensor); + static_assert( + cuda::std::is_same_v, cuda::std::layout_stride>>); + assert(host_ms.extent(0) == 4); + + auto host_ms_right = cuda::to_host_mdspan(tensor); + static_assert( + cuda::std::is_same_v, cuda::std::layout_right>>); + assert(host_ms_right.extent(0) == 
4); + return true; +} + +int main(int, char**) +{ + NV_IF_TARGET( + NV_IS_HOST, + (assert(test_rank0()); // + assert(test_empty_tensor()); + assert(test_rank1()); + assert(test_rank2_layout_right()); + assert(test_rank2_layout_left()); + assert(test_rank2_layout_stride()); + assert(test_byte_offset()); + assert(test_return_types()); + assert(test_exceptions());)) +#if !(DLPACK_MAJOR_VERSION > 1 || (DLPACK_MAJOR_VERSION == 1 && DLPACK_MINOR_VERSION >= 2)) + NV_IF_TARGET(NV_IS_HOST, (assert(test_layout_stride_null_strides());)) +#endif + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.pass.cpp b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.pass.cpp new file mode 100644 index 00000000000..b066a9b2569 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.pass.cpp @@ -0,0 +1,527 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: nvrtc + +#include +#include +#include +#include +#include +#include + +#include "test_macros.h" + +void check_datatype(const DLDataType& dt, uint8_t code, uint8_t bits, uint16_t lanes) +{ + assert(dt.code == code); + assert(dt.bits == bits); + assert(dt.lanes == lanes); +} + +bool test_mdspan_to_dlpack_host_layout_right() +{ + using extents_t = cuda::std::extents; + int data[6] = {0, 1, 2, 3, 4, 5}; + cuda::host_mdspan md{data, extents_t{}}; + auto dlpack_wrapper = cuda::to_dlpack(md); + const auto& dltensor = dlpack_wrapper.get(); + + assert(dltensor.device.device_type == kDLCPU); + assert(dltensor.device.device_id == 0); + assert(dltensor.ndim == 2); + check_datatype(dltensor.dtype, kDLInt, 32, 1); + assert(dltensor.shape != nullptr); + assert(dltensor.strides != nullptr); + assert(dltensor.shape[0] == 2); + assert(dltensor.shape[1] == 3); + assert(dltensor.strides[0] == 3); + assert(dltensor.strides[1] == 1); + assert(dltensor.byte_offset == 0); + assert(dltensor.data == data); + return true; +} + +bool test_mdspan_to_dlpack_host_layout_left() +{ + using extents_t = cuda::std::extents; + int data[6] = {0, 1, 2, 3, 4, 5}; + cuda::host_mdspan md{data, extents_t{}}; + auto dlpack_wrapper = cuda::to_dlpack(md); + const auto& dltensor = dlpack_wrapper.get(); + + assert(dltensor.device.device_type == kDLCPU); + assert(dltensor.device.device_id == 0); + check_datatype(dltensor.dtype, kDLInt, 32, 1); + assert(dltensor.ndim == 2); + assert(dltensor.shape != nullptr); + assert(dltensor.strides != nullptr); + assert(dltensor.shape[0] == 2); + assert(dltensor.shape[1] == 3); + assert(dltensor.strides[0] == 1); + assert(dltensor.strides[1] == 2); + assert(dltensor.byte_offset == 0); + assert(dltensor.data == data); + return true; +} + +bool test_mdspan_to_dlpack_empty_size() +{ + using extents_t = cuda::std::dims<2>; + int data[1] = {42}; + cuda::host_mdspan m{data, extents_t{0, 3}}; + auto dlpack_wrapper = cuda::to_dlpack(m); + const auto& dltensor = dlpack_wrapper.get(); + + assert(dltensor.device.device_type == kDLCPU); + assert(dltensor.device.device_id == 0); + 
check_datatype(dltensor.dtype, kDLInt, 32, 1); + assert(dltensor.ndim == 2); + assert(dltensor.shape[0] == 0); + assert(dltensor.shape[1] == 3); + assert(dltensor.strides[0] == 3); + assert(dltensor.strides[1] == 1); + assert(dltensor.byte_offset == 0); + assert(dltensor.data == nullptr); // size() == 0 => nullptr + return true; +} + +bool test_mdspan_to_dlpack_rank_0() +{ + using extents_t = cuda::std::extents; + int data[1] = {7}; + cuda::host_mdspan md{data, extents_t{}}; + auto dlpack_wrapper = cuda::to_dlpack(md); + const auto& dltensor = dlpack_wrapper.get(); + + assert(dltensor.device.device_type == kDLCPU); + assert(dltensor.device.device_id == 0); + check_datatype(dltensor.dtype, kDLInt, 32, 1); + assert(dltensor.ndim == 0); + assert(dltensor.shape == nullptr); + assert(dltensor.strides == nullptr); + assert(dltensor.byte_offset == 0); + assert(dltensor.data == data); // rank-0 mdspan has size() == 1 + return true; +} + +bool test_mdspan_to_dlpack_const_pointer() +{ + using extents_t = cuda::std::dims<3>; + const int data[24] = {0, 1, 2, 3, 4, 5}; + cuda::host_mdspan md{data, extents_t{2, 3, 4}}; + auto dlpack_wrapper = cuda::to_dlpack(md); + const auto& dltensor = dlpack_wrapper.get(); + + assert(dltensor.device.device_type == kDLCPU); + assert(dltensor.device.device_id == 0); + check_datatype(dltensor.dtype, kDLInt, 32, 1); + assert(dltensor.ndim == 3); + assert(dltensor.shape[0] == 2); + assert(dltensor.shape[1] == 3); + assert(dltensor.shape[2] == 4); + assert(dltensor.strides[0] == 12); + assert(dltensor.strides[1] == 4); + assert(dltensor.strides[2] == 1); + assert(dltensor.byte_offset == 0); + assert(dltensor.data == data); // const data pointer is preserved + return true; +} + +bool test_mdspan_to_dlpack_device() +{ + using extents_t = cuda::std::extents; + float* data = nullptr; + assert(cudaMalloc(&data, 6 * sizeof(float)) == cudaSuccess); + cuda::device_mdspan md{data, extents_t{}}; + auto dlpack_wrapper = cuda::to_dlpack(md, cuda::device_ref{0}); + const auto& dltensor = dlpack_wrapper.get(); + + assert(dltensor.device.device_type == kDLCUDA); + assert(dltensor.device.device_id == 0); + assert(dltensor.ndim == 2); + check_datatype(dltensor.dtype, kDLFloat, 32, 1); + assert(dltensor.shape[0] == 2); + assert(dltensor.shape[1] == 3); + assert(dltensor.strides[0] == 3); + assert(dltensor.strides[1] == 1); + assert(dltensor.byte_offset == 0); + assert(dltensor.data == data); + assert(cudaFree(data) == cudaSuccess); + return true; +} + +bool test_mdspan_to_dlpack_managed() +{ + using extents_t = cuda::std::extents; + float* data = nullptr; + assert(cudaMallocManaged(&data, 6 * sizeof(float)) == cudaSuccess); + cuda::managed_mdspan md{data, extents_t{}}; + auto dlpack_wrapper = cuda::to_dlpack(md); + const auto& dltensor = dlpack_wrapper.get(); + + assert(dltensor.device.device_type == kDLCUDAManaged); + assert(dltensor.device.device_id == 0); + assert(dltensor.ndim == 2); + check_datatype(dltensor.dtype, kDLFloat, 32, 1); + assert(dltensor.shape[0] == 2); + assert(dltensor.shape[1] == 3); + assert(dltensor.strides[0] == 3); + assert(dltensor.strides[1] == 1); + assert(dltensor.byte_offset == 0); + assert(dltensor.data == data); + assert(cudaFree(data) == cudaSuccess); + return true; +} + +template +struct test_mdspan_to_dlpack_types_fn +{ + using list_t = ListT; + + cuda::std::array expected_types; + + template + void call_impl() const + { + using T = cuda::std::__type_at_c; + using extents_t = cuda::std::extents; + T* data = nullptr; + cuda::host_mdspan md{data, extents_t{}}; + auto dlpack_wrapper = cuda::to_dlpack(md); + const auto& dltensor =
dlpack_wrapper.get(); + + auto type = expected_types[index]; + check_datatype(dltensor.dtype, type.code, type.bits, type.lanes); + } + + template + void call(cuda::std::index_sequence) const + { + (call_impl(), ...); + } +}; + +bool test_mdspan_to_dlpack_basic_types() +{ + using list_t = cuda::std::__type_list< + bool, + signed char, + short, + int, + long, + long long, +#if _CCCL_HAS_INT128() + __int128_t, +#endif + // Unsigned integer types + unsigned char, + unsigned short, + unsigned int, + unsigned long, + unsigned long long, +#if _CCCL_HAS_INT128() + __uint128_t, +#endif + // Floating-point types + float, + double +#if _CCCL_HAS_FLOAT128() + , + __float128 +#endif + >; + cuda::std::array expected_types = { + DLDataType{kDLBool, 8, 1}, + // Signed integer types + DLDataType{kDLInt, 8, 1}, + DLDataType{kDLInt, 16, 1}, + DLDataType{kDLInt, 32, 1}, + DLDataType{kDLInt, sizeof(long) * 8, 1}, + DLDataType{kDLInt, 64, 1}, +#if _CCCL_HAS_INT128() + DLDataType{kDLInt, 128, 1}, +#endif + // Unsigned integer types + DLDataType{kDLUInt, 8, 1}, + DLDataType{kDLUInt, 16, 1}, + DLDataType{kDLUInt, 32, 1}, + DLDataType{kDLUInt, sizeof(unsigned long) * 8, 1}, + DLDataType{kDLUInt, 64, 1}, +#if _CCCL_HAS_INT128() + DLDataType{kDLUInt, 128, 1}, +#endif + // Floating-point types + DLDataType{kDLFloat, 32, 1}, + DLDataType{kDLFloat, 64, 1}, +#if _CCCL_HAS_FLOAT128() + DLDataType{kDLFloat, 128, 1}, +#endif + }; + test_mdspan_to_dlpack_types_fn test_fn{expected_types}; + test_fn.call(cuda::std::make_index_sequence{}); + return true; +} + +bool test_mdspan_to_dlpack_extended_fp_and_complex_types() +{ + using list_t = cuda::std::__type_list< +#if _CCCL_HAS_NVFP16() + ::__half, +#endif +#if _CCCL_HAS_NVBF16() + ::__nv_bfloat16, +#endif +#if _CCCL_HAS_FLOAT128() + __float128, +#endif + // Low-precision floating-point types +#if _CCCL_HAS_NVFP8_E4M3() + ::__nv_fp8_e4m3, +#endif +#if _CCCL_HAS_NVFP8_E5M2() + ::__nv_fp8_e5m2, +#endif +#if _CCCL_HAS_NVFP8_E8M0() + ::__nv_fp8_e8m0, +#endif +#if _CCCL_HAS_NVFP6_E2M3() + ::__nv_fp6_e2m3, +#endif +#if _CCCL_HAS_NVFP6_E3M2() + ::__nv_fp6_e3m2, +#endif +#if _CCCL_HAS_NVFP4_E2M1() + ::__nv_fp4_e2m1, +#endif + // Complex types +#if _CCCL_HAS_NVFP16() + cuda::std::complex<::__half>, +#endif + cuda::std::complex, + cuda::std::complex>; + cuda::std::array expected_types = { +#if _CCCL_HAS_NVFP16() + DLDataType{kDLFloat, 16, 1}, +#endif +#if _CCCL_HAS_NVBF16() + DLDataType{kDLBfloat, 16, 1}, +#endif +#if _CCCL_HAS_FLOAT128() + DLDataType{kDLFloat, 128, 1}, +#endif + // Low-precision floating-point types +#if _CCCL_HAS_NVFP8_E4M3() + DLDataType{kDLFloat8_e4m3fn, 8, 1}, +#endif +#if _CCCL_HAS_NVFP8_E5M2() + DLDataType{kDLFloat8_e5m2, 8, 1}, +#endif +#if _CCCL_HAS_NVFP8_E8M0() + DLDataType{kDLFloat8_e8m0fnu, 8, 1}, +#endif +#if _CCCL_HAS_NVFP6_E2M3() + DLDataType{kDLFloat6_e2m3fn, 6, 1}, +#endif +#if _CCCL_HAS_NVFP6_E3M2() + DLDataType{kDLFloat6_e3m2fn, 6, 1}, +#endif +#if _CCCL_HAS_NVFP4_E2M1() + DLDataType{kDLFloat4_e2m1fn, 4, 1}, +#endif + // Complex types +#if _CCCL_HAS_NVFP16() + DLDataType{kDLComplex, 32, 1}, +#endif + DLDataType{kDLComplex, 64, 1}, + DLDataType{kDLComplex, 128, 1}}; + test_mdspan_to_dlpack_types_fn test_fn{expected_types}; + test_fn.call(cuda::std::make_index_sequence{}); + return true; +} + +#if _CCCL_HAS_CTK() +bool test_mdspan_to_dlpack_vector_types() +{ + using list_t = cuda::std::__type_list< + ::char2, + ::char4, + ::uchar2, + ::uchar4, + ::short2, + ::short4, + ::ushort2, + ::ushort4, + ::int2, + ::int4, + ::uint2, + ::uint4, + ::long2 +# if 
_CCCL_CTK_AT_LEAST(13, 0) + , + ::long4_32a +# else + , + ::long4 +# endif + , + ::ulong2 +# if _CCCL_CTK_AT_LEAST(13, 0) + , + ::ulong4_32a +# else + , + ::ulong4 +# endif + , + ::longlong2 +# if _CCCL_CTK_AT_LEAST(13, 0) + , + ::longlong4_32a +# else + , + ::longlong4 +# endif + , + ::ulonglong2 +# if _CCCL_CTK_AT_LEAST(13, 0) + , + ::ulonglong4_32a +# else + , + ::ulonglong4 +# endif + , + ::float2, + ::float4, + ::double2 +# if _CCCL_CTK_AT_LEAST(13, 0) + , + ::double4_32a +# else + , + ::double4 +# endif + >; + cuda::std::array expected_types = { + DLDataType{kDLInt, 8, 2}, + DLDataType{kDLInt, 8, 4}, + DLDataType{kDLUInt, 8, 2}, + DLDataType{kDLUInt, 8, 4}, + DLDataType{kDLInt, 16, 2}, + DLDataType{kDLInt, 16, 4}, + DLDataType{kDLUInt, 16, 2}, + DLDataType{kDLUInt, 16, 4}, + DLDataType{kDLInt, 32, 2}, + DLDataType{kDLInt, 32, 4}, + DLDataType{kDLUInt, 32, 2}, + DLDataType{kDLUInt, 32, 4}, + DLDataType{kDLInt, sizeof(long) * 8, 2}, + DLDataType{kDLInt, sizeof(long) * 8, 4}, + DLDataType{kDLUInt, sizeof(unsigned long) * 8, 2}, + DLDataType{kDLUInt, sizeof(unsigned long) * 8, 4}, + DLDataType{kDLInt, 64, 2}, + DLDataType{kDLInt, 64, 4}, + DLDataType{kDLUInt, 64, 2}, + DLDataType{kDLUInt, 64, 4}, + DLDataType{kDLFloat, 32, 2}, + DLDataType{kDLFloat, 32, 4}, + DLDataType{kDLFloat, 64, 2}, + DLDataType{kDLFloat, 64, 4}}; + test_mdspan_to_dlpack_types_fn test_fn{expected_types}; + test_fn.call(cuda::std::make_index_sequence{}); + return true; +} +#endif // _CCCL_HAS_CTK() + +bool test_mdspan_to_dlpack_extended_fp_vector_types() +{ + using list_t = cuda::std::__type_list< +#if _CCCL_HAS_NVFP16() + ::__half2, +#endif +#if _CCCL_HAS_NVBF16() + ::__nv_bfloat162, +#endif +#if _CCCL_HAS_NVFP8_E4M3() + ::__nv_fp8x2_e4m3, + ::__nv_fp8x4_e4m3, +#endif +#if _CCCL_HAS_NVFP8_E5M2() + ::__nv_fp8x2_e5m2, + ::__nv_fp8x4_e5m2, +#endif +#if _CCCL_HAS_NVFP8_E8M0() + ::__nv_fp8x2_e8m0, + ::__nv_fp8x4_e8m0, +#endif +#if _CCCL_HAS_NVFP6_E2M3() + ::__nv_fp6x2_e2m3, + ::__nv_fp6x4_e2m3, +#endif +#if _CCCL_HAS_NVFP6_E3M2() + ::__nv_fp6x2_e3m2, + ::__nv_fp6x4_e3m2, +#endif +#if _CCCL_HAS_NVFP4_E2M1() + ::__nv_fp4x2_e2m1, + ::__nv_fp4x4_e2m1, +#endif + void* /* dummy to allow trailing commas */>; + cuda::std::array expected_types = { +#if _CCCL_HAS_NVFP16() + DLDataType{kDLFloat, 16, 2}, +#endif +#if _CCCL_HAS_NVBF16() + DLDataType{kDLBfloat, 16, 2}, +#endif +#if _CCCL_HAS_NVFP8_E4M3() + DLDataType{kDLFloat8_e4m3fn, 8, 2}, + DLDataType{kDLFloat8_e4m3fn, 8, 4}, +#endif +#if _CCCL_HAS_NVFP8_E5M2() + DLDataType{kDLFloat8_e5m2, 8, 2}, + DLDataType{kDLFloat8_e5m2, 8, 4}, +#endif +#if _CCCL_HAS_NVFP8_E8M0() + DLDataType{kDLFloat8_e8m0fnu, 8, 2}, + DLDataType{kDLFloat8_e8m0fnu, 8, 4}, +#endif +#if _CCCL_HAS_NVFP6_E2M3() + DLDataType{kDLFloat6_e2m3fn, 6, 2}, + DLDataType{kDLFloat6_e2m3fn, 6, 4}, +#endif +#if _CCCL_HAS_NVFP6_E3M2() + DLDataType{kDLFloat6_e3m2fn, 6, 2}, + DLDataType{kDLFloat6_e3m2fn, 6, 4}, +#endif +#if _CCCL_HAS_NVFP4_E2M1() + DLDataType{kDLFloat4_e2m1fn, 4, 2}, + DLDataType{kDLFloat4_e2m1fn, 4, 4}, +#endif + }; + test_mdspan_to_dlpack_types_fn test_fn{expected_types}; + test_fn.call(cuda::std::make_index_sequence{}); + return true; +} + +int main(int, char**) +{ + NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_host_layout_right());)) + NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_host_layout_left());)) + NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_empty_size());)) + NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_rank_0());)) + NV_IF_TARGET(NV_IS_HOST, 
(assert(test_mdspan_to_dlpack_const_pointer());)) + NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_device());)) + NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_managed());)) + NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_basic_types());)) + NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_extended_fp_and_complex_types());)) +#if _CCCL_HAS_CTK() + NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_vector_types());)) +#endif // _CCCL_HAS_CTK() + NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_extended_fp_vector_types());)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.wrapper.pass.cpp b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.wrapper.pass.cpp new file mode 100644 index 00000000000..e54cf6c93b6 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.wrapper.pass.cpp @@ -0,0 +1,214 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: nvrtc + +#include +#include +#include +#include +#include +#include +#include + +#include "test_macros.h" + +void check_datatype(const DLDataType& dt, uint8_t code, uint8_t bits, uint16_t lanes) +{ + assert(dt.code == code); + assert(dt.bits == bits); + assert(dt.lanes == lanes); +} + +bool test_mdspan_to_dlpack_wrapper_default_ctor() +{ + cuda::__dlpack_tensor<3> dlpack_wrapper{}; + DLDataType default_dtype = {}; + DLDevice default_device = {}; + auto& tensor = dlpack_wrapper.get(); + assert(tensor.device.device_type == default_device.device_type); + assert(tensor.device.device_id == default_device.device_id); + check_datatype(tensor.dtype, default_dtype.code, default_dtype.bits, default_dtype.lanes); + assert(tensor.shape != nullptr); + assert(tensor.strides != nullptr); + return true; +} + +bool test_dlpack_wrapper_copy_ctor() +{ + using extents_t = cuda::std::extents; + int data[6] = {0, 1, 2, 3, 4, 5}; + cuda::host_mdspan md{data, extents_t{}}; + auto w = cuda::to_dlpack(md); + auto& t = w.get(); + auto* shape_ptr = t.shape; + auto* strides_ptr = t.strides; + + auto w2 = w; // copy construct + // Copy must not alias the source wrapper's shape/stride storage. + auto& t2 = w2.get(); + assert(t2.shape != nullptr); + assert(t2.strides != nullptr); + assert(t2.shape != shape_ptr); + assert(t2.strides != strides_ptr); + + // Source wrapper must remain intact. + assert(t.shape == shape_ptr); + assert(t.strides == strides_ptr); + + // Sanity-check copied tensor metadata and values. 
+ assert(t2.device.device_type == kDLCPU); + assert(t2.device.device_id == 0); + assert(t2.ndim == 2); + check_datatype(t2.dtype, kDLInt, 32, 1); + assert(t2.shape[0] == 2); + assert(t2.shape[1] == 3); + assert(t2.strides[0] == 3); + assert(t2.strides[1] == 1); + assert(t2.byte_offset == 0); + assert(t2.data == data); + return true; +} + +bool test_dlpack_wrapper_move_ctor() +{ + using extents_t = cuda::std::extents; + int data[6] = {0, 1, 2, 3, 4, 5}; + cuda::host_mdspan md{data, extents_t{}}; + auto w = cuda::to_dlpack(md); + auto& t = w.get(); + auto* shape_ptr = t.shape; + auto* strides_ptr = t.strides; + auto moved = cuda::std::move(w); // move construct + + // Moved-to wrapper must not keep pointers to moved-from storage. + auto& tm = moved.get(); + assert(tm.shape != nullptr); + assert(tm.strides != nullptr); + assert(tm.shape != shape_ptr); + assert(tm.strides != strides_ptr); + + // Moved-from wrapper is explicitly reset to a default/empty DLTensor. + assert(t.shape == nullptr); + assert(t.strides == nullptr); + assert(t.data == nullptr); + assert(t.ndim == 0); + + // Sanity-check moved-to tensor metadata and values. + assert(tm.device.device_type == kDLCPU); + assert(tm.device.device_id == 0); + assert(tm.ndim == 2); + check_datatype(tm.dtype, kDLInt, 32, 1); + assert(tm.shape[0] == 2); + assert(tm.shape[1] == 3); + assert(tm.strides[0] == 3); + assert(tm.strides[1] == 1); + assert(tm.byte_offset == 0); + assert(tm.data == data); + return true; +} + +bool test_dlpack_wrapper_copy_assignment() +{ + using extents_t = cuda::std::extents; + int data_a[6] = {0, 1, 2, 3, 4, 5}; + int data_b[6] = {6, 7, 8, 9, 10, 11}; + cuda::host_mdspan md_a{data_a, extents_t{}}; + cuda::host_mdspan md_b{data_b, extents_t{}}; + auto a = cuda::to_dlpack(md_a); + auto b = cuda::to_dlpack(md_b); + auto& ta = a.get(); + auto& tb = b.get(); + auto* b_shape_ptr = tb.shape; + auto* b_strides_ptr = tb.strides; + + b = a; // copy assign + // Destination must keep pointing to its own member arrays (not to `a`). + assert(tb.shape == b_shape_ptr); + assert(tb.strides == b_strides_ptr); + assert(tb.shape != ta.shape); + assert(tb.strides != ta.strides); + + // Values must be copied correctly. + assert(tb.data == data_a); + assert(tb.ndim == 2); + assert(tb.shape[0] == 2); + assert(tb.shape[1] == 3); + assert(tb.strides[0] == 3); + assert(tb.strides[1] == 1); + return true; +} + +bool test_dlpack_wrapper_move_assignment() +{ + using extents_t = cuda::std::extents; + int data_a[6] = {0, 1, 2, 3, 4, 5}; + int data_b[6] = {6, 7, 8, 9, 10, 11}; + cuda::host_mdspan md_a{data_a, extents_t{}}; + cuda::host_mdspan md_b{data_b, extents_t{}}; + auto a = cuda::to_dlpack(md_a); + auto b = cuda::to_dlpack(md_b); + auto& ta = a.get(); + auto& tb = b.get(); + auto* a_shape_ptr = ta.shape; + auto* a_strides_ptr = ta.strides; + auto* b_shape_ptr = tb.shape; + auto* b_strides_ptr = tb.strides; + + b = cuda::std::move(a); // move assign + // Destination must keep pointing to its own member arrays, not the source's. + assert(tb.shape == b_shape_ptr); + assert(tb.strides == b_strides_ptr); + assert(tb.shape != a_shape_ptr); + assert(tb.strides != a_strides_ptr); + + // Source must be reset. + assert(ta.shape == nullptr); + assert(ta.strides == nullptr); + assert(ta.data == nullptr); + assert(ta.ndim == 0); + + // Values must be moved correctly. 
+ assert(tb.data == data_a); + assert(tb.ndim == 2); + assert(tb.shape[0] == 2); + assert(tb.shape[1] == 3); + assert(tb.strides[0] == 3); + assert(tb.strides[1] == 1); + return true; +} + +bool test_dlpack_wrapper_get() +{ + using wrapper_t = cuda::__dlpack_tensor<2>; + static_assert(cuda::std::is_same_v().get()), ::DLTensor&>); + static_assert(cuda::std::is_same_v().get()), const ::DLTensor&>); + + wrapper_t w{}; + // Mutating through the reference returned by `get()` must be observable. + auto& t = w.get(); + t.ndim = 123; + assert(w.get().ndim == 123); + + // Const overload should also alias the same underlying object. + const wrapper_t& cw = w; + assert(&cw.get() == &w.get()); + return true; +} + +int main(int, char**) +{ + NV_IF_TARGET( + NV_IS_HOST, + (assert(test_mdspan_to_dlpack_wrapper_default_ctor()); assert(test_dlpack_wrapper_copy_ctor()); + assert(test_dlpack_wrapper_move_ctor()); + assert(test_dlpack_wrapper_copy_assignment()); + assert(test_dlpack_wrapper_move_assignment()); + assert(test_dlpack_wrapper_get());)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/libcxx/type_traits/vector_type.compile.pass.cpp b/libcudacxx/test/libcudacxx/libcxx/type_traits/vector_type.compile.pass.cpp index 406b6157cbf..1561857be02 100644 --- a/libcudacxx/test/libcudacxx/libcxx/type_traits/vector_type.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/libcxx/type_traits/vector_type.compile.pass.cpp @@ -29,31 +29,61 @@ __host__ __device__ void test() test(); test(); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + test(); test(); test(); test(); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + test(); test(); test(); test(); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + test(); test(); test(); test(); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + test(); test(); test(); test(); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + test(); test(); test(); test(); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + test(); test(); test(); @@ -63,6 +93,16 @@ __host__ __device__ void test() test(); #endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^ + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); +#if _CCCL_CTK_AT_LEAST(13, 0) + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); +#else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv + static_assert(cuda::__is_vector_type_v); +#endif // _CCCL_CTK_AT_LEAST(13, 0) + test(); test(); test(); @@ -72,6 +112,16 @@ __host__ __device__ void test() test(); #endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^ + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); +#if _CCCL_CTK_AT_LEAST(13, 0) + static_assert(cuda::__is_vector_type_v); 
+ static_assert(cuda::__is_vector_type_v); +#else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv + static_assert(cuda::__is_vector_type_v); +#endif // _CCCL_CTK_AT_LEAST(13, 0) + test(); test(); test(); @@ -81,6 +131,16 @@ __host__ __device__ void test() test(); #endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^ + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); +#if _CCCL_CTK_AT_LEAST(13, 0) + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); +#else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv + static_assert(cuda::__is_vector_type_v); +#endif // _CCCL_CTK_AT_LEAST(13, 0) + test(); test(); test(); @@ -90,11 +150,26 @@ __host__ __device__ void test() test(); #endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^ + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); +#if _CCCL_CTK_AT_LEAST(13, 0) + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); +#else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv + static_assert(cuda::__is_vector_type_v); +#endif // _CCCL_CTK_AT_LEAST(13, 0) + test(); test(); test(); test(); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + test(); test(); test(); @@ -104,11 +179,26 @@ __host__ __device__ void test() test(); #endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^ + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); +#if _CCCL_CTK_AT_LEAST(13, 0) + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); +#else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv + static_assert(cuda::__is_vector_type_v); +#endif // _CCCL_CTK_AT_LEAST(13, 0) + + static_assert(cuda::__is_vector_type_v); + // 2. Test invalid combinations test(); test(); test(); + + static_assert(!cuda::__is_vector_type_v); + static_assert(!cuda::__is_vector_type_v); } int main(int, char**)