Ensure that cuda_memory_resource allocates memory on the proper dev…

…ice (#2073) * Ensure that `cuda_memory_resource` allocates memory on the proper device * Move `__ensure_current_device` to own header
NVIDIA · Aug 1, 2024 · 39b926a · 39b926a
1 parent 2600135
commit 39b926a
Show file tree

Hide file tree

Showing 4 changed files with 99 additions and 38 deletions.
diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh
@@ -53,6 +53,7 @@
 // for backward compatibility
 #include <cub/util_temporary_storage.cuh>
 
+#include <cuda/std/__cuda/ensure_current_device.h>
 #include <cuda/std/type_traits>
 #include <cuda/std/utility>
 
@@ -105,36 +106,11 @@ CUB_RUNTIME_FUNCTION inline int CurrentDevice()
 }
 
 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
-/**
- * \brief RAII helper which saves the current device and switches to the
- *        specified device on construction and switches to the saved device on
- *        destruction.
- */
-struct SwitchDevice
-{
-private:
-  int const old_device;
-  bool const needs_reset;
 
-public:
-  _CCCL_HOST inline SwitchDevice(int new_device)
-      : old_device(CurrentDevice())
-      , needs_reset(old_device != new_device)
-  {
-    if (needs_reset)
-    {
-      CubDebug(cudaSetDevice(new_device));
-    }
-  }
+//! @brief RAII helper which saves the current device and switches to the specified device on construction and switches
+//! to the saved device on destruction.
+using SwitchDevice = ::cuda::__ensure_current_device;
 
-  _CCCL_HOST inline ~SwitchDevice()
-  {
-    if (needs_reset)
-    {
-      CubDebug(cudaSetDevice(old_device));
-    }
-  }
-};
 #endif // DOXYGEN_SHOULD_SKIP_THIS
 
 /**

diff --git a/libcudacxx/include/cuda/__memory_resource/cuda_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/cuda_memory_resource.h
@@ -32,15 +32,30 @@
 #  include <cuda/__memory_resource/resource.h>
 #  include <cuda/__memory_resource/resource_ref.h>
 #  include <cuda/std/__cuda/api_wrapper.h>
+#  include <cuda/std/__cuda/ensure_current_device.h>
 #  include <cuda/std/__new/bad_alloc.h>
 
 #  if _CCCL_STD_VER >= 2014
 
 _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_MR
 
 //! @brief cuda_memory_resource uses `cudaMalloc` / `cudaFree` for allocation / deallocation.
-struct cuda_memory_resource
+//! By default uses device 0 to allocate memory
+class cuda_memory_resource
 {
+private:
+  int __device_id_{0};
+
+public:
+  //! @brief default constructs a cuda_memory_resource allocating memory on device 0
+  cuda_memory_resource() = default;
+
+  //! @brief constructs a cuda_memory_resource allocating memory on device \p __device_id
+  //! @param __device_id The id of the device we are allocating memory on
+  constexpr cuda_memory_resource(const int __device_id) noexcept
+      : __device_id_(__device_id)
+  {}
+
   //! @brief Allocate device memory of size at least \p __bytes.
   //! @param __bytes The size in bytes of the allocation.
   //! @param __alignment The requested alignment of the allocation.
@@ -54,6 +69,9 @@ struct cuda_memory_resource
       _CUDA_VSTD::__throw_bad_alloc();
     }
 
+    // We need to ensure that we allocate on the right device as `cudaMalloc` always uses the current device
+    __ensure_current_device __device_wrapper{__device_id_};
+
     void* __ptr{nullptr};
     _CCCL_TRY_CUDA_API(::cudaMalloc, "Failed to allocate memory with cudaMalloc.", &__ptr, __bytes);
     return __ptr;
@@ -73,17 +91,19 @@ struct cuda_memory_resource
   }
 
   //! @brief Equality comparison with another \c cuda_memory_resource
-  //! @return true
-  _CCCL_NODISCARD constexpr bool operator==(cuda_memory_resource const&) const noexcept
+  //! @param __other The other \c cuda_memory_resource
+  //! @return true, if both resources hold the same device id
+  _CCCL_NODISCARD constexpr bool operator==(cuda_memory_resource const& __other) const noexcept
   {
-    return true;
+    return __device_id_ == __other.__device_id_;
   }
 #    if _CCCL_STD_VER <= 2017
   //! @brief Inequality comparison with another \c cuda_memory_resource
-  //! @return false
-  _CCCL_NODISCARD constexpr bool operator!=(cuda_memory_resource const&) const noexcept
+  //! @param __other The other \c cuda_memory_resource
+  //! @return true, if both resources hold different device ids
+  _CCCL_NODISCARD constexpr bool operator!=(cuda_memory_resource const& __other) const noexcept
   {
-    return false;
+    return __device_id_ != __other.__device_id_;
   }
 #    endif // _CCCL_STD_VER <= 2017
 

diff --git a/libcudacxx/include/cuda/std/__cuda/ensure_current_device.h b/libcudacxx/include/cuda/std/__cuda/ensure_current_device.h
@@ -0,0 +1,65 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA__STD__CUDA_ENSURE_CURRENT_DEVICE_H
+#define _CUDA__STD__CUDA_ENSURE_CURRENT_DEVICE_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if !defined(_CCCL_CUDA_COMPILER_NVCC) && !defined(_CCCL_CUDA_COMPILER_NVHPC)
+#  include <cuda_runtime_api.h>
+#endif // !_CCCL_CUDA_COMPILER_NVCC && !_CCCL_CUDA_COMPILER_NVHPC
+
+#include <cuda/std/__cuda/api_wrapper.h>
+#include <cuda/std/__exception/cuda_error.h>
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
+
+//! @brief `__ensure_current_device` is a simple helper that ensures the current device is set to the right one.
+//! Only changes the current device if the target device is not the current one
+struct __ensure_current_device
+{
+  int __target_device_   = 0;
+  int __original_device_ = 0;
+
+  //! @brief Queries the current device and if that is different than \p __target_device sets the current device to
+  //! \p __target_device
+  __ensure_current_device(const int __target_device)
+      : __target_device_(__target_device)
+  {
+    _CCCL_TRY_CUDA_API(::cudaGetDevice, "Failed to query current device", &__original_device_);
+    if (__original_device_ != __target_device_)
+    {
+      _CCCL_TRY_CUDA_API(::cudaSetDevice, "Failed to set device", __target_device_);
+    }
+  }
+
+  //! @brief If the original device differs from the target device, sets the current device back to the original one.
+  //! Marked `noexcept(false)` so that an exception from `_CCCL_TRY_CUDA_API`, if any, does not call std::terminate.
+  ~__ensure_current_device() noexcept(false)
+  {
+    if (__original_device_ != __target_device_)
+    {
+      _CCCL_TRY_CUDA_API(::cudaSetDevice, "Failed to set device", __original_device_);
+    }
+  }
+};
+
+_LIBCUDACXX_END_NAMESPACE_CUDA
+
+#endif //_CUDA__STD__CUDA_ENSURE_CURRENT_DEVICE_H
diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/traits.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/cuda_memory_resource/traits.pass.cpp
@@ -16,14 +16,14 @@
 #include <cuda/std/type_traits>
 
 using resource = cuda::mr::cuda_memory_resource;
-static_assert(cuda::std::is_trivial<resource>::value, "");
-static_assert(cuda::std::is_trivially_default_constructible<resource>::value, "");
+static_assert(!cuda::std::is_trivial<resource>::value, "");
+static_assert(!cuda::std::is_trivially_default_constructible<resource>::value, "");
 static_assert(cuda::std::is_trivially_copy_constructible<resource>::value, "");
 static_assert(cuda::std::is_trivially_move_constructible<resource>::value, "");
 static_assert(cuda::std::is_trivially_copy_assignable<resource>::value, "");
 static_assert(cuda::std::is_trivially_move_assignable<resource>::value, "");
 static_assert(cuda::std::is_trivially_destructible<resource>::value, "");
-static_assert(cuda::std::is_empty<resource>::value, "");
+static_assert(!cuda::std::is_empty<resource>::value, "");
 
 int main(int, char**)
 {