Skip to content

Commit

Permalink
Fix stream_ordered_memory_resource attempt to record event in stream …
Browse files Browse the repository at this point in the history
…from another device (#1333)

As described in #1306, stream_ordered_memory_resource maintained a single thread-local event used to synchronize with the per-thread default stream (PTDS). This caused the event to be recorded on the wrong stream when allocations were made from memory resources associated with different devices: the first allocation on the PTDS initialized the thread-local event for that device's stream, and pools on other devices would then reuse that already-initialized event.

This PR adds a new test that only runs on multi-device systems (more precisely, it does nothing on single-device systems). The test creates two per-device pools, then creates and destroys a device buffer on each.

It also fixes `stream_ordered_memory_resource` to store the ID of the device that is current when it is constructed, and then to store a vector of one event per device in TLS rather than a single event. When the PTDS is passed to `get_event`, the event for the MR's stored device ID is used. This should correctly support PTDS with multiple threads and multiple devices (still one MR per device).

The PR also includes some changes to the device ID utilities in `cuda_device.hpp`. There is a new RAII device helper class, and a `get_num_cuda_devices()` function.

Finally, there is a small addition to the .clang-tidy to disable warnings about `do...while` loops inside of RMM error checking macros.

- Fixes #1306

Authors:
  - Mark Harris (https://github.com/harrism)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - Bradley Dice (https://github.com/bdice)
  - Jake Hemstad (https://github.com/jrhemstad)

URL: #1333
  • Loading branch information
harrism authored Sep 13, 2023
1 parent ebd423c commit 2ff1bd0
Show file tree
Hide file tree
Showing 11 changed files with 148 additions and 30 deletions.
2 changes: 2 additions & 0 deletions .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -64,4 +64,6 @@ CheckOptions:
value: '1'
- key: readability-magic-numbers.IgnorePowersOf2IntegerValues
value: '1'
- key: cppcoreguidelines-avoid-do-while.IgnoreMacros
value: 'true'
...
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ DartConfiguration.tcl
.DS_Store
*.manifest
*.spec
compile_commands.json

## Python build directories & artifacts
dist/
Expand Down
31 changes: 26 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -354,11 +354,32 @@ objects for each device and sets them as the per-device resource for that device
```c++
std::vector<unique_ptr<pool_memory_resource>> per_device_pools;
for(int i = 0; i < N; ++i) {
cudaSetDevice(i); // set device i before creating MR
// Use a vector of unique_ptr to maintain the lifetime of the MRs
per_device_pools.push_back(std::make_unique<pool_memory_resource>());
// Set the per-device resource for device i
set_per_device_resource(cuda_device_id{i}, &per_device_pools.back());
cudaSetDevice(i); // set device i before creating MR
// Use a vector of unique_ptr to maintain the lifetime of the MRs
per_device_pools.push_back(std::make_unique<pool_memory_resource>());
// Set the per-device resource for device i
set_per_device_resource(cuda_device_id{i}, &per_device_pools.back());
}
```

Note that the CUDA device that is current when creating a `device_memory_resource` must also be
current any time that `device_memory_resource` is used to deallocate memory, including in a
destructor. This affects RAII classes like `rmm::device_buffer` and `rmm::device_uvector`. Here's an
(incorrect) example that assumes the above example loop has been run to create a
`pool_memory_resource` for each device. A correct example adds a call to `cudaSetDevice(0)` on the
line of the error comment.

```c++
{
RMM_CUDA_TRY(cudaSetDevice(0));
rmm::device_buffer buf_a(16);

{
RMM_CUDA_TRY(cudaSetDevice(1));
rmm::device_buffer buf_b(16);
}

// Error: when buf_a is destroyed, the current device must be 0, but it is 1
}
```

Expand Down
54 changes: 49 additions & 5 deletions include/rmm/cuda_device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,19 +42,63 @@ struct cuda_device_id {
value_type id_;
};

namespace detail {
/**
* @brief Returns a `cuda_device_id` for the current device
*
* The current device is the device on which the calling thread executes device code.
*
* @return `cuda_device_id` for the current device
*/
inline cuda_device_id current_device()
inline cuda_device_id get_current_cuda_device()
{
int dev_id{};
RMM_CUDA_TRY(cudaGetDevice(&dev_id));
cuda_device_id::value_type dev_id{-1};
RMM_ASSERT_CUDA_SUCCESS(cudaGetDevice(&dev_id));
return cuda_device_id{dev_id};
}
} // namespace detail

/**
* @brief Returns the number of CUDA devices in the system
*
* @return Number of CUDA devices in the system
*/
inline int get_num_cuda_devices()
{
cuda_device_id::value_type num_dev{-1};
RMM_ASSERT_CUDA_SUCCESS(cudaGetDeviceCount(&num_dev));
return num_dev;
}

/**
* @brief RAII class that sets the current CUDA device to the specified device on construction
* and restores the previous device on destruction.
*/
struct cuda_set_device_raii {
/**
* @brief Construct a new cuda_set_device_raii object and sets the current CUDA device to `dev_id`
*
* @param dev_id The device to set as the current CUDA device
*/
explicit cuda_set_device_raii(cuda_device_id dev_id)
: old_device_{get_current_cuda_device()}, needs_reset_{old_device_.value() != dev_id.value()}
{
if (needs_reset_) RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(dev_id.value()));
}
/**
* @brief Reactivates the previous CUDA device
*/
~cuda_set_device_raii() noexcept
{
if (needs_reset_) RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(old_device_.value()));
}

cuda_set_device_raii(cuda_set_device_raii const&) = delete;
cuda_set_device_raii& operator=(cuda_set_device_raii const&) = delete;
cuda_set_device_raii(cuda_set_device_raii&&) = delete;
cuda_set_device_raii& operator=(cuda_set_device_raii&&) = delete;

private:
cuda_device_id old_device_;
bool needs_reset_;
};

} // namespace rmm
4 changes: 2 additions & 2 deletions include/rmm/detail/dynamic_load_runtime.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ struct async_alloc {
int cuda_pool_supported{};
auto result = cudaDeviceGetAttribute(&cuda_pool_supported,
cudaDevAttrMemoryPoolsSupported,
rmm::detail::current_device().value());
rmm::get_current_cuda_device().value());
return result == cudaSuccess and cuda_pool_supported == 1;
}()};
return runtime_supports_pool and driver_supports_pool;
Expand All @@ -139,7 +139,7 @@ struct async_alloc {
if (cudaMemHandleTypeNone != handle_type) {
auto const result = cudaDeviceGetAttribute(&supported_handle_types_bitmask,
cudaDevAttrMemoryPoolSupportedHandleTypes,
rmm::detail::current_device().value());
rmm::get_current_cuda_device().value());

// Don't throw on cudaErrorInvalidValue
auto const unsupported_runtime = (result == cudaErrorInvalidValue);
Expand Down
2 changes: 1 addition & 1 deletion include/rmm/mr/device/cuda_async_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ class cuda_async_memory_resource final : public device_memory_resource {
RMM_EXPECTS(rmm::detail::async_alloc::is_export_handle_type_supported(pool_props.handleTypes),
"Requested IPC memory handle type not supported");
pool_props.location.type = cudaMemLocationTypeDevice;
pool_props.location.id = rmm::detail::current_device().value();
pool_props.location.id = rmm::get_current_cuda_device().value();
cudaMemPool_t cuda_pool_handle{};
RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolCreate(&cuda_pool_handle, &pool_props));
pool_ = cuda_async_view_memory_resource{cuda_pool_handle};
Expand Down
2 changes: 1 addition & 1 deletion include/rmm/mr/device/cuda_async_view_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ class cuda_async_view_memory_resource final : public device_memory_resource {
}()}
{
// Check if cudaMallocAsync Memory pool supported
auto const device = rmm::detail::current_device();
auto const device = rmm::get_current_cuda_device();
int cuda_pool_supported{};
auto result =
cudaDeviceGetAttribute(&cuda_pool_supported, cudaDevAttrMemoryPoolsSupported, device.value());
Expand Down
35 changes: 23 additions & 12 deletions include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,16 @@
*/
#pragma once

#include <rmm/cuda_device.hpp>
#include <rmm/detail/aligned.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/logger.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

#include <fmt/core.h>

#include <cuda_runtime_api.h>

#include <fmt/core.h>

#include <cstddef>
#include <functional>
#include <limits>
Expand Down Expand Up @@ -288,17 +289,25 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
stream_event_pair get_event(cuda_stream_view stream)
{
if (stream.is_per_thread_default()) {
// Create a thread-local shared event wrapper. Shared pointers in the thread and in each MR
// instance ensures it is destroyed cleaned up only after all are finished with it.
thread_local auto event_tls = std::make_shared<event_wrapper>();
default_stream_events.insert(event_tls);
return stream_event_pair{stream.value(), event_tls->event};
// Create a thread-local shared event wrapper for each device. Shared pointers in the thread
// and in each MR instance ensure the wrappers are destroyed only after all are finished
// with them.
thread_local std::vector<std::shared_ptr<event_wrapper>> events_tls(
rmm::get_num_cuda_devices());
auto event = [&, device_id = this->device_id_]() {
if (events_tls[device_id.value()]) { return events_tls[device_id.value()]->event; }

auto event = std::make_shared<event_wrapper>();
this->default_stream_events.insert(event);
return (events_tls[device_id.value()] = std::move(event))->event;
}();
return stream_event_pair{stream.value(), event};
}
// We use cudaStreamLegacy as the event map key for the default stream for consistency between
// PTDS and non-PTDS mode. In PTDS mode, the cudaStreamLegacy map key will only exist if the
// user explicitly passes it, so it is used as the default location for the free list
// at construction. For consistency, the same key is used for null stream free lists in non-PTDS
// mode.
// at construction. For consistency, the same key is used for null stream free lists in
// non-PTDS mode.
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-cstyle-cast)
auto* const stream_to_store = stream.is_default() ? cudaStreamLegacy : stream.value();

Expand Down Expand Up @@ -496,11 +505,13 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
// bidirectional mapping between non-default streams and events
std::unordered_map<cudaStream_t, stream_event_pair> stream_events_;

// shared pointers to events keeps the events alive as long as either the thread that created them
// or the MR that is using them exists.
// shared pointers to events keeps the events alive as long as either the thread that created
// them or the MR that is using them exists.
std::set<std::shared_ptr<event_wrapper>> default_stream_events;

std::mutex mtx_; // mutex for thread-safe access
}; // namespace detail

rmm::cuda_device_id device_id_{rmm::get_current_cuda_device()};
}; // namespace detail

} // namespace rmm::mr::detail
4 changes: 2 additions & 2 deletions include/rmm/mr/device/per_device_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ inline device_memory_resource* set_per_device_resource(cuda_device_id device_id,
*/
inline device_memory_resource* get_current_device_resource()
{
return get_per_device_resource(rmm::detail::current_device());
return get_per_device_resource(rmm::get_current_cuda_device());
}

/**
Expand Down Expand Up @@ -231,6 +231,6 @@ inline device_memory_resource* get_current_device_resource()
*/
inline device_memory_resource* set_current_device_resource(device_memory_resource* new_mr)
{
return set_per_device_resource(rmm::detail::current_device(), new_mr);
return set_per_device_resource(rmm::get_current_cuda_device(), new_mr);
}
} // namespace rmm::mr
4 changes: 2 additions & 2 deletions tests/mr/device/cuda_async_view_mr_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ TEST(PoolTest, UsePool)
{
cudaMemPool_t memPool{};
RMM_CUDA_TRY(rmm::detail::async_alloc::cudaDeviceGetDefaultMemPool(
&memPool, rmm::detail::current_device().value()));
&memPool, rmm::get_current_cuda_device().value()));

const auto pool_init_size{100};
cuda_async_view_mr mr{memPool};
Expand All @@ -44,7 +44,7 @@ TEST(PoolTest, NotTakingOwnershipOfPool)
{
cudaMemPoolProps poolProps = {};
poolProps.allocType = cudaMemAllocationTypePinned;
poolProps.location.id = rmm::detail::current_device().value();
poolProps.location.id = rmm::get_current_cuda_device().value();
poolProps.location.type = cudaMemLocationTypeDevice;

cudaMemPool_t memPool{};
Expand Down
39 changes: 39 additions & 0 deletions tests/mr/device/pool_mr_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@
* limitations under the License.
*/

#include <rmm/cuda_device.hpp>
#include <rmm/detail/aligned.hpp>
#include <rmm/detail/cuda_util.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/limiting_resource_adaptor.hpp>
Expand Down Expand Up @@ -150,5 +152,42 @@ TEST(PoolTest, UpstreamDoesntSupportMemInfo)
mr2.deallocate(ptr, 1024);
}

// Regression test: per-device pools with PTDS must record events on the correct device's
// stream. Creates a pool per device, then creates/destroys a device_buffer on each.
TEST(PoolTest, MultidevicePool)
{
  using MemoryResource = rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource>;

  // This test is a no-op on single-device systems.
  auto const num_devices = rmm::get_num_cuda_devices();
  if (num_devices < 2) { return; }

  rmm::mr::cuda_memory_resource general_mr;

  // Create one fixed-size pool per device and register it as that device's resource.
  constexpr int num_pools         = 2;
  constexpr std::size_t pool_size = 1024;
  std::vector<std::shared_ptr<MemoryResource>> mrs;

  for (int dev = 0; dev < num_pools; ++dev) {
    RMM_CUDA_TRY(cudaSetDevice(dev));
    auto mr = std::make_shared<MemoryResource>(&general_mr, pool_size, pool_size);
    rmm::mr::set_per_device_resource(rmm::cuda_device_id{dev}, mr.get());
    mrs.push_back(std::move(mr));
  }

  {
    RMM_CUDA_TRY(cudaSetDevice(0));
    rmm::device_buffer buf_a(16, rmm::cuda_stream_per_thread, mrs[0].get());

    {
      RMM_CUDA_TRY(cudaSetDevice(1));
      rmm::device_buffer buf_b(16, rmm::cuda_stream_per_thread, mrs[1].get());
    }

    // buf_a must be destroyed while device 0 (its creation device) is current.
    RMM_CUDA_TRY(cudaSetDevice(0));
  }
}

} // namespace
} // namespace rmm::test

0 comments on commit 2ff1bd0

Please sign in to comment.