
Fix stream_ordered_memory_resource attempt to record event in stream from another device #1333

Merged · 14 commits · Sep 13, 2023
2 changes: 2 additions & 0 deletions .clang-tidy
@@ -64,4 +64,6 @@ CheckOptions:
    value: '1'
  - key: readability-magic-numbers.IgnorePowersOf2IntegerValues
    value: '1'
  - key: cppcoreguidelines-avoid-do-while.IgnoreMacros
    value: 'true'
...
1 change: 1 addition & 0 deletions .gitignore
@@ -14,6 +14,7 @@ DartConfiguration.tcl
.DS_Store
*.manifest
*.spec
compile_commands.json

## Python build directories & artifacts
dist/
31 changes: 26 additions & 5 deletions README.md
@@ -354,11 +354,32 @@ objects for each device and sets them as the per-device resource for that device
```c++
std::vector<unique_ptr<pool_memory_resource>> per_device_pools;
for(int i = 0; i < N; ++i) {
  cudaSetDevice(i); // set device i before creating MR
  // Use a vector of unique_ptr to maintain the lifetime of the MRs
  per_device_pools.push_back(std::make_unique<pool_memory_resource>());
  // Set the per-device resource for device i
  set_per_device_resource(cuda_device_id{i}, per_device_pools.back().get());
}
```

Note that the CUDA device that is current when creating a `device_memory_resource` must also be
current any time that `device_memory_resource` is used to deallocate memory, including in a
destructor. This affects RAII classes like `rmm::device_buffer` and `rmm::device_uvector`. Here's an
(incorrect) example that assumes the above example loop has been run to create a
`pool_memory_resource` for each device. A correct example adds a call to `cudaSetDevice(0)` on the
line of the error comment.

```c++
{
  RMM_CUDA_TRY(cudaSetDevice(0));
  rmm::device_buffer buf_a(16);

  {
    RMM_CUDA_TRY(cudaSetDevice(1));
    rmm::device_buffer buf_b(16);
  }

  // Error: when buf_a is destroyed, the current device must be 0, but it is 1
}
```
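For concreteness, the corrected form described above (restoring device 0 before `buf_a`'s destructor runs) would look like the following sketch; it assumes the per-device pools from the earlier loop are already installed:

```c++
{
  RMM_CUDA_TRY(cudaSetDevice(0));
  rmm::device_buffer buf_a(16);

  {
    RMM_CUDA_TRY(cudaSetDevice(1));
    rmm::device_buffer buf_b(16);
  }

  // Restore device 0 so that buf_a's destructor deallocates on the correct device
  RMM_CUDA_TRY(cudaSetDevice(0));
}
```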

54 changes: 49 additions & 5 deletions include/rmm/cuda_device.hpp
@@ -42,19 +42,63 @@ struct cuda_device_id {
  value_type id_;
};

-namespace detail {
 /**
  * @brief Returns a `cuda_device_id` for the current device
  *
  * The current device is the device on which the calling thread executes device code.
  *
  * @return `cuda_device_id` for the current device
  */
-inline cuda_device_id current_device()
+inline cuda_device_id get_current_cuda_device()
 {
-  int dev_id{};
-  RMM_CUDA_TRY(cudaGetDevice(&dev_id));
+  cuda_device_id::value_type dev_id{-1};
+  RMM_ASSERT_CUDA_SUCCESS(cudaGetDevice(&dev_id));
   return cuda_device_id{dev_id};
 }
-}  // namespace detail

/**
* @brief Returns the number of CUDA devices in the system
*
* @return Number of CUDA devices in the system
*/
inline int get_num_cuda_devices()
{
  cuda_device_id::value_type num_dev{-1};
  RMM_ASSERT_CUDA_SUCCESS(cudaGetDeviceCount(&num_dev));
  return num_dev;
}

/**
* @brief RAII class that sets the current CUDA device to the specified device on construction
* and restores the previous device on destruction.
*/
struct cuda_set_device_raii {
  /**
   * @brief Construct a new cuda_set_device_raii object and sets the current CUDA device to `dev_id`
   *
   * @param dev_id The device to set as the current CUDA device
   */
  explicit cuda_set_device_raii(cuda_device_id dev_id)
    : old_device_{get_current_cuda_device()}, needs_reset_{old_device_.value() != dev_id.value()}
  {
    if (needs_reset_) RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(dev_id.value()));
  }
  /**
   * @brief Reactivates the previous CUDA device
   */
  ~cuda_set_device_raii() noexcept
  {
    if (needs_reset_) RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(old_device_.value()));
  }

  cuda_set_device_raii(cuda_set_device_raii const&)            = delete;
  cuda_set_device_raii& operator=(cuda_set_device_raii const&) = delete;
  cuda_set_device_raii(cuda_set_device_raii&&)                 = delete;
  cuda_set_device_raii& operator=(cuda_set_device_raii&&)      = delete;

 private:
  cuda_device_id old_device_;
  bool needs_reset_;
};

} // namespace rmm
4 changes: 2 additions & 2 deletions include/rmm/detail/dynamic_load_runtime.hpp
@@ -115,7 +115,7 @@ struct async_alloc {
    int cuda_pool_supported{};
    auto result = cudaDeviceGetAttribute(&cuda_pool_supported,
                                         cudaDevAttrMemoryPoolsSupported,
-                                        rmm::detail::current_device().value());
+                                        rmm::get_current_cuda_device().value());
    return result == cudaSuccess and cuda_pool_supported == 1;
  }()};
  return runtime_supports_pool and driver_supports_pool;
@@ -139,7 +139,7 @@ struct async_alloc {
  if (cudaMemHandleTypeNone != handle_type) {
    auto const result = cudaDeviceGetAttribute(&supported_handle_types_bitmask,
                                               cudaDevAttrMemoryPoolSupportedHandleTypes,
-                                              rmm::detail::current_device().value());
+                                              rmm::get_current_cuda_device().value());

    // Don't throw on cudaErrorInvalidValue
    auto const unsupported_runtime = (result == cudaErrorInvalidValue);
2 changes: 1 addition & 1 deletion include/rmm/mr/device/cuda_async_memory_resource.hpp
@@ -98,7 +98,7 @@ class cuda_async_memory_resource final : public device_memory_resource {
    RMM_EXPECTS(rmm::detail::async_alloc::is_export_handle_type_supported(pool_props.handleTypes),
                "Requested IPC memory handle type not supported");
    pool_props.location.type = cudaMemLocationTypeDevice;
-   pool_props.location.id = rmm::detail::current_device().value();
+   pool_props.location.id = rmm::get_current_cuda_device().value();
    cudaMemPool_t cuda_pool_handle{};
    RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolCreate(&cuda_pool_handle, &pool_props));
    pool_ = cuda_async_view_memory_resource{cuda_pool_handle};
@@ -60,7 +60,7 @@ class cuda_async_view_memory_resource final : public device_memory_resource {
    }()}
  {
    // Check if cudaMallocAsync Memory pool supported
-   auto const device = rmm::detail::current_device();
+   auto const device = rmm::get_current_cuda_device();
    int cuda_pool_supported{};
    auto result =
      cudaDeviceGetAttribute(&cuda_pool_supported, cudaDevAttrMemoryPoolsSupported, device.value());
35 changes: 23 additions & 12 deletions include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp
@@ -15,15 +15,16 @@
*/
#pragma once

#include <rmm/cuda_device.hpp>
#include <rmm/detail/aligned.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/logger.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

#include <fmt/core.h>

#include <cuda_runtime_api.h>

#include <cstddef>
#include <functional>
#include <limits>
@@ -288,17 +289,25 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
  stream_event_pair get_event(cuda_stream_view stream)
  {
    if (stream.is_per_thread_default()) {
-      // Create a thread-local shared event wrapper. Shared pointers in the thread and in each MR
-      // instance ensures it is destroyed cleaned up only after all are finished with it.
-      thread_local auto event_tls = std::make_shared<event_wrapper>();
-      default_stream_events.insert(event_tls);
-      return stream_event_pair{stream.value(), event_tls->event};
+      // Create a thread-local shared event wrapper for each device. Shared pointers in the thread
+      // and in each MR instance ensure the wrappers are destroyed only after all are finished
+      // with them.
+      thread_local std::vector<std::shared_ptr<event_wrapper>> events_tls(
+        rmm::get_num_cuda_devices());
+      auto event = [&, device_id = this->device_id_]() {
+        if (events_tls[device_id.value()]) { return events_tls[device_id.value()]->event; }
+
+        auto event = std::make_shared<event_wrapper>();
+        this->default_stream_events.insert(event);
+        return (events_tls[device_id.value()] = std::move(event))->event;
+      }();
+      return stream_event_pair{stream.value(), event};
    }
    // We use cudaStreamLegacy as the event map key for the default stream for consistency between
    // PTDS and non-PTDS mode. In PTDS mode, the cudaStreamLegacy map key will only exist if the
    // user explicitly passes it, so it is used as the default location for the free list
    // at construction. For consistency, the same key is used for null stream free lists in
    // non-PTDS mode.
    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-cstyle-cast)
    auto* const stream_to_store = stream.is_default() ? cudaStreamLegacy : stream.value();

@@ -496,11 +505,13 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
  // bidirectional mapping between non-default streams and events
  std::unordered_map<cudaStream_t, stream_event_pair> stream_events_;

  // shared pointers to events keeps the events alive as long as either the thread that created
  // them or the MR that is using them exists.
  std::set<std::shared_ptr<event_wrapper>> default_stream_events;

  std::mutex mtx_;  // mutex for thread-safe access

+ rmm::cuda_device_id device_id_{rmm::get_current_cuda_device()};
};  // namespace detail

} // namespace rmm::mr::detail
4 changes: 2 additions & 2 deletions include/rmm/mr/device/per_device_resource.hpp
@@ -202,7 +202,7 @@ inline device_memory_resource* set_per_device_resource(cuda_device_id device_id,
*/
inline device_memory_resource* get_current_device_resource()
{
-  return get_per_device_resource(rmm::detail::current_device());
+  return get_per_device_resource(rmm::get_current_cuda_device());
}

/**
 */
@@ -231,6 +231,6 @@ inline device_memory_resource* get_current_device_resource()
inline device_memory_resource* set_current_device_resource(device_memory_resource* new_mr)
{
-  return set_per_device_resource(rmm::detail::current_device(), new_mr);
+  return set_per_device_resource(rmm::get_current_cuda_device(), new_mr);
}
} // namespace rmm::mr
4 changes: 2 additions & 2 deletions tests/mr/device/cuda_async_view_mr_tests.cpp
@@ -31,7 +31,7 @@ TEST(PoolTest, UsePool)
{
  cudaMemPool_t memPool{};
  RMM_CUDA_TRY(rmm::detail::async_alloc::cudaDeviceGetDefaultMemPool(
-    &memPool, rmm::detail::current_device().value()));
+    &memPool, rmm::get_current_cuda_device().value()));

  const auto pool_init_size{100};
  cuda_async_view_mr mr{memPool};
@@ -44,7 +44,7 @@ TEST(PoolTest, NotTakingOwnershipOfPool)
{
  cudaMemPoolProps poolProps = {};
  poolProps.allocType = cudaMemAllocationTypePinned;
-  poolProps.location.id = rmm::detail::current_device().value();
+  poolProps.location.id = rmm::get_current_cuda_device().value();
  poolProps.location.type = cudaMemLocationTypeDevice;

  cudaMemPool_t memPool{};
39 changes: 39 additions & 0 deletions tests/mr/device/pool_mr_tests.cpp
@@ -14,10 +14,12 @@
* limitations under the License.
*/

#include <rmm/cuda_device.hpp>
#include <rmm/detail/aligned.hpp>
#include <rmm/detail/cuda_util.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/limiting_resource_adaptor.hpp>
@@ -150,5 +152,42 @@ TEST(PoolTest, UpstreamDoesntSupportMemInfo)
  mr2.deallocate(ptr, 1024);
}

TEST(PoolTest, MultidevicePool)
{
  using MemoryResource = rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource>;

  // Get the number of cuda devices
  int num_devices = rmm::get_num_cuda_devices();

  // only run on multidevice systems
  if (num_devices >= 2) {
    rmm::mr::cuda_memory_resource general_mr;

    // initializing pool_memory_resource of multiple devices
    int devices      = 2;
    size_t pool_size = 1024;
    std::vector<std::shared_ptr<MemoryResource>> mrs;

    for (int i = 0; i < devices; ++i) {
      RMM_CUDA_TRY(cudaSetDevice(i));
      auto mr = std::make_shared<MemoryResource>(&general_mr, pool_size, pool_size);
      rmm::mr::set_per_device_resource(rmm::cuda_device_id{i}, mr.get());
      mrs.emplace_back(mr);
    }

    {
      RMM_CUDA_TRY(cudaSetDevice(0));
      rmm::device_buffer buf_a(16, rmm::cuda_stream_per_thread, mrs[0].get());

      {
        RMM_CUDA_TRY(cudaSetDevice(1));
        rmm::device_buffer buf_b(16, rmm::cuda_stream_per_thread, mrs[1].get());
      }

      RMM_CUDA_TRY(cudaSetDevice(0));
Review thread (attached to the line above):

wence- (Contributor) commented on Aug 31, 2023:

If the user forgets to call cudaSetDevice(0) here, then ~buf_a() runs with device one's context active. In that case, with both rmm::cuda_stream_default and rmm::cuda_stream_per_thread, the context attached to the stream will not match the context that was active when the event was created.

Do we want to handle that case transparently in this PR? I think "no" is fine, but if not, we should add some documentation to the pool memory resource noting that in a multi-device setting it is the user's responsibility to ensure that destructors run with the current device set to the device of the allocating MR.

I think the same comment also applies to allocation. Edit: not necessary, since the docs already make using an MR with a different device active UB.

Do we want to support the pattern below? Edit: no; using a memory resource created with device zero active and then using it with device one active is explicitly called out as undefined behaviour in the documentation.

  {
    cudaSetDevice(0);
    {
      cudaSetDevice(1);
      rmm::device_uvector<int> a(10, rmm::cuda_stream_default, mrs[0].get());
      rmm::device_uvector<int> b(20, rmm::cuda_stream_default, mrs[1].get());
    }
    cudaSetDevice(0);
    rmm::device_uvector<int> c{0, rmm::cuda_stream_default, mrs[0].get()};
  }

Note the use of cuda_stream_default (rather than stream_per_thread). Here the user has first used the MR created with device zero active to allocate a buffer with device one active. So the first time through, the stream-event map is populated with an event that was created with the device-one context active. That event is then picked up when c is destructed, but now the device-zero context is active and we get an error again.

Again, I think it's fine to punt on fixing this in this PR, but we should at least document the restrictions on the MR object (which in this case would be that allocation must run with the device the MR was created on active). Edit: this is mentioned as UB in the docs.

harrism (Member, Author) commented:

I'd like to spend some time thinking about this. We may want our RAII data structures to ensure they activate the device they were allocated on in their destructors...

Or we may not. That may force performance costs on the user under the covers when they might not need the help.

wence- (Contributor) commented:

Thanks. Given your comment below about UB in the docs, I've updated my comment to answer some of the queries: we don't need to handle device switching in allocation, and my usage example is invalid.

wence- (Contributor) commented:

> Or we may not. That may force performance costs on the user under the covers when they might not need the help.

Yeah, I am not sure which side of the fence I am on here.

harrism (Member, Author) commented:

After thinking about this and discussing with @jrhemstad, I am inclined to leave this as UB for now. I don't think we want to be switching devices back and forth on every mr->deallocate(). I'm not confident that this can be done without a large impact on performance, and there will be situations where the user has more information and can optimize device switching.

If it becomes a problem for containers, perhaps we can add it at the allocator level rather than the memory_resource level.

wence- (Contributor) commented:

Thanks, this makes sense.

Suggestion (non-blocking): shall we add a sentence explicitly mentioning destructors to the "Multiple devices" section of the documentation? Perhaps after the existing first paragraph:

> A device_memory_resource should only be used when the active CUDA device is the same device that was active when the device_memory_resource was created. Otherwise, behavior is undefined.

"Note that this means that destructors must run with the same CUDA device active as was used when creating their associated memory resource."

WDYT?

harrism (Member, Author) commented:

Yes. Let me add something along these lines.

harrism (Member, Author) commented:

I added doc and example.
    }
  }
}

} // namespace
} // namespace rmm::test