Skip to content

Commit

Permalink
Refactor rmm::device_scalar in terms of rmm::device_uvector (#789)
Browse files Browse the repository at this point in the history
This PR refactors `device_scalar` to use a  single-element `device_uvector` for its storage. This simplifies the implementation of device_scalar. Also changes the API of `device_scalar` so that its asynchronous / stream-ordered methods use the same API style (with explicit stream parameter) as `device_uvector` and `device_buffer`.

Closes #570

This is a breaking change. When it is merged, PRs are likely to need to be merged immediately in other libraries to account for the API changes. 

 - [x] cuDF: rapidsai/cudf#8411
 - [x] cuGraph: rapidsai/cugraph#1637
 - [x] RAFT: rapidsai/raft#259  
 - [x] ~cuML~ (unused)
 - [x] ~cuSpatial~ (unused)

Authors:
  - Mark Harris (https://github.com/harrism)

Approvers:
  - Rong Ou (https://github.com/rongou)
  - Jake Hemstad (https://github.com/jrhemstad)

URL: #789
  • Loading branch information
harrism authored Jun 8, 2021
1 parent e2832e3 commit aa2a2f3
Show file tree
Hide file tree
Showing 4 changed files with 154 additions and 185 deletions.
194 changes: 55 additions & 139 deletions include/rmm/device_scalar.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
#pragma once

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

Expand All @@ -37,6 +37,35 @@ class device_scalar {
public:
static_assert(std::is_trivially_copyable<T>::value, "Scalar type must be trivially copyable");

using value_type = typename device_uvector<T>::value_type;
using reference = typename device_uvector<T>::reference;
using const_reference = typename device_uvector<T>::const_reference;
using pointer = typename device_uvector<T>::pointer;
using const_pointer = typename device_uvector<T>::const_pointer;

/**
 * @brief Default destructor. The owned `device_uvector` releases the device allocation.
 */
RMM_EXEC_CHECK_DISABLE
~device_scalar() = default;

/**
 * @brief Default move constructor; transfers ownership of the device storage.
 */
RMM_EXEC_CHECK_DISABLE
device_scalar(device_scalar &&) = default;

/**
 * @brief Default move assignment; transfers ownership of the device storage.
 */
device_scalar &operator=(device_scalar &&) = default;

/**
 * @brief Copy ctor is deleted as it doesn't allow a stream argument
 */
device_scalar(device_scalar const &) = delete;

/**
 * @brief Copy assignment is deleted as it doesn't allow a stream argument
 */
device_scalar &operator=(device_scalar const &) = delete;

/**
 * @brief Default constructor is deleted as it doesn't allow a stream argument
 */
device_scalar() = delete;

/**
* @brief Construct a new uninitialized `device_scalar`.
*
Expand All @@ -54,7 +83,7 @@ class device_scalar {
explicit device_scalar(
cuda_stream_view stream,
rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
: buffer{sizeof(T), stream, mr}
: _storage{1, stream, mr}
{
}

Expand All @@ -75,12 +104,12 @@ class device_scalar {
* @param mr Optional, resource with which to allocate.
*/
explicit device_scalar(
T const &initial_value,
cuda_stream_view stream = cuda_stream_view{},
value_type const &initial_value,
cuda_stream_view stream,
rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
: buffer{sizeof(T), stream, mr}
: _storage{1, stream, mr}
{
set_value(initial_value, stream);
set_value_async(initial_value, stream);
}

/**
Expand All @@ -96,9 +125,9 @@ class device_scalar {
* @param mr The resource to use for allocating the new `device_scalar`
*/
device_scalar(device_scalar const &other,
cuda_stream_view stream = {},
cuda_stream_view stream,
rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
: buffer{other.buffer, stream, mr}
: _storage{other._storage, stream, mr}
{
}

Expand All @@ -118,31 +147,25 @@ class device_scalar {
* @return T The value of the scalar.
* @param stream CUDA stream on which to perform the copy and synchronize.
*/
T value(cuda_stream_view stream = cuda_stream_view{}) const
{
T host_value{};
_memcpy(&host_value, buffer.data(), stream);
stream.synchronize();
return host_value;
}
value_type value(cuda_stream_view stream) const
{
  // Delegate the device-to-host copy (and any synchronization it performs) to
  // device_uvector::front_element, which reads the scalar's single element.
  return _storage.front_element(stream);
}

/**
* @brief Sets the value of the `device_scalar` to the given `host_value`.
* @brief Sets the value of the `device_scalar` to the value of `v`.
*
* This specialization for fundamental types is optimized to use `cudaMemsetAsync` when
* `host_value` is zero.
* `v` is zero.
*
* @note If the stream specified to this function is different from the stream specified
* to the constructor, then appropriate dependencies must be inserted between the streams
* (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling
* this function, otherwise there may be a race condition.
*
* This function does not synchronize `stream` before returning. Therefore, the object
* referenced by `host_value` should not be destroyed or modified until `stream` has been
* referenced by `v` should not be destroyed or modified until `stream` has been
* synchronized. Otherwise, behavior is undefined.
*
* @note: This function incurs a host to device memcpy or device memset and should be used
* sparingly.
* carefully.
*
* Example:
* \code{cpp}
Expand All @@ -151,7 +174,7 @@ class device_scalar {
* int v{42};
*
* // Copies 42 to device storage on `stream`. Does _not_ synchronize
* vec.set_value(v, stream);
* s.set_value_async(v, stream);
* ...
* cudaStreamSynchronize(stream);
* // Synchronization is required before `v` can be modified
Expand All @@ -160,112 +183,20 @@ class device_scalar {
*
* @throws `rmm::cuda_error` if copying `host_value` to device memory fails.
*
* @param host_value The host value which will be copied to device
* @param v The host value which will be copied to device
* @param stream CUDA stream on which to perform the copy
*/
template <typename U = T>
auto set_value(U const &host_value, cuda_stream_view stream = cuda_stream_view{})
-> std::enable_if_t<std::is_fundamental<U>::value && not std::is_same<U, bool>::value, void>
void set_value_async(value_type const &v, cuda_stream_view s)
{
if (host_value == U{0}) {
set_value_zero(stream);
} else {
_memcpy(buffer.data(), &host_value, stream);
}
}

/**
* @brief Sets the value of the `device_scalar` to the given `host_value`.
*
* This specialization for `bool` is optimized to always use `cudaMemsetAsync`.
*
* @note If the stream specified to this function is different from the stream specified
* to the constructor, then appropriate dependencies must be inserted between the streams
* (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling
* this function, otherwise there may be a race condition.
*
* This function does not synchronize `stream` before returning. `host_value` is passed by value
* so a host-side copy may be performed before calling a device memset.
*
* @note: This function incurs a device memset.
*
* Example:
* \code{cpp}
* rmm::device_scalar<bool> s;
*
* bool v{true};
*
* // Copies `true` to device storage on `stream`. Does _not_ synchronize
* vec.set_value(v, stream);
* ...
* cudaStreamSynchronize(stream);
* // Synchronization is required before `v` can be modified
* v = false;
* \endcode
*
* @throws `rmm::cuda_error` if the device memset fails.
*
* @param host_value The host value which the scalar will be set to (true or false)
* @param stream CUDA stream on which to perform the device memset
*/
template <typename U = T>
auto set_value(U const &host_value, cuda_stream_view stream = cuda_stream_view{})
-> std::enable_if_t<std::is_same<U, bool>::value, void>
{
RMM_CUDA_TRY(cudaMemsetAsync(buffer.data(), host_value, sizeof(bool), stream.value()));
}

/**
* @brief Sets the value of the `device_scalar` to the given `host_value`.
*
* Specialization for non-fundamental types.
*
* @note If the stream specified to this function is different from the stream specified
* to the constructor, then appropriate dependencies must be inserted between the streams
* (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling
* this function, otherwise there may be a race condition.
*
* This function does not synchronize `stream` before returning. Therefore, the object
* referenced by `host_value` should not be destroyed or modified until `stream` has been
* synchronized. Otherwise, behavior is undefined.
*
* @note: This function incurs a host to device memcpy and should be used sparingly.
* Example:
* \code{cpp}
* rmm::device_scalar<my_type> s;
*
* my_type v{42, "text"};
*
* // Copies 42 to device storage on `stream`. Does _not_ synchronize
* vec.set_value(v, stream);
* ...
* cudaStreamSynchronize(stream);
* // Synchronization is required before `v` can be modified
* v.value = 21;
* \endcode
*
* @throws `rmm::cuda_error` if copying `host_value` to device memory fails
* @throws `rmm::cuda_error` if synchronizing `stream` fails
*
* @param host_value The host value which will be copied to device
* @param stream CUDA stream on which to perform the copy
*/
template <typename U = T>
auto set_value(T const &host_value, cuda_stream_view stream = cuda_stream_view{})
-> std::enable_if_t<not std::is_fundamental<U>::value, void>
{
_memcpy(buffer.data(), &host_value, stream);
_storage.set_element_async(0, v, s);
}

// Disallow passing literals to set_value to avoid race conditions where the memory holding the
// literal can be freed before the async memcpy / memset executes.
void set_value(T &&host_value, cuda_stream_view stream = cuda_stream_view{}) = delete;
void set_value_async(value_type &&, cuda_stream_view) = delete;

/**
* @brief Sets the value of the `device_scalar` to zero.
*
* Only supported for fundamental types.
* @brief Sets the value of the `device_scalar` to zero on the specified stream.
*
* @note If the stream specified to this function is different from the stream specified
* to the constructor, then appropriate dependencies must be inserted between the streams
Expand All @@ -274,17 +205,13 @@ class device_scalar {
*
* This function does not synchronize `stream` before returning.
*
* @note: This function incurs a device memset and should be used sparingly.
*
* @throws `rmm::cuda_error` if the device memset fails.
* @note: This function incurs a device memset and should be used carefully.
*
* @param stream CUDA stream on which to perform the device memset
* @param stream CUDA stream on which to perform the device memset
*/
template <typename U = T>
auto set_value_zero(cuda_stream_view stream = cuda_stream_view{})
-> std::enable_if_t<std::is_fundamental<U>::value, void>
void set_value_to_zero_async(cuda_stream_view s)
{
  // BUGFIX: device_uvector::set_element_to_zero_async takes the *index* of the
  // element to zero, not a value. `value_type{0}` would fail to compile for
  // non-integral T (only trivial copyability is required of T) and is wrong even
  // where it happens to convert. The scalar's single element lives at index 0.
  _storage.set_element_to_zero_async(0, s);
}

/**
Expand All @@ -295,7 +222,7 @@ class device_scalar {
* streams (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`), otherwise there may
* be a race condition.
*/
T *data() noexcept { return static_cast<T *>(buffer.data()); }
pointer data() noexcept
{
  // static_cast makes the conversion to the class's declared pointer alias explicit.
  return static_cast<pointer>(_storage.data());
}

/**
* @brief Returns const pointer to object in device memory.
Expand All @@ -305,20 +232,9 @@ class device_scalar {
* streams (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`), otherwise there may
* be a race condition.
*/
T const *data() const noexcept { return static_cast<T const *>(buffer.data()); }

device_scalar() = default;
~device_scalar() = default;
device_scalar(device_scalar &&) = default;
device_scalar &operator=(device_scalar const &) = delete;
device_scalar &operator=(device_scalar &&) = delete;
const_pointer data() const noexcept
{
  // static_cast makes the conversion to the class's declared pointer alias explicit.
  return static_cast<const_pointer>(_storage.data());
}

private:
rmm::device_buffer buffer{sizeof(T), cuda_stream_default};

inline void _memcpy(void *dst, const void *src, cuda_stream_view stream) const
{
RMM_CUDA_TRY(cudaMemcpyAsync(dst, src, sizeof(T), cudaMemcpyDefault, stream.value()));
}
rmm::device_uvector<T> _storage;
};
} // namespace rmm
Loading

0 comments on commit aa2a2f3

Please sign in to comment.