Add a stack to the statistics resource #1563

Merged

Commits (50)
6cf4ee7
use std::shared_mutex
madsbk May 16, 2024
d13ee6b
clean up
madsbk May 16, 2024
25ff814
impl. push_counters() and pop_counters()
madsbk May 16, 2024
b453393
python bindings
madsbk May 16, 2024
1f7daa1
python tests
madsbk May 16, 2024
d6fd147
doc
madsbk May 17, 2024
fc49fe9
test_statistics
madsbk May 17, 2024
b9d57db
current allocation statistics
madsbk May 17, 2024
0e2f19c
clean up
madsbk May 17, 2024
7f7f940
Context to enable allocation statistics
madsbk May 17, 2024
68fcd08
Apply suggestions from code review
madsbk May 23, 2024
0254b73
doc
madsbk May 23, 2024
e6dd682
add_counters_from_tracked_sub_block
madsbk May 23, 2024
bf49dab
c++ tests
madsbk May 23, 2024
aef3b9f
Merge branch 'branch-24.06' into statistics_resource_counters_stack
madsbk May 23, 2024
145cb96
Merge branch 'branch-24.06' of github.com:rapidsai/rmm into statistic…
madsbk May 24, 2024
9bd1c2e
Merge branch 'branch-24.08' of github.com:rapidsai/rmm into statistic…
madsbk May 24, 2024
cef74e5
Merge branch 'branch-24.08' of github.com:rapidsai/rmm into statistic…
madsbk May 27, 2024
04b39cb
use dataclass Statistics
madsbk May 27, 2024
1dbd022
memory profiler
madsbk May 27, 2024
badbb56
clean up
madsbk May 27, 2024
6f35d23
fix typo
madsbk May 27, 2024
d8ee633
descriptive name
madsbk May 27, 2024
2df77d1
default_profiler_records
madsbk May 27, 2024
89827ad
tracking_resource_adaptor: use std::shared_mutex
madsbk May 28, 2024
24157b5
fix pytorch test
madsbk May 28, 2024
8df2d0a
pretty_print: added memory units
madsbk May 28, 2024
a82a7b6
doc
madsbk May 28, 2024
2b4b7d3
profiler: accept name argument
madsbk May 28, 2024
0067c05
profiler: now also a context manager
madsbk May 28, 2024
ab97d2a
cleanup
madsbk May 28, 2024
189ca30
pretty_print: output format
madsbk May 28, 2024
6debd83
fix doc build
madsbk May 28, 2024
796e159
Apply suggestions from code review
madsbk May 29, 2024
d2d64a1
style clean up
madsbk May 29, 2024
394d39f
doc
madsbk May 29, 2024
499c173
rename Data => MemoryRecord
madsbk May 29, 2024
463172d
rename pretty_print => report
madsbk May 29, 2024
c11b1c5
ruff check --fix --select D400
madsbk May 29, 2024
3d929d6
report: style
madsbk May 29, 2024
62a3870
doc
madsbk May 29, 2024
8d71415
spelling
madsbk May 30, 2024
8b8176b
Merge branch 'branch-24.08' of github.com:rapidsai/rmm into statistic…
madsbk May 30, 2024
a230794
style
madsbk May 30, 2024
17d9fd9
doc
madsbk May 30, 2024
23eb075
doc
madsbk Jun 4, 2024
9e92414
doc
madsbk Jun 4, 2024
42fb6c7
Merge branch 'branch-24.08' of github.com:rapidsai/rmm into statistic…
madsbk Jun 5, 2024
8b52c83
Update python/rmm/docs/guide.md
madsbk Jun 6, 2024
0b59246
Merge branch 'branch-24.08' into statistics_resource_counters_stack
madsbk Jun 6, 2024
91 changes: 76 additions & 15 deletions include/rmm/mr/device/statistics_resource_adaptor.hpp
@@ -21,6 +21,7 @@
#include <cstddef>
#include <mutex>
#include <shared_mutex>
#include <stack>

namespace rmm::mr {
/**
@@ -36,20 +37,27 @@ namespace rmm::mr {
* resource in order to satisfy allocation requests, but any existing
* allocations will be untracked. Tracking statistics stores the current, peak
* and total memory allocations for both the number of bytes and number of calls
* to the memory resource. `statistics_resource_adaptor` is intended as a debug
* adaptor and shouldn't be used in performance-sensitive code.
* to the memory resource.
*
* This resource supports nested statistics, which makes it possible to track statistics
* of a code block. Use `.push_counters()` to start tracking statistics on a code block
* and use `.pop_counters()` to stop the tracking. The nested statistics are cascading
* such that the statistics tracked by a code block include the statistics tracked in
* all its tracked sub code blocks.
*
* `statistics_resource_adaptor` is intended as a debug adaptor and shouldn't be
* used in performance-sensitive code.
*
* @tparam Upstream Type of the upstream resource used for
* allocation/deallocation.
*/
template <typename Upstream>
class statistics_resource_adaptor final : public device_memory_resource {
public:
// can be a std::shared_mutex once C++17 is adopted
using read_lock_t =
std::shared_lock<std::shared_timed_mutex>; ///< Type of lock used to synchronize read access
std::shared_lock<std::shared_mutex>; ///< Type of lock used to synchronize read access
using write_lock_t =
std::unique_lock<std::shared_timed_mutex>; ///< Type of lock used to synchronize write access
std::unique_lock<std::shared_mutex>; ///< Type of lock used to synchronize write access
/**
* @brief Utility struct for counting the current, peak, and total value of a number
*/
@@ -83,6 +91,24 @@ class statistics_resource_adaptor final : public device_memory_resource {
value -= val;
return *this;
}

/**
* @brief Add `val` to the current value and update the peak value if necessary
*
* When updating the peak value, we assume that `val` is tracking a code block inside the
* code block tracked by `this`. Because nested statistics are cascading, we have to convert
* `val.peak` to the peak it would have been if it was part of the statistics tracked by `this`.
* We do this by adding the current value that was active when `val` started tracking such that
* we get `std::max(value + val.peak, peak)`.
*
* @param val Value to add
*/
void add_counters_from_tracked_sub_block(const counter& val)
{
peak = std::max(value + val.peak, peak);
value += val.value;
total += val.total;
}
};

/**
@@ -96,6 +122,8 @@ class statistics_resource_adaptor final : public device_memory_resource {
statistics_resource_adaptor(Upstream* upstream) : upstream_{upstream}
{
RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer.");
// Initially, we push a single counter pair on the stack
push_counters();
}

statistics_resource_adaptor() = delete;
Expand Down Expand Up @@ -131,7 +159,7 @@ class statistics_resource_adaptor final : public device_memory_resource {
{
read_lock_t lock(mtx_);

return bytes_;
return counter_stack_.top().first;
}

/**
@@ -145,7 +173,40 @@
{
read_lock_t lock(mtx_);

return allocations_;
return counter_stack_.top().second;
}

/**
* @brief Push a pair of zero counters on the stack, which becomes the new
* counters returned by `get_bytes_counter()` and `get_allocations_counter()`
*
* @return top pair of counters <bytes, allocations> from the stack _before_
* the push
*/
std::pair<counter, counter> push_counters()
{
write_lock_t lock(mtx_);
auto ret = counter_stack_.top();
counter_stack_.push(std::make_pair(counter{}, counter{}));
return ret;
}

/**
* @brief Pop a pair of counters from the stack
*
* @return top pair of counters <bytes, allocations> from the stack _before_
* the pop
*/
std::pair<counter, counter> pop_counters()
{
write_lock_t lock(mtx_);
if (counter_stack_.size() < 2) { throw std::out_of_range("cannot pop the last counter pair"); }
auto ret = counter_stack_.top();
counter_stack_.pop();
// Update the new top pair of counters
counter_stack_.top().first.add_counters_from_tracked_sub_block(ret.first);
counter_stack_.top().second.add_counters_from_tracked_sub_block(ret.second);
return ret;
}

private:
@@ -171,8 +232,8 @@ class statistics_resource_adaptor final : public device_memory_resource {
write_lock_t lock(mtx_);

// Increment the allocation_count_ while we have the lock
bytes_ += bytes;
allocations_ += 1;
counter_stack_.top().first += bytes;
counter_stack_.top().second += 1;
}

return ptr;
@@ -193,8 +254,8 @@
write_lock_t lock(mtx_);

// Decrement the current allocated counts.
bytes_ -= bytes;
allocations_ -= 1;
counter_stack_.top().first -= bytes;
counter_stack_.top().second -= 1;
}
}

@@ -213,10 +274,10 @@
return get_upstream_resource() == cast->get_upstream_resource();
}

counter bytes_; // peak, current and total allocated bytes
counter allocations_; // peak, current and total allocation count
std::shared_timed_mutex mutable mtx_; // mutex for thread safe access to allocations_
Upstream* upstream_; // the upstream resource used for satisfying allocation requests
// Stack of counter pairs <bytes, allocations>
std::stack<std::pair<counter, counter>> counter_stack_;
std::shared_mutex mutable mtx_; // mutex for thread safe access to allocations_
Upstream* upstream_; // the upstream resource used for satisfying allocation requests
};

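For reference, here is a minimal sketch (not part of the diff) of how the stacked counters added above might be used from C++. It assumes the adaptor wraps a plain `rmm::mr::cuda_memory_resource` and that allocations go through the adaptor's `allocate()`/`deallocate()`; the numbers in the comments follow the merge rule in `add_counters_from_tracked_sub_block`, i.e. `peak = std::max(value + val.peak, peak)`.

```cpp
// Sketch only: assumes a CUDA device is available and the headers from this PR.
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/statistics_resource_adaptor.hpp>

int main()
{
  rmm::mr::cuda_memory_resource upstream;
  rmm::mr::statistics_resource_adaptor<rmm::mr::cuda_memory_resource> mr{&upstream};

  void* a = mr.allocate(1024);  // outer block: current=1024, peak=1024, total=1024

  mr.push_counters();           // start a nested block with zeroed counters
  void* b = mr.allocate(2048);  // nested block: current=2048, peak=2048
  mr.deallocate(b, 2048);       // nested block: current=0, peak stays 2048

  // pop_counters() returns the nested counters and merges them into the outer
  // block: peak = max(1024 + 2048, 1024) = 3072, total = 1024 + 2048 = 3072.
  auto nested      = mr.pop_counters();
  auto outer_bytes = mr.get_bytes_counter();  // current=1024, peak=3072, total=3072

  mr.deallocate(a, 1024);
  return 0;
}
```
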
7 changes: 3 additions & 4 deletions include/rmm/mr/device/tracking_resource_adaptor.hpp
@@ -53,11 +53,10 @@ namespace rmm::mr {
template <typename Upstream>
class tracking_resource_adaptor final : public device_memory_resource {
public:
// can be a std::shared_mutex once C++17 is adopted
using read_lock_t =
std::shared_lock<std::shared_timed_mutex>; ///< Type of lock used to synchronize read access
std::shared_lock<std::shared_mutex>; ///< Type of lock used to synchronize read access
using write_lock_t =
std::unique_lock<std::shared_timed_mutex>; ///< Type of lock used to synchronize write access
std::unique_lock<std::shared_mutex>; ///< Type of lock used to synchronize write access
/**
* @brief Information stored about an allocation. Includes the size
* and a stack trace if the `tracking_resource_adaptor` was initialized
@@ -271,7 +270,7 @@ class tracking_resource_adaptor final : public device_memory_resource {
bool capture_stacks_; // whether or not to capture call stacks
std::map<void*, allocation_info> allocations_; // map of active allocations
std::atomic<std::size_t> allocated_bytes_; // number of bytes currently allocated
std::shared_timed_mutex mutable mtx_; // mutex for thread safe access to allocations_
std::shared_mutex mutable mtx_; // mutex for thread safe access to allocations_
Upstream* upstream_; // the upstream resource used for satisfying allocation requests
};

103 changes: 103 additions & 0 deletions python/rmm/docs/guide.md
@@ -187,3 +187,106 @@ allocator.

>>> torch.cuda.memory.change_current_allocator(rmm_torch_allocator)
```



## Memory statistics and profiling

RMM can profile memory usage and track memory statistics in either of the following ways:
- Use the context manager `rmm.statistics.statistics()` to enable statistics tracking for a specific code block.
- Call `rmm.statistics.enable_statistics()` to enable statistics tracking globally.

Both approaches modify the currently active RMM memory resource: the current device resource is wrapped with a `StatisticsResourceAdaptor`, which must remain the topmost resource throughout the statistics tracking:
```python
>>> import rmm
>>> import rmm.statistics

>>> # We start with the default cuda memory resource
>>> rmm.mr.get_current_device_resource()
<rmm._lib.memory_resource.CudaMemoryResource at 0x7f7e6c0a1ce0>

>>> # Inside the context, the current resource is a StatisticsResourceAdaptor
>>> with rmm.statistics.statistics():
... rmm.mr.get_current_device_resource()
<rmm._lib.memory_resource.StatisticsResourceAdaptor at 0x7f7e6c524900>

>>> # We can also enable statistics globally
>>> rmm.statistics.enable_statistics()
>>> print(rmm.mr.get_current_device_resource())
<rmm._lib.memory_resource.StatisticsResourceAdaptor at 0x7f662c2bb3c0>
```

With statistics enabled, you can query the current, peak, and total number of bytes and allocations performed by the current RMM memory resource:
```python
>>> buf = rmm.DeviceBuffer(size=10)
>>> rmm.statistics.get_statistics()
Statistics(current_bytes=16, current_count=1, peak_bytes=16, peak_count=1, total_bytes=16, total_count=1)
```
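
The stacked counters can also be manipulated directly from Python. The sketch below is illustrative only: the `push_statistics()`/`pop_statistics()` helper names are assumed here (not taken from this diff) and mirror the C++ `push_counters()`/`pop_counters()`; the printed values simply follow the 16-byte allocation shown above.

```python
>>> # Helper names below are assumed; they mirror the C++ push_counters()/pop_counters().
>>> rmm.statistics.enable_statistics()
>>> buf1 = rmm.DeviceBuffer(size=10)

>>> pushed = rmm.statistics.push_statistics()  # start a nested, zeroed counter block
>>> buf2 = rmm.DeviceBuffer(size=10)
>>> rmm.statistics.get_statistics()             # only the nested block's allocations
Statistics(current_bytes=16, current_count=1, peak_bytes=16, peak_count=1, total_bytes=16, total_count=1)

>>> popped = rmm.statistics.pop_statistics()    # merge the nested counters into the outer block
>>> rmm.statistics.get_statistics()
Statistics(current_bytes=32, current_count=2, peak_bytes=32, peak_count=2, total_bytes=32, total_count=2)
```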

### Memory Profiler
To profile a specific block of code, first enable memory statistics by calling `rmm.statistics.enable_statistics()`. To profile a function, use `profiler` as a function decorator:
```python
>>> @rmm.statistics.profiler()
... def f(size):
... rmm.DeviceBuffer(size=size)
>>> f(1000)

>>> # By default, the profiler writes to rmm.statistics.default_profiler_records
>>> print(rmm.statistics.default_profiler_records.report())
Memory Profiling
================

Legends:
ncalls - number of times the function or code block was called
memory_peak - peak memory allocated in function or code block (in bytes)
memory_total - total memory allocated in function or code block (in bytes)

Ordered by: memory_peak

ncalls memory_peak memory_total filename:lineno(function)
1 1,008 1,008 <ipython-input-11-5fc63161ac29>:1(f)
```

To profile a code block, use `profiler` as a context manager:
```python
>>> with rmm.statistics.profiler(name="my code block"):
... rmm.DeviceBuffer(size=20)
>>> print(rmm.statistics.default_profiler_records.report())
Memory Profiling
================

Legends:
ncalls - number of times the function or code block was called
memory_peak - peak memory allocated in function or code block (in bytes)
memory_total - total memory allocated in function or code block (in bytes)

Ordered by: memory_peak

ncalls memory_peak memory_total filename:lineno(function)
1 1,008 1,008 <ipython-input-11-5fc63161ac29>:1(f)
1 32 32 my code block
```

The `profiler` supports nesting:
```python
>>> with rmm.statistics.profiler(name="outer"):
... buf1 = rmm.DeviceBuffer(size=10)
... with rmm.statistics.profiler(name="inner"):
... buf2 = rmm.DeviceBuffer(size=10)
>>> print(rmm.statistics.default_profiler_records.report())
Memory Profiling
================

Legends:
ncalls - number of times the function or code block was called
memory_peak - peak memory allocated in function or code block (in bytes)
memory_total - total memory allocated in function or code block (in bytes)

Ordered by: memory_peak

ncalls memory_peak memory_total filename:lineno(function)
1 1,008 1,008 <ipython-input-4-865fbe04e29f>:1(f)
1 32 32 my code block
1 32 32 outer
1 16 16 inner
```
9 changes: 9 additions & 0 deletions python/rmm/docs/python_api.rst
@@ -37,3 +37,12 @@ Memory Allocators
:members:
:undoc-members:
:show-inheritance:

Memory Statistics
-----------------

.. automodule:: rmm.statistics
:members:
:inherited-members:
:undoc-members:
:show-inheritance: