NVIDIA · mzient · Feb 10, 2022 · Sep 14, 2021 · Jan 19, 2022 · Feb 9, 2022
diff --git a/dali/kernels/dynamic_scratchpad.h b/dali/kernels/dynamic_scratchpad.h
@@ -0,0 +1,179 @@
+// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef DALI_KERNELS_DYNAMIC_SCRATCHPAD_H_
+#define DALI_KERNELS_DYNAMIC_SCRATCHPAD_H_
+
+#include <array>
+#include <cassert>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include "dali/core/static_switch.h"
+#include "dali/core/mm/fixed_order_resource.h"
+#include "dali/core/mm/memory.h"
+#include "dali/core/mm/memory_kind.h"
+#include "dali/core/mm/monotonic_resource.h"
+#include "dali/kernels/context.h"
+#include "dali/kernels/kernel_req.h"
+
+namespace dali {
+namespace kernels {
+
+namespace detail {
+
+/**
+ * @brief Yields the zero-based position of the first occurrence of `Needle` in `Haystack...`
+ *
+ * The position is available as the static member `value` of type `int`.
+ * When `Needle` is not present in the pack, the recursion ends up at this undefined
+ * primary template and the program fails to compile.
+ */
+template <typename Needle, typename... Haystack>
+struct index_in_pack;
+
+// The searched type is at the front of the pack - position 0.
+template <typename Needle, typename... Rest>
+struct index_in_pack<Needle, Needle, Rest...>
+    : std::integral_constant<int, 0> {};
+
+// The front of the pack is a different type - skip it and count one position more.
+template <typename Needle, typename Head, typename... Rest>
+struct index_in_pack<Needle, Head, Rest...>
+    : std::integral_constant<int, 1 + index_in_pack<Needle, Rest...>::value> {};
+
+/**
+ * @brief Implements upstream handling and ordered wrappers.
+ *
+ * For every memory kind in `Kinds` this class keeps:
+ * - a `mm::fixed_order_resource` adapter, assigned only by the overload of
+ *   `set_upstream_resource` that takes an `mm::async_memory_resource`,
+ * - a `mm::monotonic_memory_resource`, assigned by `set_upstream_resource`,
+ * - an initial size, passed to the monotonic resource when it is (re)created.
+ *
+ * The per-kind objects are looked up by type, so each kind should appear in `Kinds` only once.
+ *
+ * @tparam Kinds memory kinds managed by this object
+ */
+template <typename... Kinds>
+class DynamicScratchpadImplT {
+ protected:
+  /**
+   * @brief Replaces the monotonic resource for `Kind` with one that uses `rsrc` as upstream.
+   *
+   * The new resource is constructed with the value currently stored in
+   * `initial_size<Kind>()`, so the initial size should be set before this call.
+   *
+   * @param rsrc upstream resource handed to the monotonic resource
+   */
+  template <typename Kind>
+  void set_upstream_resource(mm::memory_resource<Kind> *rsrc) {
+    resource<Kind>() = mm::monotonic_memory_resource<Kind>(rsrc, initial_size<Kind>());
+  }
+
+  /**
+   * @brief Wraps an async resource in the fixed-order adapter for `Kind` and uses
+   *        the adapter as the upstream of the monotonic resource.
+   *
+   * @param rsrc          async upstream resource
+   * @param alloc_order   allocation order stored in the adapter
+   * @param dealloc_order deallocation order stored in the adapter; default-constructed
+   *                      if not specified
+   */
+  template <typename Kind>
+  void set_upstream_resource(mm::async_memory_resource<Kind> *rsrc,
+                             AccessOrder alloc_order,
+                             AccessOrder dealloc_order = {}) {
+    static_assert(!std::is_same<Kind, mm::memory_kind::host>::value,
+      "Cannot use a stream-ordered resource for plain host memory");
+    adapter<Kind>() = { rsrc, alloc_order, dealloc_order };
+    // The monotonic resource receives a pointer to the adapter stored in this object.
+    set_upstream_resource<Kind>(&adapter<Kind>());
+  }
+
+  /// @brief Mutable access to the initial size stored for `Kind`
+  template <typename Kind>
+  size_t &initial_size() {
+    return initial_sizes_[index_in_pack<Kind, Kinds...>::value];
+  }
+
+  /// @brief The initial size stored for `Kind`
+  template <typename Kind>
+  size_t initial_size() const {
+    return initial_sizes_[index_in_pack<Kind, Kinds...>::value];
+  }
+
+  /**
+   * @brief Returns the upstream resource of the monotonic resource for `Kind`
+   *
+   * NOTE(review): relies on mm::monotonic_memory_resource providing a const-callable
+   * `get_upstream()` - confirm against monotonic_resource.h.
+   */
+  template <typename Kind>
+  mm::memory_resource<Kind> *get_upstream() const {
+    // std::get yields a reference, not a pointer, and the result must be returned.
+    return resource<Kind>().get_upstream();
+  }
+
+  /// @brief The fixed-order adapter stored for `Kind`
+  template <typename Kind>
+  auto &adapter() {
+    return std::get<mm::fixed_order_resource<Kind>>(adapters_);
+  }
+
+  /// @brief The fixed-order adapter stored for `Kind` (const reference)
+  template <typename Kind>
+  auto &adapter() const {
+    return std::get<mm::fixed_order_resource<Kind>>(adapters_);
+  }
+
+  /// @brief The monotonic resource stored for `Kind`
+  template <typename Kind>
+  auto &resource() {
+    return std::get<mm::monotonic_memory_resource<Kind>>(resources_);
+  }
+
+  /// @brief The monotonic resource stored for `Kind` (const reference)
+  template <typename Kind>
+  auto &resource() const {
+    return std::get<mm::monotonic_memory_resource<Kind>>(resources_);
+  }
+
+  // The adapters are declared before the resources that may point to them; members are
+  // destroyed in reverse declaration order, so the resources go away while the adapters
+  // still exist.
+  std::tuple<mm::fixed_order_resource<Kinds>...>      adapters_;
+  std::tuple<mm::monotonic_memory_resource<Kinds>...> resources_;
+  std::array<size_t, sizeof...(Kinds)>                initial_sizes_ = {};
+};
+
+// Scratchpad implementation base instantiated for the host, pinned, device and
+// managed memory kinds, in this order.
+using DynamicScratchpadImpl = DynamicScratchpadImplT<
+      mm::memory_kind::host,
+      mm::memory_kind::pinned,
+      mm::memory_kind::device,
+      mm::memory_kind::managed>;
+
+}  // namespace detail
+
+/**
+ * @brief A Scratchpad that serves each memory kind from a monotonic resource whose
+ *        upstream is the default memory resource of that kind.
+ *
+ * NOTE(review): the implementation base passes the address of its own adapters to the
+ * monotonic resources, so a copied or moved scratchpad may keep referring to the adapters
+ * of the source object - confirm whether copying/moving should be disabled.
+ */
+class DynamicScratchpad
+  : public Scratchpad
+  , private detail::DynamicScratchpadImpl {
+ public:
+  /**
+   * @brief Constructs a dynamically allocated scratchpad
+   *
+   * @param initial_sizes   Sizes, in bytes, of the initial buffers. Note that these buffers
+   *                        are allocated lazily, so nothing is allocated if there's no request
+   *                        for memory of any given kind. A size of 0 is replaced by 4096.
+   * @param device_order    Allocation and deallocation order for device memory.
+   * @param pinned_dealloc_order  Deallocation order for pinned memory. Allocation is always
+   *                              host-ordered. If not set, device_order is used.
+   * @param managed_dealloc_order Deallocation order for managed memory. Allocation is always
+   *                              host-ordered. If not set, device_order is used.
+   */
+  explicit DynamicScratchpad(scratch_sizes_t initial_sizes = {},
+                             AccessOrder device_order = cudaStream_t(0),
+                             AccessOrder pinned_dealloc_order = {},
+                             AccessOrder managed_dealloc_order = {}) {
+    // Size used for the kinds for which the caller didn't request any particular size.
+    constexpr size_t kDefaultInitialSize = 4096;
+
+    // The sizes must be in place before the resources are created below.
+    initial_sizes_ = initial_sizes;
+    for (auto &s : initial_sizes_) {
+      if (s == 0)
+        s = kDefaultInitialSize;
+    }
+    if (!pinned_dealloc_order.has_value())
+      pinned_dealloc_order = device_order;
+    if (!managed_dealloc_order.has_value())
+      managed_dealloc_order = device_order;
+
+    set_upstream_resource<mm::memory_kind::host>(mm::GetDefaultResource<mm::memory_kind::host>());
+
+    set_upstream_resource<mm::memory_kind::pinned>(
+        mm::GetDefaultResource<mm::memory_kind::pinned>(),
+        AccessOrder::host(),
+        pinned_dealloc_order);
+
+    set_upstream_resource<mm::memory_kind::device>(
+        mm::GetDefaultResource<mm::memory_kind::device>(),
+        device_order);
+
+    set_upstream_resource<mm::memory_kind::managed>(
+        mm::GetDefaultResource<mm::memory_kind::managed>(),
+        AccessOrder::host(),
+        managed_dealloc_order);
+  }
+
+  /**
+   * @brief Allocates memory from the monotonic resource of the requested kind
+   *
+   * @param kind_id   identifier of the memory kind (host, pinned, device or managed)
+   * @param bytes     size of the allocation, in bytes
+   * @param alignment alignment passed to the resource
+   * @return Pointer obtained from the resource; for an unrecognized kind id the assertion
+   *         fires and, when assertions are disabled, a null pointer is returned.
+   */
+  void *Alloc(mm::memory_kind_id kind_id, size_t bytes, size_t alignment) override {
+    void *ret = nullptr;
+    TYPE_SWITCH(kind_id, mm::kind2id, Kind,
+      (mm::memory_kind::host,
+       mm::memory_kind::pinned,
+       mm::memory_kind::device,
+       mm::memory_kind::managed),
+      (ret = resource<Kind>().allocate(bytes, alignment)),
+      (assert(!"Incorrect memory kind id");));
+    return ret;
+  }
+};
+
+}  // namespace kernels
+}  // namespace dali
+
+#endif  // DALI_KERNELS_DYNAMIC_SCRATCHPAD_H_
+
diff --git a/dali/kernels/dynamic_scratchpad_test.cc b/dali/kernels/dynamic_scratchpad_test.cc
@@ -0,0 +1,154 @@
+// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dali/kernels/dynamic_scratchpad.h"  // NOLINT
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <chrono>
+#include <cstddef>
+#include <cstring>
+#include <functional>
+#include <iostream>
+#include <numeric>
+#include <optional>
+#include <random>
+#include <string>
+#include <vector>
+#include "dali/core/cuda_utils.h"
+#include "dali/core/cuda_stream_pool.h"
+#include "dali/core/mm/memory.h"
+
+namespace dali {
+namespace kernels {
+namespace test {
+
+/**
+ * @brief Tests basic dynamic scratchpad functioning
+ *
+ * This test checks that:
+ * - the memory is usable and accessible on the right backend
+ * - the pinned memory block is released in stream order, which prevents
+ *   immediate reuse if the stream is still running
+ * - it makes multiple attempts to catch the stream still running
+ */
+TEST(DynamicScratchpad, BasicTest) {
+  const int N = 64 << 10;  // 64 KiB
+
+  std::vector<char> in(N);
+  for (int i = 0; i < N; i++)
+    in[i] = i + 42;  // so it doesn't start or end with 0
+
+  auto stream = CUDAStreamPool::instance().Get();
+  auto dev = mm::alloc_raw_unique<char, mm::memory_kind::device>(N);
+  const int max_attempts = 1000;
+  bool was_running = false;
+  // std::less provides a total order even for pointers into unrelated allocations
+  const std::less<const char *> before{};
+  for (int attempt = 0; attempt < max_attempts; attempt++) {
+    char *pinned = nullptr;
+    {
+      DynamicScratchpad scratch({}, AccessOrder(stream));
+      pinned = scratch.Allocate<mm::memory_kind::pinned, char>(N);
+      memcpy(pinned, in.data(), N);
+      CUDA_CALL(cudaMemcpyAsync(dev.get(), pinned, N, cudaMemcpyHostToDevice, stream));
+    }
+    // From here on `pinned` is used only as an address - the scratchpad is gone.
+    auto out = mm::alloc_raw_unique<char, mm::memory_kind::pinned>(N);
+    bool running = false;
+    if (was_running) {
+      CUDA_CALL(cudaStreamSynchronize(stream));
+    } else {
+      running = cudaStreamQuery(stream) == cudaErrorNotReady;
+      if (running)
+        was_running = true;
+    }
+    // The half-open ranges [out, out + N) and [pinned, pinned + N) are disjoint when one
+    // of them ends at or before the start of the other; touching ranges don't overlap.
+    const char *out_begin = out.get();
+    const bool out_ends_before_pinned = !before(pinned, out_begin + N);  // out + N <= pinned
+    const bool out_starts_after_pinned = !before(out_begin, pinned + N);  // out >= pinned + N
+    ASSERT_TRUE(out_ends_before_pinned || out_starts_after_pinned || !running);
+    CUDA_CALL(cudaMemcpyAsync(out.get(), dev.get(), N, cudaMemcpyDeviceToHost, stream));
+    CUDA_CALL(cudaStreamSynchronize(stream));
+    ASSERT_EQ(memcmp(in.data(), out.get(), N), 0);
+    // Stop after one iteration made with the stream caught running and one after it.
+    if (was_running && !running)
+      break;
+  }
+  if (!was_running)
+    std::cerr << "Warning: Test incomplete - the stream was never caught still running"
+              << std::endl;
+}
+
+/**
+ * @brief Prints summary statistics of the measured times to stdout
+ *
+ * Prints the header followed by the median, the 90th and 99th percentile, the mean and
+ * the mean of the samples left after dropping size/100 smallest and size/100 largest ones.
+ *
+ * @param times   measured times, labelled as nanoseconds in the output; the vector is
+ *                sorted in place
+ * @param header  text printed before the statistics
+ */
+inline void ProcessResults(std::vector<double> &times, const std::string &header) {
+  if (times.empty()) {
+    // Nothing to index or to divide by - report it instead of reading out of range.
+    std::cout << header << "\n"
+              << "No samples collected\n";
+    return;
+  }
+  std::sort(times.begin(), times.end());
+  const size_t count = times.size();
+  // The initial value must be a double - an int literal would make std::accumulate
+  // truncate every partial sum to int.
+  const double sum = std::accumulate(times.begin(), times.end(), 0.0);
+  const size_t trim = count / 100;
+  const auto b98 = times.begin() + trim;
+  const auto e98 = times.end() - trim;
+  const double sum98 = std::accumulate(b98, e98, 0.0);
+  std::cout << header << "\n"
+            << "Median time:            " << times[count/2] << " ns\n"
+            << "90th percentile:        " << times[count*90/100] << " ns\n"
+            << "99th percentile:        " << times[count*99/100] << " ns\n"
+            << "Mean time:              " << sum/count << " ns\n"
+            << "Mean time (middle 98%): " << sum98/(e98-b98) << " ns\n";
+}
+
+/**
+ * @brief Measures the time of scratchpad allocations and of scratchpad destruction
+ *
+ * In each attempt a scratchpad is created for one of two alternately used streams,
+ * a random number (1-100) of randomly sized blocks is allocated for each memory kind
+ * except managed, and the scratchpad is destroyed. The statistics are printed to stdout.
+ */
+TEST(DynamicScratchpad, Perf) {
+  std::poisson_distribution size_dist(1024);  // 1 KiB average
+  int max_size = 64 << 20;  // 64 MiB max
+  std::uniform_int_distribution<> num_dist(1, 100);
+
+  std::mt19937_64 rng(1234);
+
+  auto stream1 = CUDAStreamPool::instance().Get();
+  auto stream2 = CUDAStreamPool::instance().Get();
+  cudaStream_t streams[] = { stream1, stream2 };
+
+  int max_attempts = 100000;
+
+  using perf_clock = std::chrono::high_resolution_clock;
+  // The results are printed as nanoseconds, so convert explicitly instead of relying on
+  // the tick period of the clock.
+  auto elapsed_ns = [](perf_clock::time_point start, perf_clock::time_point end) {
+    return std::chrono::duration<double, std::nano>(end - start).count();
+  };
+
+  const int nkinds = static_cast<int>(mm::memory_kind_id::count);
+  std::vector<double> alloc_times[nkinds];
+  std::vector<double> destroy_times;
+  for (auto &v : alloc_times)
+    v.reserve(max_attempts*100);
+  destroy_times.reserve(max_attempts);
+
+  for (int attempt = 0; attempt < max_attempts; attempt++) {
+    cudaStream_t stream = streams[attempt % 2];
+    // std::optional keeps the scratchpad in automatic storage and allows timing its
+    // destruction via reset(), while still destroying it if an allocation throws.
+    std::optional<DynamicScratchpad> scratch;
+    scratch.emplace(scratch_sizes_t{}, AccessOrder(stream));
+    for (int k = 0; k < nkinds; k++) {
+      auto kind = static_cast<mm::memory_kind_id>(k);
+      if (kind == mm::memory_kind_id::managed)
+        continue;
+      int n = num_dist(rng);
+      for (int i = 0; i < n; i++) {
+        size_t size = std::min(size_dist(rng), max_size);
+        auto start = perf_clock::now();
+        scratch->Alloc(kind, size, alignof(std::max_align_t));
+        auto end = perf_clock::now();
+        alloc_times[k].push_back(elapsed_ns(start, end));
+      }
+    }
+    {
+      auto start = perf_clock::now();
+      scratch.reset();
+      auto end = perf_clock::now();
+      destroy_times.push_back(elapsed_ns(start, end));
+    }
+  }
+
+  const char *names[] = { "host", "pinned", "device", "managed" };
+
+  for (int k = 0; k < nkinds; k++) {
+    // Managed memory was not measured above - there's nothing to report.
+    if (static_cast<mm::memory_kind_id>(k) == mm::memory_kind_id::managed)
+      continue;
+    ProcessResults(alloc_times[k],
+                   make_string("Allocation performance for ", names[k], " memory"));
+  }
+
+  ProcessResults(destroy_times, "Scratchpad destruction time");
+}
+
+
+}  // namespace test
+}  // namespace kernels
+}  // namespace dali