Skip to content

Commit cd37436

Browse files
committed
Remove ScratchpadAllocator from KernelManager. Remove num_threads/thread_idx from KernelManager API.
Signed-off-by: Michał Zientkiewicz <mzient@gmail.com>
1 parent 4c7e414 commit cd37436

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

54 files changed

+191
-381
lines changed

dali/kernels/audio/mel_scale/mel_filter_bank_gpu_test.cc

+3-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
1+
// Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -145,16 +145,15 @@ TEST_P(MelScaleGpuTest, MelScaleGpuTest) {
145145
args.normalize = false;
146146

147147
using Kernel = kernels::audio::MelFilterBankGpu<T>;
148-
kmgr.Initialize<Kernel>();
149-
kmgr.Resize<Kernel>(1, 1);
148+
kmgr.Resize<Kernel>(1);
150149
auto in_view = in_.gpu();
151150
auto req = kmgr.Setup<Kernel>(0, ctx, in_view, args);
152151
ASSERT_EQ(out_shape, req.output_shapes[0]);
153152
TestTensorList<float> out;
154153
out.reshape(out_shape);
155154

156155
auto out_view = out.gpu();
157-
kmgr.Run<Kernel>(0, 0, ctx, out_view, in_view);
156+
kmgr.Run<Kernel>(0, ctx, out_view, in_view);
158157
auto out_view_cpu = out.cpu();
159158
CUDA_CALL(cudaStreamSynchronize(0));
160159
for (int b = 0; b < batch_size; ++b) {

dali/kernels/common/join/tensor_join_gpu_impl_test.cu

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
1+
// Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -68,7 +68,7 @@ struct TensorJoinGPUTest : public ::testing::Test {
6868
using Kernel = TensorJoinGPU<T, new_axis>;
6969
CUDAStream stream = CUDAStream::Create(true);
7070
KernelManager mgr;
71-
mgr.Resize<Kernel>(1, 1);
71+
mgr.Resize<Kernel>(1);
7272
KernelContext ctx;
7373
ctx.gpu.stream = stream;
7474

@@ -79,7 +79,7 @@ struct TensorJoinGPUTest : public ::testing::Test {
7979
KernelRequirements &req = mgr.Setup<Kernel>(0, ctx, make_cspan(in_gpu_tls), axis);
8080
ASSERT_EQ(req.output_shapes.size(), 1);
8181
ASSERT_EQ(req.output_shapes[0], out_shape);
82-
mgr.Run<Kernel>(0, 0, ctx, out.gpu(stream), make_cspan(in_gpu_tls));
82+
mgr.Run<Kernel>(0, ctx, out.gpu(stream), make_cspan(in_gpu_tls));
8383

8484
CUDA_CALL(cudaStreamSynchronize(stream));
8585
CheckResult(stream);

dali/kernels/imgproc/jpeg/jpeg_distortion_gpu_test.cu

+4-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
1+
// Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -109,17 +109,17 @@ class JpegDistortionTestGPU : public ::testing::TestWithParam<std::tuple<bool, b
109109
CUDAEvent end = CUDAEvent::CreateWithFlags(0);
110110

111111
kmgr_.Initialize<Kernel>();
112-
kmgr_.Resize<Kernel>(1, 1);
112+
kmgr_.Resize<Kernel>(1);
113113

114114
KernelContext ctx;
115115
ctx.gpu.stream = stream;
116116
auto req = kmgr_.Setup<Kernel>(0, ctx, in_view.shape, horz_subsample, vert_subsample);
117117
if (perf_run) // warm up
118-
kmgr_.Run<Kernel>(0, 0, ctx, out_view, in_view, args...);
118+
kmgr_.Run<Kernel>(0, ctx, out_view, in_view, args...);
119119

120120
CUDA_CALL(cudaEventRecord(start, stream));
121121

122-
kmgr_.Run<Kernel>(0, 0, ctx, out_view, in_view, args...);
122+
kmgr_.Run<Kernel>(0, ctx, out_view, in_view, args...);
123123
CUDA_CALL(cudaGetLastError());
124124

125125
CUDA_CALL(cudaEventRecord(end, stream));

dali/kernels/kernel_manager.cc

-46
This file was deleted.

dali/kernels/kernel_manager.h

+26-140
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
// Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -29,18 +29,6 @@
2929
namespace dali {
3030
namespace kernels {
3131

32-
template <typename T>
33-
T atomic_max(std::atomic<T> &value, const T &store_if_greater) {
34-
T old = value.load();
35-
for (;;) {
36-
if (!(store_if_greater > old))
37-
return old;
38-
39-
if (value.compare_exchange_strong(old, store_if_greater))
40-
return store_if_greater;
41-
}
42-
}
43-
4432
struct AnyKernelInstance {
4533
KernelRequirements requirements;
4634
std::unique_ptr<void, void(*)(void*)> instance = { nullptr, free };
@@ -50,7 +38,8 @@ struct AnyKernelInstance {
5038
void (*deleter)(void *) = delete_kernel<Kernel>;
5139
if (!instance || instance.get_deleter() != deleter) {
5240
instance.reset();
53-
instance = { new Kernel{std::forward<Args>(args)...}, deleter };
41+
Kernel *k = new Kernel{std::forward<Args>(args)...};
42+
instance = { k, deleter };
5443
}
5544
return *static_cast<Kernel*>(instance.get());
5645
}
@@ -78,47 +67,36 @@ struct AnyKernelInstance {
7867
* @brief Manages multiple instances of run-time typed kernels
7968
*
8069
* KernelManager provides type erasure for kernels whose type is selected at
81-
* run-time. Kernel manager also carries out mundane tasks of keeping
82-
* ScratchpadAllocators and reserving memory according to requirements returned
83-
* by kernel's Setup method.
84-
*
85-
* A scratchpad allocator is created per-thread with thread indexing supported
86-
* explicitly by the caller.
70+
* run-time.
8771
*/
8872
class DLL_PUBLIC KernelManager {
8973
public:
9074
static constexpr size_t NumMemKinds = ScratchpadAllocator::NumMemKinds;
9175
using ScratchSizes = std::array<size_t, NumMemKinds>;
9276

9377
/**
94-
* @brief Creates `num_threads` scratchpads and `num_instances` slots for kernels
78+
* @brief Creates `num_instances` slots for kernels
9579
*
96-
* @param num_threads - number of threads that can concurrently use the kernels in the
97-
 * manager, assuming that each thread uses its unique
98-
* zero-based index
9980
* @param num_instances - number of Kernel instances to be created; typically corresponds
10081
* to number of samples (for per-sample kernels) or minibatches
10182
*/
102-
void Resize(size_t num_threads, size_t num_instances);
83+
void Resize(size_t num_instances) { instances.resize(num_instances); }
10384

10485
/**
105-
* @brief Creates `num_threads` scratchpads and `num_instances` kernels of type Kernel
106-
* constructed with `args...`.
86+
* @brief Creates `num_instances` kernels of type Kernel constructed with `args...`.
10787
*
108-
* @param num_threads - number of threads that can concurrently use the kernels in the
109-
 * manager, assuming that each thread uses its unique
110-
* zero-based index
11188
* @param num_instances - number of Kernel instances to be created; typically corresponds
11289
* to number of samples (for per-sample kernels) or minibatches
11390
* @param args - arguments passed to Kernel's constructor upon creation.
11491
* @tparam Kernel - type of the kernel to be created
11592
*/
11693
template <typename Kernel, typename... Args>
117-
void Resize(size_t num_threads, size_t num_instances, const Args&... args) {
118-
Resize(num_threads, num_instances);
94+
void Resize(size_t num_instances, const Args&... args) {
95+
Resize(num_instances);
11996
Initialize<Kernel>(args...);
12097
}
12198

99+
122100
/**
123101
* @brief Populates the instance slots with instances of a given Kernel
124102
*
@@ -132,9 +110,11 @@ class DLL_PUBLIC KernelManager {
132110
}
133111

134112
/**
135-
* @brief Clears kernel instances and scratchpads
113+
* @brief Clears kernel instances
136114
*/
137-
void Reset();
115+
void Reset() {
116+
instances.clear();
117+
}
138118

139119
/**
140120
* @brief Gets or creates a Kernel instance
@@ -172,14 +152,6 @@ class DLL_PUBLIC KernelManager {
172152
}
173153

174154
size_t NumInstances() const noexcept { return instances.size(); }
175-
size_t NumThreads() const noexcept { return scratchpads.size(); }
176-
177-
/**
178-
* @brief Gets a scratchpad allocator assigned to a given thread.
179-
*/
180-
ScratchpadAllocator &GetScratchpadAllocator(int thread_idx) {
181-
return scratchpads[thread_idx];
182-
}
183155

184156
/**
185157
* @brief Calls setup on specified kernel instance.
@@ -190,130 +162,44 @@ class DLL_PUBLIC KernelManager {
190162
* * should contain valid CUDA stream for GPU kernels;
191163
* @param in_args - pack of arguments (inputs, arguments) used in Kernel::Setup
192164
* @return Reference to internally maintained copy of the kernel requirements.
193-
* @remarks The copies of KernelRequirements for each instance index are used for allocating
194-
* scratch memory. While the function returns non-const reference, please note
195-
* that decreasing scratch sizes calculated by Setup will result in undefined
196-
* behavior, including memory corruption or illegal access.
197165
*/
198166
template <typename Kernel, typename... InArgs>
199167
KernelRequirements &Setup(int instance_idx, KernelContext &context, InArgs &&...in_args) {
200168
auto &inst = instances[instance_idx];
201169
inst.requirements = inst.get<Kernel>().Setup(context, std::forward<InArgs>(in_args)...);
202-
for (size_t i = 0; i < max_scratch_sizes.size(); i++) {
203-
atomic_max(max_scratch_sizes[i], inst.requirements.scratch_sizes[i]);
204-
}
205170
return inst.requirements;
206171
}
207172

208173
/**
209-
* @brief Calls Run on specified kernel instance using Scratchpad for given thread.
210-
*
211-
* @param thread_idx - zero-based thread index
212-
* @param instance_idx - kernel instance index; typically corresponds
213-
* to sample index (for per-sample kernels) or minibatch index
214-
* @param context - context for the kernel
215-
* * should contain valid CUDA stream for GPU kernels;
216-
* * scratchpad pointer is overriden with a scratchpad
217-
* created for given thread_idx
218-
* @param out_in_args - pack of arguments (outputs, inputs, arguments) used in Kernel::Run
219-
*/
220-
template <typename Kernel, typename... OutInArgs>
221-
void Run(int thread_idx, int instance_idx, KernelContext &context, OutInArgs &&...out_in_args) {
222-
assert(instance_idx >= 0 &&
223-
static_cast<size_t>(instance_idx) < NumInstances() &&
224-
"Kernel instance index (instance_idx) out of range");
225-
auto &inst = instances[instance_idx];
226-
DynamicScratchpad scratchpad({}, AccessOrder(context.gpu.stream));
227-
auto *old_scratchpad = context.scratchpad;
228-
context.scratchpad = &scratchpad;
229-
inst.get<Kernel>().Run(context, std::forward<OutInArgs>(out_in_args)...);
230-
context.scratchpad = old_scratchpad;
231-
}
232-
233-
/**
234-
* @brief Calls Run on specified kernel instance using Scratchpad for given thread.
174+
* @brief Calls Run on specified kernel instance
235175
*
236-
* @param sa - scratchpad allocator; memory will be reserved in it to satisfy
237-
* instance's requirements
238176
* @param instance_idx - kernel instance index; typically corresponds
239177
* to sample index (for per-sample kernels) or minibatch index
240178
* @param context - context for the kernel
241179
* * should contain valid CUDA stream for GPU kernels;
242-
* * scratchpad pointer is overriden with a scratchpad
243-
* created from `sa`
180+
* * if scratchpad pointer is null, a temporary dynamic scratchpad is
181+
* created
244182
* @param out_in_args - pack of arguments (outputs, inputs, arguments) used in Kernel::Run
245183
*/
246184
template <typename Kernel, typename... OutInArgs>
247-
void Run(ScratchpadAllocator &sa,
248-
int instance_idx,
249-
KernelContext &context,
250-
OutInArgs &&...out_in_args) {
185+
void Run(int instance_idx, KernelContext &context, OutInArgs &&...out_in_args) {
251186
assert(instance_idx >= 0 &&
252187
static_cast<size_t>(instance_idx) < NumInstances() &&
253188
"Kernel instance index (instance_idx) out of range");
254189
auto &inst = instances[instance_idx];
255-
auto scratchpad = ReserveScratchpad(sa, inst.requirements.scratch_sizes);
256-
auto *old_scratchpad = context.scratchpad;
257-
context.scratchpad = &scratchpad;
258-
inst.get<Kernel>().Run(context, std::forward<OutInArgs>(out_in_args)...);
259-
context.scratchpad = old_scratchpad;
260-
}
261-
262-
/**
263-
* @brief Makes sure ScratchpadAllocator can accommodate `sizes`
264-
*
265-
* @param sa - scratchpad allocator to reserve
266-
* @param sizes - requested minimum size
267-
*
268-
* The manager maintains a lifetime maximum of sizes requested.
269-
* If reallocation is necessary, it allocates `sizes` or that maximum
270-
* whichever is larger.
271-
*/
272-
auto ReserveScratchpad(ScratchpadAllocator &sa, const ScratchSizes &sizes)->
273-
decltype(sa.GetScratchpad());
274-
275-
/**
276-
* @brief Calls ReserveScratchpad on ScratchpadAllocator associated with given thread_idx
277-
*/
278-
inline auto ReserveScratchpad(int thread_idx, const ScratchSizes &sizes) {
279-
return ReserveScratchpad(GetScratchpadAllocator(thread_idx), sizes);
280-
}
281-
282-
/**
283-
* @brief Returns maximum scratchpad size seen so far
284-
*/
285-
inline ScratchSizes MaxScratchSizes() const {
286-
ScratchSizes sizes;
287-
for (size_t i = 0; i < sizes.size(); i++) {
288-
sizes[i] = max_scratch_sizes[i];
190+
if (!context.scratchpad) {
191+
DynamicScratchpad scratchpad({}, AccessOrder(context.gpu.stream));
192+
auto *old_scratchpad = context.scratchpad;
193+
context.scratchpad = &scratchpad;
194+
inst.get<Kernel>().Run(context, std::forward<OutInArgs>(out_in_args)...);
195+
context.scratchpad = old_scratchpad;
196+
} else {
197+
inst.get<Kernel>().Run(context, std::forward<OutInArgs>(out_in_args)...);
289198
}
290-
return sizes;
291-
}
292-
293-
/**
294-
* @brief Reserves scratchpad big enough to accommodate largest scratch area ever seen
295-
*/
296-
inline auto ReserveMaxScratchpad(int thread_idx) {
297-
return ReserveScratchpad(thread_idx, MaxScratchSizes());
298-
}
299-
300-
/**
301-
* @brief Sets a memory size hint for allocating scratchpad memory
302-
*
303-
* All calls to ScratchpadAllocator::Reserve followint this call will request at least
304-
* bytes memory for given allocation type.
305-
*/
306-
template <typename MemoryKind>
307-
void SetMemoryHint(size_t bytes) {
308-
size_t alloc_idx = static_cast<size_t>(mm::kind2id_v<MemoryKind>);
309-
assert(alloc_idx < max_scratch_sizes.size());
310-
atomic_max(max_scratch_sizes[alloc_idx], bytes);
311199
}
312200

313201
private:
314202
SmallVector<AnyKernelInstance, 1> instances;
315-
SmallVector<ScratchpadAllocator, 1> scratchpads;
316-
std::array<std::atomic_size_t, NumMemKinds> max_scratch_sizes{};
317203
};
318204

319205
} // namespace kernels

0 commit comments

Comments
 (0)