Skip to content

Commit cd37436

Browse files
committed
Remove ScratchpadAllocator from KernelManager. Remove num_threads/thread_idx from KernelManager API.
Signed-off-by: Michał Zientkiewicz <mzient@gmail.com>
1 parent 4c7e414 commit cd37436

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

54 files changed

+191
-381
lines changed

dali/kernels/audio/mel_scale/mel_filter_bank_gpu_test.cc

+3-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
1+
// Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -145,16 +145,15 @@ TEST_P(MelScaleGpuTest, MelScaleGpuTest) {
145145
args.normalize = false;
146146

147147
using Kernel = kernels::audio::MelFilterBankGpu<T>;
148-
kmgr.Initialize<Kernel>();
149-
kmgr.Resize<Kernel>(1, 1);
148+
kmgr.Resize<Kernel>(1);
150149
auto in_view = in_.gpu();
151150
auto req = kmgr.Setup<Kernel>(0, ctx, in_view, args);
152151
ASSERT_EQ(out_shape, req.output_shapes[0]);
153152
TestTensorList<float> out;
154153
out.reshape(out_shape);
155154

156155
auto out_view = out.gpu();
157-
kmgr.Run<Kernel>(0, 0, ctx, out_view, in_view);
156+
kmgr.Run<Kernel>(0, ctx, out_view, in_view);
158157
auto out_view_cpu = out.cpu();
159158
CUDA_CALL(cudaStreamSynchronize(0));
160159
for (int b = 0; b < batch_size; ++b) {

dali/kernels/common/join/tensor_join_gpu_impl_test.cu

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
1+
// Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -68,7 +68,7 @@ struct TensorJoinGPUTest : public ::testing::Test {
6868
using Kernel = TensorJoinGPU<T, new_axis>;
6969
CUDAStream stream = CUDAStream::Create(true);
7070
KernelManager mgr;
71-
mgr.Resize<Kernel>(1, 1);
71+
mgr.Resize<Kernel>(1);
7272
KernelContext ctx;
7373
ctx.gpu.stream = stream;
7474

@@ -79,7 +79,7 @@ struct TensorJoinGPUTest : public ::testing::Test {
7979
KernelRequirements &req = mgr.Setup<Kernel>(0, ctx, make_cspan(in_gpu_tls), axis);
8080
ASSERT_EQ(req.output_shapes.size(), 1);
8181
ASSERT_EQ(req.output_shapes[0], out_shape);
82-
mgr.Run<Kernel>(0, 0, ctx, out.gpu(stream), make_cspan(in_gpu_tls));
82+
mgr.Run<Kernel>(0, ctx, out.gpu(stream), make_cspan(in_gpu_tls));
8383

8484
CUDA_CALL(cudaStreamSynchronize(stream));
8585
CheckResult(stream);

dali/kernels/imgproc/jpeg/jpeg_distortion_gpu_test.cu

+4-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
1+
// Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -109,17 +109,17 @@ class JpegDistortionTestGPU : public ::testing::TestWithParam<std::tuple<bool, b
109109
CUDAEvent end = CUDAEvent::CreateWithFlags(0);
110110

111111
kmgr_.Initialize<Kernel>();
112-
kmgr_.Resize<Kernel>(1, 1);
112+
kmgr_.Resize<Kernel>(1);
113113

114114
KernelContext ctx;
115115
ctx.gpu.stream = stream;
116116
auto req = kmgr_.Setup<Kernel>(0, ctx, in_view.shape, horz_subsample, vert_subsample);
117117
if (perf_run) // warm up
118-
kmgr_.Run<Kernel>(0, 0, ctx, out_view, in_view, args...);
118+
kmgr_.Run<Kernel>(0, ctx, out_view, in_view, args...);
119119

120120
CUDA_CALL(cudaEventRecord(start, stream));
121121

122-
kmgr_.Run<Kernel>(0, 0, ctx, out_view, in_view, args...);
122+
kmgr_.Run<Kernel>(0, ctx, out_view, in_view, args...);
123123
CUDA_CALL(cudaGetLastError());
124124

125125
CUDA_CALL(cudaEventRecord(end, stream));

dali/kernels/kernel_manager.cc

-46
This file was deleted.

dali/kernels/kernel_manager.h

+26-140
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
// Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -29,18 +29,6 @@
2929
namespace dali {
3030
namespace kernels {
3131

32-
template <typename T>
33-
T atomic_max(std::atomic<T> &value, const T &store_if_greater) {
34-
T old = value.load();
35-
for (;;) {
36-
if (!(store_if_greater > old))
37-
return old;
38-
39-
if (value.compare_exchange_strong(old, store_if_greater))
40-
return store_if_greater;
41-
}
42-
}
43-
4432
struct AnyKernelInstance {
4533
KernelRequirements requirements;
4634
std::unique_ptr<void, void(*)(void*)> instance = { nullptr, free };
@@ -50,7 +38,8 @@ struct AnyKernelInstance {
5038
void (*deleter)(void *) = delete_kernel<Kernel>;
5139
if (!instance || instance.get_deleter() != deleter) {
5240
instance.reset();
53-
instance = { new Kernel{std::forward<Args>(args)...}, deleter };
41+
Kernel *k = new Kernel{std::forward<Args>(args)...};
42+
instance = { k, deleter };
5443
}
5544
return *static_cast<Kernel*>(instance.get());
5645
}
@@ -78,47 +67,36 @@ struct AnyKernelInstance {
7867
* @brief Manages multiple instances of run-time typed kernels
7968
*
8069
* KernelManager provides type erasure for kernels whose type is selected at
81-
* run-time. Kernel manager also carries out mundane tasks of keeping
82-
* ScratchpadAllocators and reserving memory according to requirements returned
83-
* by kernel's Setup method.
84-
*
85-
* A scratchpad allocator is created per-thread with thread indexing supported
86-
* explicitly by the caller.
70+
* run-time.
8771
*/
8872
class DLL_PUBLIC KernelManager {
8973
public:
9074
static constexpr size_t NumMemKinds = ScratchpadAllocator::NumMemKinds;
9175
using ScratchSizes = std::array<size_t, NumMemKinds>;
9276

9377
/**
94-
* @brief Creates `num_threads` scratchpads and `num_instances` slots for kernels
78+
* @brief Creates `num_instances` slots for kernels
9579
*
96-
* @param num_threads - number of threads that can concurrently use the kernels in the
97-
 * manager, assuming that each thread uses its unique
98-
* zero-based index
9980
* @param num_instances - number of Kernel instances to be created; typically corresponds
10081
* to number of samples (for per-sample kernels) or minibatches
10182
*/
102-
void Resize(size_t num_threads, size_t num_instances);
83+
void Resize(size_t num_instances) { instances.resize(num_instances); }
10384

10485
/**
105-
* @brief Creates `num_threads` scratchpads and `num_instances` kernels of type Kernel
106-
* constructed with `args...`.
86+
* @brief Creates `num_instances` kernels of type Kernel constructed with `args...`.
10787
*
108-
* @param num_threads - number of threads that can concurrently use the kernels in the
109-
 * manager, assuming that each thread uses its unique
110-
* zero-based index
11188
* @param num_instances - number of Kernel instances to be created; typically corresponds
11289
* to number of samples (for per-sample kernels) or minibatches
11390
* @param args - arguments passed to Kernel's constructor upon creation.
11491
* @tparam Kernel - type of the kernel to be created
11592
*/
11693
template <typename Kernel, typename... Args>
117-
void Resize(size_t num_threads, size_t num_instances, const Args&... args) {
118-
Resize(num_threads, num_instances);
94+
void Resize(size_t num_instances, const Args&... args) {
95+
Resize(num_instances);
11996
Initialize<Kernel>(args...);
12097
}
12198

99+
122100
/**
123101
* @brief Populates the instance slots with instances of a given Kernel
124102
*
@@ -132,9 +110,11 @@ class DLL_PUBLIC KernelManager {
132110
}
133111

134112
/**
135-
* @brief Clears kernel instances and scratchpads
113+
* @brief Clears kernel instances
136114
*/
137-
void Reset();
115+
void Reset() {
116+
instances.clear();
117+
}
138118

139119
/**
140120
* @brief Gets or creates a Kernel instance
@@ -172,14 +152,6 @@ class DLL_PUBLIC KernelManager {
172152
}
173153

174154
size_t NumInstances() const noexcept { return instances.size(); }
175-
size_t NumThreads() const noexcept { return scratchpads.size(); }
176-
177-
/**
178-
* @brief Gets a scratchpad allocator assigned to a given thread.
179-
*/
180-
ScratchpadAllocator &GetScratchpadAllocator(int thread_idx) {
181-
return scratchpads[thread_idx];
182-
}
183155

184156
/**
185157
* @brief Calls setup on specified kernel instance.
@@ -190,130 +162,44 @@ class DLL_PUBLIC KernelManager {
190162
* * should contain valid CUDA stream for GPU kernels;
191163
* @param in_args - pack of arguments (inputs, arguments) used in Kernel::Setup
192164
* @return Reference to internally maintained copy of the kernel requirements.
193-
* @remarks The copies of KernelRequirements for each instance index are used for allocating
194-
* scratch memory. While the function returns non-const reference, please note
195-
* that decreasing scratch sizes calculated by Setup will result in undefined
196-
* behavior, including memory corruption or illegal access.
197165
*/
198166
template <typename Kernel, typename... InArgs>
199167
KernelRequirements &Setup(int instance_idx, KernelContext &context, InArgs &&...in_args) {
200168
auto &inst = instances[instance_idx];
201169
inst.requirements = inst.get<Kernel>().Setup(context, std::forward<InArgs>(in_args)...);
202-
for (size_t i = 0; i < max_scratch_sizes.size(); i++) {
203-
atomic_max(max_scratch_sizes[i], inst.requirements.scratch_sizes[i]);
204-
}
205170
return inst.requirements;
206171
}
207172

208173
/**
209-
* @brief Calls Run on specified kernel instance using Scratchpad for given thread.
210-
*
211-
* @param thread_idx - zero-based thread index
212-
* @param instance_idx - kernel instance index; typically corresponds
213-
* to sample index (for per-sample kernels) or minibatch index
214-
* @param context - context for the kernel
215-
* * should contain valid CUDA stream for GPU kernels;
216-
* * scratchpad pointer is overriden with a scratchpad
217-
* created for given thread_idx
218-
* @param out_in_args - pack of arguments (outputs, inputs, arguments) used in Kernel::Run
219-
*/
220-
template <typename Kernel, typename... OutInArgs>
221-
void Run(int thread_idx, int instance_idx, KernelContext &context, OutInArgs &&...out_in_args) {
222-
assert(instance_idx >= 0 &&
223-
static_cast<size_t>(instance_idx) < NumInstances() &&
224-
"Kernel instance index (instance_idx) out of range");
225-
auto &inst = instances[instance_idx];
226-
DynamicScratchpad scratchpad({}, AccessOrder(context.gpu.stream));
227-
auto *old_scratchpad = context.scratchpad;
228-
context.scratchpad = &scratchpad;
229-
inst.get<Kernel>().Run(context, std::forward<OutInArgs>(out_in_args)...);
230-
context.scratchpad = old_scratchpad;
231-
}
232-
233-
/**
234-
* @brief Calls Run on specified kernel instance using Scratchpad for given thread.
174+
* @brief Calls Run on specified kernel instance
235175
*
236-
* @param sa - scratchpad allocator; memory will be reserved in it to satisfy
237-
* instance's requirements
238176
* @param instance_idx - kernel instance index; typically corresponds
239177
* to sample index (for per-sample kernels) or minibatch index
240178
* @param context - context for the kernel
241179
* * should contain valid CUDA stream for GPU kernels;
242-
* * scratchpad pointer is overriden with a scratchpad
243-
* created from `sa`
180+
* * if scratchpad pointer is null, a temporary dynamic scratchpad is
181+
* created
244182
* @param out_in_args - pack of arguments (outputs, inputs, arguments) used in Kernel::Run
245183
*/
246184
template <typename Kernel, typename... OutInArgs>
247-
void Run(ScratchpadAllocator &sa,
248-
int instance_idx,
249-
KernelContext &context,
250-
OutInArgs &&...out_in_args) {
185+
void Run(int instance_idx, KernelContext &context, OutInArgs &&...out_in_args) {
251186
assert(instance_idx >= 0 &&
252187
static_cast<size_t>(instance_idx) < NumInstances() &&
253188
"Kernel instance index (instance_idx) out of range");
254189
auto &inst = instances[instance_idx];
255-
auto scratchpad = ReserveScratchpad(sa, inst.requirements.scratch_sizes);
256-
auto *old_scratchpad = context.scratchpad;
257-
context.scratchpad = &scratchpad;
258-
inst.get<Kernel>().Run(context, std::forward<OutInArgs>(out_in_args)...);
259-
context.scratchpad = old_scratchpad;
260-
}
261-
262-
/**
263-
* @brief Makes sure ScratchpadAllocator can accommodate `sizes`
264-
*
265-
* @param sa - scratchpad allocator to reserve
266-
* @param sizes - requested minimum size
267-
*
268-
* The manager maintains a lifetime maximum of sizes requested.
269-
* If reallocation is necessary, it allocates `sizes` or that maximum
270-
* whichever is larger.
271-
*/
272-
auto ReserveScratchpad(ScratchpadAllocator &sa, const ScratchSizes &sizes)->
273-
decltype(sa.GetScratchpad());
274-
275-
/**
276-
* @brief Calls ReserveScratchpad on ScratchpadAllocator associated with given thread_idx
277-
*/
278-
inline auto ReserveScratchpad(int thread_idx, const ScratchSizes &sizes) {
279-
return ReserveScratchpad(GetScratchpadAllocator(thread_idx), sizes);
280-
}
281-
282-
/**
283-
* @brief Returns maximum scratchpad size seen so far
284-
*/
285-
inline ScratchSizes MaxScratchSizes() const {
286-
ScratchSizes sizes;
287-
for (size_t i = 0; i < sizes.size(); i++) {
288-
sizes[i] = max_scratch_sizes[i];
190+
if (!context.scratchpad) {
191+
DynamicScratchpad scratchpad({}, AccessOrder(context.gpu.stream));
192+
auto *old_scratchpad = context.scratchpad;
193+
context.scratchpad = &scratchpad;
194+
inst.get<Kernel>().Run(context, std::forward<OutInArgs>(out_in_args)...);
195+
context.scratchpad = old_scratchpad;
196+
} else {
197+
inst.get<Kernel>().Run(context, std::forward<OutInArgs>(out_in_args)...);
289198
}
290-
return sizes;
291-
}
292-
293-
/**
294-
* @brief Reserves scratchpad big enough to accommodate largest scratch area ever seen
295-
*/
296-
inline auto ReserveMaxScratchpad(int thread_idx) {
297-
return ReserveScratchpad(thread_idx, MaxScratchSizes());
298-
}
299-
300-
/**
301-
* @brief Sets a memory size hint for allocating scratchpad memory
302-
*
303-
* All calls to ScratchpadAllocator::Reserve followint this call will request at least
304-
* bytes memory for given allocation type.
305-
*/
306-
template <typename MemoryKind>
307-
void SetMemoryHint(size_t bytes) {
308-
size_t alloc_idx = static_cast<size_t>(mm::kind2id_v<MemoryKind>);
309-
assert(alloc_idx < max_scratch_sizes.size());
310-
atomic_max(max_scratch_sizes[alloc_idx], bytes);
311199
}
312200

313201
private:
314202
SmallVector<AnyKernelInstance, 1> instances;
315-
SmallVector<ScratchpadAllocator, 1> scratchpads;
316-
std::array<std::atomic_size_t, NumMemKinds> max_scratch_sizes{};
317203
};
318204

319205
} // namespace kernels

0 commit comments

Comments
 (0)