1
- // Copyright (c) 2019-2021 , NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1
+ // Copyright (c) 2019-2022 , NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
//
3
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
4
// you may not use this file except in compliance with the License.
29
29
namespace dali {
30
30
namespace kernels {
31
31
32
- template <typename T>
33
- T atomic_max (std::atomic<T> &value, const T &store_if_greater) {
34
- T old = value.load ();
35
- for (;;) {
36
- if (!(store_if_greater > old))
37
- return old;
38
-
39
- if (value.compare_exchange_strong (old, store_if_greater))
40
- return store_if_greater;
41
- }
42
- }
43
-
44
32
struct AnyKernelInstance {
45
33
KernelRequirements requirements;
46
34
std::unique_ptr<void , void (*)(void *)> instance = { nullptr , free };
@@ -50,7 +38,8 @@ struct AnyKernelInstance {
50
38
void (*deleter)(void *) = delete_kernel<Kernel>;
51
39
if (!instance || instance.get_deleter () != deleter) {
52
40
instance.reset ();
53
- instance = { new Kernel{std::forward<Args>(args)...}, deleter };
41
+ Kernel *k = new Kernel{std::forward<Args>(args)...};
42
+ instance = { k, deleter };
54
43
}
55
44
return *static_cast <Kernel*>(instance.get ());
56
45
}
@@ -78,47 +67,36 @@ struct AnyKernelInstance {
78
67
* @brief Manages multiple instances of run-time typed kernels
79
68
*
80
69
* KernelManager provides type erasure for kernels whose type is selected at
81
- * run-time. Kernel manager also carries out mundane tasks of keeping
82
- * ScratchpadAllocators and reserving memory according to requirements returned
83
- * by kernel's Setup method.
84
- *
85
- * A scratchpad allocator is created per-thread with thread indexing supported
86
- * explicitly by the caller.
70
+ * run-time.
87
71
*/
88
72
class DLL_PUBLIC KernelManager {
89
73
public:
90
74
static constexpr size_t NumMemKinds = ScratchpadAllocator::NumMemKinds;
91
75
using ScratchSizes = std::array<size_t , NumMemKinds>;
92
76
93
77
/* *
94
- * @brief Creates `num_threads` scratchpads and ` num_instances` slots for kernels
78
+ * @brief Creates `num_instances` slots for kernels
95
79
*
96
- * @param num_threads - number of threads that can concurrently use the kernels in the
97
- * manager, assuming that each threads uses its unique
98
- * zero-based index
99
80
* @param num_instances - number of Kernel instances to be created; typically corresponds
100
81
* to number of samples (for per-sample kernels) or minibatches
101
82
*/
102
- void Resize (size_t num_threads, size_t num_instances);
83
+ void Resize (size_t num_instances) { instances. resize ( num_instances); }
103
84
104
85
/* *
105
- * @brief Creates `num_threads` scratchpads and `num_instances` kernels of type Kernel
106
- * constructed with `args...`.
86
+ * @brief Creates `num_instances` kernels of type Kernel constructed with `args...`.
107
87
*
108
- * @param num_threads - number of threads that can concurrently use the kernels in the
109
- * manager, assuming that each threads uses its unique
110
- * zero-based index
111
88
* @param num_instances - number of Kernel instances to be created; typically corresponds
112
89
* to number of samples (for per-sample kernels) or minibatches
113
90
* @param args - arguments passed to Kernel's constructor upon creation.
114
91
* @tparam Kernel - type of the kernel to be created
115
92
*/
116
93
template <typename Kernel, typename ... Args>
117
- void Resize (size_t num_threads, size_t num_instances, const Args&... args) {
118
- Resize (num_threads, num_instances);
94
+ void Resize (size_t num_instances, const Args&... args) {
95
+ Resize (num_instances);
119
96
Initialize<Kernel>(args...);
120
97
}
121
98
99
+
122
100
/* *
123
101
* @brief Populates the instance slots with instances of a given Kernel
124
102
*
@@ -132,9 +110,11 @@ class DLL_PUBLIC KernelManager {
132
110
}
133
111
134
112
/* *
135
- * @brief Clears kernel instances and scratchpads
113
+ * @brief Clears kernel instances
136
114
*/
137
- void Reset ();
115
+ void Reset () {
116
+ instances.clear ();
117
+ }
138
118
139
119
/* *
140
120
* @brief Gets or creates a Kernel instance
@@ -172,14 +152,6 @@ class DLL_PUBLIC KernelManager {
172
152
}
173
153
174
154
size_t NumInstances() const noexcept {
  // Number of kernel instance slots currently held by the manager.
  return instances.size();
}
175
- size_t NumThreads () const noexcept { return scratchpads.size (); }
176
-
177
- /* *
178
- * @brief Gets a scratchpad allocator assigned to a given thread.
179
- */
180
- ScratchpadAllocator &GetScratchpadAllocator (int thread_idx) {
181
- return scratchpads[thread_idx];
182
- }
183
155
184
156
/* *
185
157
* @brief Calls setup on specified kernel instance.
@@ -190,130 +162,44 @@ class DLL_PUBLIC KernelManager {
190
162
* * should contain valid CUDA stream for GPU kernels;
191
163
* @param in_args - pack of arguments (inputs, arguments) used in Kernel::Setup
192
164
* @return Reference to internally maintained copy of the kernel requirements.
193
- * @remarks The copies of KernelRequirements for each instance index are used for allocating
194
- * scratch memory. While the function returns non-const reference, please note
195
- * that decreasing scratch sizes calculated by Setup will result in undefined
196
- * behavior, including memory corruption or illegal access.
197
165
*/
198
166
template <typename Kernel, typename ... InArgs>
199
167
KernelRequirements &Setup (int instance_idx, KernelContext &context, InArgs &&...in_args) {
200
168
auto &inst = instances[instance_idx];
201
169
inst.requirements = inst.get <Kernel>().Setup (context, std::forward<InArgs>(in_args)...);
202
- for (size_t i = 0 ; i < max_scratch_sizes.size (); i++) {
203
- atomic_max (max_scratch_sizes[i], inst.requirements .scratch_sizes [i]);
204
- }
205
170
return inst.requirements ;
206
171
}
207
172
208
173
/* *
209
- * @brief Calls Run on specified kernel instance using Scratchpad for given thread.
210
- *
211
- * @param thread_idx - zero-based thread index
212
- * @param instance_idx - kernel instance index; typically corresponds
213
- * to sample index (for per-sample kernels) or minibatch index
214
- * @param context - context for the kernel
215
- * * should contain valid CUDA stream for GPU kernels;
216
- * * scratchpad pointer is overriden with a scratchpad
217
- * created for given thread_idx
218
- * @param out_in_args - pack of arguments (outputs, inputs, arguments) used in Kernel::Run
219
- */
220
- template <typename Kernel, typename ... OutInArgs>
221
- void Run (int thread_idx, int instance_idx, KernelContext &context, OutInArgs &&...out_in_args) {
222
- assert (instance_idx >= 0 &&
223
- static_cast <size_t >(instance_idx) < NumInstances () &&
224
- " Kernel instance index (instance_idx) out of range" );
225
- auto &inst = instances[instance_idx];
226
- DynamicScratchpad scratchpad ({}, AccessOrder (context.gpu .stream ));
227
- auto *old_scratchpad = context.scratchpad ;
228
- context.scratchpad = &scratchpad;
229
- inst.get <Kernel>().Run (context, std::forward<OutInArgs>(out_in_args)...);
230
- context.scratchpad = old_scratchpad;
231
- }
232
-
233
- /* *
234
- * @brief Calls Run on specified kernel instance using Scratchpad for given thread.
174
+ * @brief Calls Run on specified kernel instance
235
175
*
236
- * @param sa - scratchpad allocator; memory will be reserved in it to satisfy
237
- * instance's requirements
238
176
* @param instance_idx - kernel instance index; typically corresponds
239
177
* to sample index (for per-sample kernels) or minibatch index
240
178
* @param context - context for the kernel
241
179
* * should contain valid CUDA stream for GPU kernels;
242
- * * scratchpad pointer is overriden with a scratchpad
243
- * created from `sa`
180
+ * * if scratchpad pointer is null, a temporary dynamic scratchpad is
181
+ * created
244
182
* @param out_in_args - pack of arguments (outputs, inputs, arguments) used in Kernel::Run
245
183
*/
246
184
template <typename Kernel, typename ... OutInArgs>
247
- void Run (ScratchpadAllocator &sa,
248
- int instance_idx,
249
- KernelContext &context,
250
- OutInArgs &&...out_in_args) {
185
+ void Run (int instance_idx, KernelContext &context, OutInArgs &&...out_in_args) {
251
186
assert (instance_idx >= 0 &&
252
187
static_cast <size_t >(instance_idx) < NumInstances () &&
253
188
" Kernel instance index (instance_idx) out of range" );
254
189
auto &inst = instances[instance_idx];
255
- auto scratchpad = ReserveScratchpad (sa, inst.requirements .scratch_sizes );
256
- auto *old_scratchpad = context.scratchpad ;
257
- context.scratchpad = &scratchpad;
258
- inst.get <Kernel>().Run (context, std::forward<OutInArgs>(out_in_args)...);
259
- context.scratchpad = old_scratchpad;
260
- }
261
-
262
- /* *
263
- * @brief Makes sure ScratchpadAllocator can accommodate `sizes`
264
- *
265
- * @param sa - scratchpad allocator to reserve
266
- * @param sizes - requested minimum size
267
- *
268
- * The manager maintains a lifetime maximum of sizes requested.
269
- * If reallocation is necessary, it allocates `sizes` or that maximum
270
- * whichever is larger.
271
- */
272
- auto ReserveScratchpad (ScratchpadAllocator &sa, const ScratchSizes &sizes)->
273
- decltype(sa.GetScratchpad());
274
-
275
- /* *
276
- * @brief Calls ReserveScratchpad on ScratchpadAllocator associated with given thread_idx
277
- */
278
- inline auto ReserveScratchpad (int thread_idx, const ScratchSizes &sizes) {
279
- return ReserveScratchpad (GetScratchpadAllocator (thread_idx), sizes);
280
- }
281
-
282
- /* *
283
- * @brief Returns maximum scratchpad size seen so far
284
- */
285
- inline ScratchSizes MaxScratchSizes () const {
286
- ScratchSizes sizes;
287
- for (size_t i = 0 ; i < sizes.size (); i++) {
288
- sizes[i] = max_scratch_sizes[i];
190
+ if (!context.scratchpad ) {
191
+ DynamicScratchpad scratchpad ({}, AccessOrder (context.gpu .stream ));
192
+ auto *old_scratchpad = context.scratchpad ;
193
+ context.scratchpad = &scratchpad;
194
+ inst.get <Kernel>().Run (context, std::forward<OutInArgs>(out_in_args)...);
195
+ context.scratchpad = old_scratchpad;
196
+ } else {
197
+ inst.get <Kernel>().Run (context, std::forward<OutInArgs>(out_in_args)...);
289
198
}
290
- return sizes;
291
- }
292
-
293
- /* *
294
- * @brief Reserves scratchpad big enough to accommodate largest scratch area ever seen
295
- */
296
- inline auto ReserveMaxScratchpad (int thread_idx) {
297
- return ReserveScratchpad (thread_idx, MaxScratchSizes ());
298
- }
299
-
300
- /* *
301
- * @brief Sets a memory size hint for allocating scratchpad memory
302
- *
303
- * All calls to ScratchpadAllocator::Reserve followint this call will request at least
304
- * bytes memory for given allocation type.
305
- */
306
- template <typename MemoryKind>
307
- void SetMemoryHint (size_t bytes) {
308
- size_t alloc_idx = static_cast <size_t >(mm::kind2id_v<MemoryKind>);
309
- assert (alloc_idx < max_scratch_sizes.size ());
310
- atomic_max (max_scratch_sizes[alloc_idx], bytes);
311
199
}
312
200
313
201
private:
314
202
SmallVector<AnyKernelInstance, 1 > instances;
315
- SmallVector<ScratchpadAllocator, 1 > scratchpads;
316
- std::array<std::atomic_size_t , NumMemKinds> max_scratch_sizes{};
317
203
};
318
204
319
205
} // namespace kernels
0 commit comments