
Commit bf16cc8

Dynamic & stream-aware scratchpad (#3667)
* Fix monotonic resource with 0 initial size.
* Add dynamic scratchpad with tests and benchmarks.
* Add fixed_order_memory_resource - a wrapper which exposes a streamless interface for stream-ordered resources.

Signed-off-by: Michał Zientkiewicz <mzient@gmail.com>
1 parent aff9c5b commit bf16cc8
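
The fixed_order_memory_resource file itself is not among the files shown below. As a rough, hypothetical sketch of the idea only (simplified stand-in types, not the actual DALI classes), such a wrapper captures the allocation and deallocation order up front and then exposes a plain, stream-less allocate/deallocate interface on top of a stream-ordered upstream resource:

// Illustration only - simplified stand-ins, not the real dali::mm types.
#include <cstddef>

struct access_order { void *stream = nullptr; };   // stand-in for AccessOrder

struct async_resource {                             // stand-in for a stream-ordered upstream
  virtual void *allocate_async(size_t bytes, size_t align, access_order order) = 0;
  virtual void deallocate_async(void *ptr, size_t bytes, size_t align, access_order order) = 0;
  virtual ~async_resource() = default;
};

// Remembers the orders at construction and exposes a stream-less interface,
// so a caller that knows nothing about streams can still use a stream-ordered resource.
class fixed_order_wrapper {
 public:
  fixed_order_wrapper() = default;
  fixed_order_wrapper(async_resource *upstream, access_order alloc_order,
                      access_order dealloc_order)
      : upstream_(upstream), alloc_order_(alloc_order), dealloc_order_(dealloc_order) {}

  void *allocate(size_t bytes, size_t align) {
    return upstream_->allocate_async(bytes, align, alloc_order_);
  }
  void deallocate(void *ptr, size_t bytes, size_t align) {
    upstream_->deallocate_async(ptr, bytes, align, dealloc_order_);
  }

 private:
  async_resource *upstream_ = nullptr;
  access_order alloc_order_{}, dealloc_order_{};
};

In the header below, the real mm::fixed_order_resource plays exactly this role: set_upstream_resource wraps a stream-ordered resource in an adapter and hands the adapter to a monotonic_memory_resource, so one monotonic front end works for every memory kind.
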

File tree

4 files changed: +438 -16 lines changed


dali/kernels/dynamic_scratchpad.h

+179
@@ -0,0 +1,179 @@
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef DALI_KERNELS_DYNAMIC_SCRATCHPAD_H_
#define DALI_KERNELS_DYNAMIC_SCRATCHPAD_H_

#include <array>
#include <cassert>
#include <tuple>
#include <type_traits>
#include <utility>
#include "dali/core/static_switch.h"
#include "dali/core/mm/fixed_order_resource.h"
#include "dali/core/mm/memory.h"
#include "dali/core/mm/memory_kind.h"
#include "dali/core/mm/monotonic_resource.h"
#include "dali/kernels/context.h"
#include "dali/kernels/kernel_req.h"

namespace dali {
namespace kernels {

namespace detail {

template <typename T, typename... Ts>
struct index_in_pack;

template <typename T, typename... Ts>
struct index_in_pack<T, T, Ts...> : std::integral_constant<int, 0> {};

template <typename T, typename U, typename... Ts>
struct index_in_pack<T, U, Ts...> :
    std::integral_constant<int, index_in_pack<T, Ts...>::value + 1> {};

/**
 * @brief Implements upstream handling and ordered wrappers.
 */
template <typename... Kinds>
class DynamicScratchpadImplT {
 protected:
  template <typename Kind>
  void set_upstream_resource(mm::memory_resource<Kind> *rsrc) {
    resource<Kind>() = mm::monotonic_memory_resource<Kind>(rsrc, initial_size<Kind>());
  }

  template <typename Kind>
  void set_upstream_resource(mm::async_memory_resource<Kind> *rsrc,
                             AccessOrder alloc_order,
                             AccessOrder dealloc_order = {}) {
    static_assert(!std::is_same<Kind, mm::memory_kind::host>::value,
                  "Cannot use a stream-ordered resource for plain host memory");
    adapter<Kind>() = { rsrc, alloc_order, dealloc_order };
    set_upstream_resource<Kind>(&adapter<Kind>());
  }

  template <typename Kind>
  size_t &initial_size() {
    return initial_sizes_[index_in_pack<Kind, Kinds...>::value];
  }

  template <typename Kind>
  size_t initial_size() const {
    return initial_sizes_[index_in_pack<Kind, Kinds...>::value];
  }

  template <typename Kind>
  mm::memory_resource<Kind> *get_upstream() const {
    return std::get<mm::monotonic_memory_resource<Kind>>(resources_).get_upstream();
  }

  template <typename Kind>
  auto &adapter() {
    return std::get<mm::fixed_order_resource<Kind>>(adapters_);
  }

  template <typename Kind>
  auto &adapter() const {
    return std::get<mm::fixed_order_resource<Kind>>(adapters_);
  }

  template <typename Kind>
  auto &resource() {
    return std::get<mm::monotonic_memory_resource<Kind>>(resources_);
  }

  template <typename Kind>
  auto &resource() const {
    return std::get<mm::monotonic_memory_resource<Kind>>(resources_);
  }

  std::tuple<mm::fixed_order_resource<Kinds>...> adapters_;
  std::tuple<mm::monotonic_memory_resource<Kinds>...> resources_;
  std::array<size_t, sizeof...(Kinds)> initial_sizes_ = {};
};

using DynamicScratchpadImpl = DynamicScratchpadImplT<
    mm::memory_kind::host,
    mm::memory_kind::pinned,
    mm::memory_kind::device,
    mm::memory_kind::managed>;

}  // namespace detail

class DynamicScratchpad
    : public Scratchpad
    , private detail::DynamicScratchpadImpl {
 public:
  /**
   * @brief Constructs a dynamically allocated scratchpad
   *
   * @param initial_sizes         Sizes, in bytes, of the initial buffers. Note that these
   *                              buffers are allocated lazily, so nothing is allocated if
   *                              there's no request for memory of any given kind.
   * @param device_order          Allocation and deallocation order for device memory.
   * @param pinned_dealloc_order  Deallocation order for pinned memory. Allocation is always
   *                              host-ordered. If not set, device_order is used.
   * @param managed_dealloc_order Deallocation order for managed memory. Allocation is always
   *                              host-ordered. If not set, device_order is used.
   */
  explicit DynamicScratchpad(scratch_sizes_t initial_sizes = {},
                             AccessOrder device_order = cudaStream_t(0),
                             AccessOrder pinned_dealloc_order = {},
                             AccessOrder managed_dealloc_order = {}) {
    initial_sizes_ = initial_sizes;
    for (auto &s : initial_sizes_) {
      if (s == 0)
        s = 4096;
    }
    if (!pinned_dealloc_order.has_value())
      pinned_dealloc_order = device_order;
    if (!managed_dealloc_order.has_value())
      managed_dealloc_order = device_order;

    set_upstream_resource<mm::memory_kind::host>(mm::GetDefaultResource<mm::memory_kind::host>());

    set_upstream_resource<mm::memory_kind::pinned>(
        mm::GetDefaultResource<mm::memory_kind::pinned>(),
        AccessOrder::host(),
        pinned_dealloc_order);

    set_upstream_resource<mm::memory_kind::device>(
        mm::GetDefaultResource<mm::memory_kind::device>(),
        device_order);

    set_upstream_resource<mm::memory_kind::managed>(
        mm::GetDefaultResource<mm::memory_kind::managed>(),
        AccessOrder::host(),
        managed_dealloc_order);
  }

  virtual void *Alloc(mm::memory_kind_id kind_id, size_t bytes, size_t alignment) {
    void *ret = nullptr;
    TYPE_SWITCH(kind_id, mm::kind2id, Kind,
        (mm::memory_kind::host,
         mm::memory_kind::pinned,
         mm::memory_kind::device,
         mm::memory_kind::managed),
        (ret = resource<Kind>().allocate(bytes, alignment)),
        (assert(!"Incorrect memory kind id");));
    return ret;
  }
};

}  // namespace kernels
}  // namespace dali

#endif  // DALI_KERNELS_DYNAMIC_SCRATCHPAD_H_
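
For context, a minimal usage sketch (hypothetical, not part of the commit): it assumes the headers from the diff above plus CUDA, the ExampleCopy function is made up for illustration, and only APIs visible in this commit are used (the constructor, and the Allocate helper exercised by the test below).

// Hypothetical illustration of how a kernel might use DynamicScratchpad.
void ExampleCopy(const char *host_src, char *device_dst, size_t n, cudaStream_t stream) {
  using namespace dali;  // AccessOrder, mm::, kernels::; CUDA_CALL is DALI's error-check macro

  // Allocations are lazy; each memory kind gets a monotonic buffer of at least 4096 bytes.
  kernels::DynamicScratchpad scratch({}, AccessOrder(stream));

  // Pinned staging buffer: allocated in host order, released in stream order, so the
  // backing memory is not reused while the asynchronous copy may still be running.
  char *staging = scratch.Allocate<mm::memory_kind::pinned, char>(n);
  std::memcpy(staging, host_src, n);
  CUDA_CALL(cudaMemcpyAsync(device_dst, staging, n, cudaMemcpyHostToDevice, stream));

  // Device-side workspace, allocated and released in stream order.
  char *workspace = scratch.Allocate<mm::memory_kind::device, char>(n);
  (void)workspace;  // ... launch kernels that use `workspace` on `stream` ...
}  // ~DynamicScratchpad releases all blocks in the orders chosen at construction
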
+154
@@ -0,0 +1,154 @@
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "dali/kernels/dynamic_scratchpad.h"  // NOLINT
#include <gtest/gtest.h>
#include <algorithm>
#include <iostream>
#include <chrono>
#include <cstring>
#include <numeric>
#include <random>
#include <string>
#include <vector>
#include "dali/core/cuda_utils.h"
#include "dali/core/cuda_stream_pool.h"
#include "dali/core/mm/memory.h"

namespace dali {
namespace kernels {
namespace test {

/**
 * @brief Tests basic dynamic scratchpad functioning
 *
 * This test checks that:
 * - the memory is usable and accessible on the right backend
 * - the pinned memory block is released in stream order, which prevents
 *   immediate reuse if the stream is still running
 * - it makes multiple attempts to catch the stream still running
 */
TEST(DynamicScratchpad, BasicTest) {
  const int N = 64 << 10;  // 64 KiB

  std::vector<char> in(N);
  for (int i = 0; i < N; i++)
    in[i] = i + 42;  // so it doesn't start or end with 0

  auto stream = CUDAStreamPool::instance().Get();
  auto dev = mm::alloc_raw_unique<char, mm::memory_kind::device>(N);
  int max_attempts = 1000;
  bool was_running = false;
  for (int attempt = 0; attempt < max_attempts; attempt++) {
    char *pinned;
    {
      DynamicScratchpad scratch({}, AccessOrder(stream));
      pinned = scratch.Allocate<mm::memory_kind::pinned, char>(N);
      memcpy(pinned, in.data(), N);
      CUDA_CALL(cudaMemcpyAsync(dev.get(), pinned, N, cudaMemcpyHostToDevice, stream));
    }
    auto out = mm::alloc_raw_unique<char, mm::memory_kind::pinned>(N);
    bool running = false;
    if (was_running) {
      CUDA_CALL(cudaStreamSynchronize(stream));
    } else {
      running = cudaStreamQuery(stream) == cudaErrorNotReady;
      if (running)
        was_running = true;
    }
    ASSERT_TRUE(out.get() + N < pinned || out.get() >= pinned + N || !running);
    CUDA_CALL(cudaMemcpyAsync(out.get(), dev.get(), N, cudaMemcpyDeviceToHost, stream));
    CUDA_CALL(cudaStreamSynchronize(stream));
    ASSERT_EQ(memcmp(in.data(), out.get(), N), 0);
    if (was_running && !running)
      break;
  }
  if (!was_running)
    std::cerr << "Warning: Test incomplete - the stream was never caught still running"
              << std::endl;
}

inline void ProcessResults(std::vector<double> &times, const std::string &header) {
  std::sort(times.begin(), times.end());
  double sum = std::accumulate(times.begin(), times.end(), 0.0);
  auto b98 = times.begin() + times.size()/100;
  auto e98 = times.end() - times.size()/100;
  double sum98 = std::accumulate(b98, e98, 0.0);
  std::cout << header << "\n"
            << "Median time:            " << times[times.size()/2] << " ns\n"
            << "90th percentile:        " << times[times.size()*90/100] << " ns\n"
            << "99th percentile:        " << times[times.size()*99/100] << " ns\n"
            << "Mean time:              " << sum/times.size() << " ns\n"
            << "Mean time (middle 98%): " << sum98/(e98-b98) << " ns\n";
}

TEST(DynamicScratchpad, Perf) {
  std::poisson_distribution<> size_dist(1024);  // 1 KiB average
  int max_size = 64 << 20;  // 64 MiB max
  std::uniform_int_distribution<> num_dist(1, 100);

  std::mt19937_64 rng(1234);

  auto stream1 = CUDAStreamPool::instance().Get();
  auto stream2 = CUDAStreamPool::instance().Get();
  cudaStream_t streams[] = { stream1, stream2 };

  int max_attempts = 100000;

  const int nkinds = static_cast<int>(mm::memory_kind_id::count);
  std::vector<double> alloc_times[nkinds];
  std::vector<double> destroy_times;
  for (auto &v : alloc_times)
    v.reserve(max_attempts*100);
  destroy_times.reserve(max_attempts);

  for (int attempt = 0; attempt < max_attempts; attempt++) {
    auto s = streams[attempt % 2];
    std::aligned_storage_t<sizeof(DynamicScratchpad), alignof(DynamicScratchpad)> scratch_placement;
    auto *scratch = new(&scratch_placement) DynamicScratchpad({}, AccessOrder(s));
    for (int k = 0; k < nkinds; k++) {
      auto kind = static_cast<mm::memory_kind_id>(k);
      if (kind == mm::memory_kind_id::managed)
        continue;
      int n = num_dist(rng);
      for (int i = 0; i < n; i++) {
        size_t size = std::min(size_dist(rng), max_size);
        auto s = std::chrono::high_resolution_clock::now();
        scratch->Alloc(kind, size, alignof(std::max_align_t));
        auto e = std::chrono::high_resolution_clock::now();
        alloc_times[k].push_back((e-s).count());
      }
    }
    {
      auto s = std::chrono::high_resolution_clock::now();
      scratch->DynamicScratchpad::~DynamicScratchpad();
      auto e = std::chrono::high_resolution_clock::now();
      destroy_times.push_back((e-s).count());
    }
  }

  const char *names[] = { "host", "pinned", "device", "managed" };

  for (int k = 0; k < nkinds; k++) {
    if (static_cast<mm::memory_kind_id>(k) == mm::memory_kind_id::managed)
      continue;
    ProcessResults(alloc_times[k],
                   make_string("Allocation performance for ", names[k], " memory"));
  }

  ProcessResults(destroy_times, "Scratchpad destruction time");
}

}  // namespace test
}  // namespace kernels
}  // namespace dali
