Commit fbd849e

[L0v2] add submitted kernel vector compaction
L0v2 avoids internally tracking each kernel submission through an event for lifetime management. Instead, when a kernel is submitted to the queue, its handle is added to a vector, to be removed at the next queue synchronization point, urQueueFinish(). This is much more efficient than event-based tracking, since it avoids creating and storing an event per submission. However, if the application never synchronizes the queue, this vector of submitted kernels grows unbounded. This patch avoids that problem by dynamically compacting the submitted kernel vector at set intervals, deduplicating identical kernel handles. The more unique kernels an application uses, the larger the vector is allowed to grow.
1 parent 8c614ad commit fbd849e
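
To illustrate the scheme the commit message describes, here is a minimal, self-contained sketch. The names (SubmittedKernelTracker, record, compact) are hypothetical stand-ins, plain ints replace ur_kernel_handle_t, and the refcount bookkeeping the real patch performs on deduplicated handles is omitted:

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

struct SubmittedKernelTracker {
  std::vector<int> submitted;  // stands in for the submitted kernel handles
  std::size_t threshold = 128; // SUBMITTED_KERNELS_DEFAULT_THRESHOLD

  void record(int kernel) {
    // Cheap duplicate check over the last few entries catches back-to-back
    // submissions of the same kernel.
    std::size_t depth = std::min<std::size_t>(4, submitted.size());
    if (std::find(submitted.end() - depth, submitted.end(), kernel) !=
        submitted.end())
      return;
    // Periodically deduplicate the whole vector so it stays bounded even if
    // the queue is never synchronized.
    if (submitted.size() > threshold)
      compact();
    submitted.push_back(kernel);
  }

  void compact() {
    std::size_t before = submitted.size(); // > 0, since size() > threshold
    std::sort(submitted.begin(), submitted.end());
    submitted.erase(std::unique(submitted.begin(), submitted.end()),
                    submitted.end());
    // Adapt the threshold: shrink it when most entries were duplicates, grow
    // it (up to a cap) when nearly everything was unique.
    std::size_t removedPct = (before - submitted.size()) * 100 / before;
    if (removedPct > 75)
      threshold = std::max<std::size_t>(128, threshold / 2);
    else if (removedPct < 10 && threshold < 65536)
      threshold *= 2;
  }
};

int main() {
  SubmittedKernelTracker t;
  for (int i = 0; i < 100000; ++i)
    t.record(i % 16); // cycle over a small kernel subset
  // The vector never grows past threshold + 1 entries, no matter how many
  // submissions happen without a synchronization point.
  assert(t.submitted.size() <= t.threshold + 1);
  return 0;
}

The actual adapter code in the diff below additionally releases a kernel refcount for every duplicate dropped during compaction, since each recorded handle holds a reference.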

3 files changed: +222 -0 lines changed

Lines changed: 153 additions & 0 deletions
@@ -0,0 +1,153 @@
// Tests for memory use of kernel submission. Should not grow
// unbounded even with thousands of kernel submissions.
// Only intended for the new L0v2 adapter.
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
// REQUIRES: linux && level_zero_v2_adapter

#include <array>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <sys/resource.h>
#include <thread>
#include <vector>

#include <sycl/atomic_ref.hpp>
#include <sycl/detail/core.hpp>
#include <sycl/usm.hpp>

static long getRusageKbs() {
  struct rusage r_usage;
  if (getrusage(RUSAGE_SELF, &r_usage) == 0) {
    // On Linux, ru_maxrss is reported in kilobytes.
    return r_usage.ru_maxrss;
  }
  return -1;
}

// There's some variability in memory usage on the various
// platforms when running kernels.
static constexpr long MarginKb = 400;

static bool withinMargin(long base, long current) {
  if (base < 0 || current < 0)
    return false;
  // Theoretically, memory use can shrink after e.g. wait()...
  long diff = (current > base) ? (current - base) : (base - current);
  return diff <= MarginKb;
}

static constexpr size_t UniqueKernels = 256;
static constexpr size_t ConsecutiveDupSubmissions =
    100000; // same kernel over and over
static constexpr size_t CyclicSubmissions = 100000; // cycle over small subset
static constexpr size_t CyclicSubset = 16;          // cycled kernel subset
static constexpr size_t AllKernelsSubmissions = 100000; // run all kernels

template <size_t ID> struct KernelTag;

template <size_t ID> static void submitIncrement(sycl::queue &Q, int *accum) {
  Q.submit([&](sycl::handler &CGH) {
    CGH.single_task<KernelTag<ID>>([=]() {
      // atomic_ref to avoid data races while we spam submissions.
      sycl::atomic_ref<int, sycl::memory_order::relaxed,
                       sycl::memory_scope::device>
          ref(accum[ID]);
      ref.fetch_add(1);
    });
  });
}

using SubmitFn = void (*)(sycl::queue &, int *);

template <std::size_t... Is>
static auto makeFnTable(std::index_sequence<Is...>) {
  return std::array<SubmitFn, UniqueKernels>{&submitIncrement<Is>...};
}

int main() {
  bool rusageUnsupported = getRusageKbs() == -1;
  if (rusageUnsupported) {
    return 1; // can't collect mem statistics, no point in running the test.
  }

  sycl::queue Q;

  int *accum = sycl::malloc_shared<int>(UniqueKernels, Q);
  assert(accum && "USM alloc failed");
  for (std::size_t i = 0; i < UniqueKernels; ++i)
    accum[i] = 0;

  std::vector<std::size_t> expected(UniqueKernels, 0);

  auto fns = makeFnTable(std::make_index_sequence<UniqueKernels>{});

  // Submit the same kernel over and over again. The submitted kernel
  // vector shouldn't grow at all, since we do a lookback over
  // a few previous kernels.
  auto runDuplicates = [&]() {
    for (size_t i = 0; i < ConsecutiveDupSubmissions; ++i) {
      fns[0](Q, accum);
      expected[0]++;
    }
  };

  // Run a small subset of kernels in a loop. Likely the most realistic
  // scenario. Should be mostly absorbed by the lookback duplicate search
  // and, possibly, compaction.
  auto runCyclical = [&]() {
    for (size_t i = 0; i < CyclicSubmissions; ++i) {
      size_t id = i % CyclicSubset;
      fns[id](Q, accum);
      expected[id]++;
    }
  };

  // Run all kernels in a loop. Should dynamically adjust the
  // threshold for submitted kernels.
  auto runAll = [&]() {
    for (size_t i = 0; i < AllKernelsSubmissions; ++i) {
      size_t id = i % UniqueKernels;
      fns[id](Q, accum);
      expected[id]++;
    }
  };

  runAll();
  Q.wait(); // first run all the kernels, just to get all the caches warm.

  long baseMemUsage = getRusageKbs();

  // Run from a small kernel variety to a large one and back, to test
  // dynamic threshold changes.
  runDuplicates();
  runCyclical();
  runAll();

  long afterRampup = getRusageKbs();

  assert(withinMargin(baseMemUsage, afterRampup));

  Q.wait(); // this clears the submitted kernels list, allowing the threshold
            // to lower.
  runAll();
  runCyclical();
  runDuplicates();

  long afterRampdown = getRusageKbs();
  assert(withinMargin(baseMemUsage, afterRampdown));

  Q.wait(); // this clears the vector again, but memory usage should stay flat.
  long afterCleanup = getRusageKbs();
  assert(withinMargin(baseMemUsage, afterCleanup));

  int ret = 0;
  for (std::size_t i = 0; i < UniqueKernels; ++i) {
    if (static_cast<std::size_t>(accum[i]) != expected[i]) {
      ret = 1;
      std::cout << "fail: " << accum[i] << " != " << expected[i] << "\n";
    }
  }

  sycl::free(accum, Q);
  return ret;
}

unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp

Lines changed: 48 additions & 0 deletions
@@ -1059,8 +1059,56 @@ ur_result_t ur_command_list_manager::appendNativeCommandExp(
  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}

void ur_command_list_manager::compactSubmittedKernels() {
  size_t beforeSize = submittedKernels.size();

  std::sort(submittedKernels.begin(), submittedKernels.end());

  // Remove all but one unique entry for each kernel. All removed entries
  // need to have their refcounts decremented.
  auto newEnd = std::unique(
      submittedKernels.begin(), submittedKernels.end(), [](auto lhs, auto rhs) {
        if (lhs == rhs) {
          [[maybe_unused]] const bool lastEntry = rhs->RefCount.release();
          assert(!lastEntry); // there should be at least one entry left.
          return true;        // duplicate.
        }
        return false;
      });

  submittedKernels.erase(newEnd, submittedKernels.end());

  // Adjust the compaction threshold.
  size_t removed = beforeSize - submittedKernels.size();
  size_t removedPct = beforeSize > 0 ? (removed * 100) / beforeSize : 0;
  if (removedPct > 75) {
    // We removed a lot of entries. Lower the threshold if possible.
    compactionThreshold = std::max<std::size_t>(
        SUBMITTED_KERNELS_DEFAULT_THRESHOLD, compactionThreshold / 2);
  } else if (removedPct < 10 &&
             compactionThreshold < SUBMITTED_KERNELS_MAX_THRESHOLD) {
    // Increase the threshold if we removed very few entries. This means
    // there are many unique kernels, and we need to allow the vector to
    // grow more.
    compactionThreshold *= 2;
  }
}

void ur_command_list_manager::recordSubmittedKernel(
    ur_kernel_handle_t hKernel) {
  bool isDuplicate = std::any_of(
      submittedKernels.end() -
          std::min(SUBMITTED_KERNELS_DUPE_CHECK_DEPTH, submittedKernels.size()),
      submittedKernels.end(), [hKernel](auto k) { return k == hKernel; });

  if (isDuplicate) {
    return;
  }

  if (submittedKernels.size() > compactionThreshold) {
    compactSubmittedKernels();
  }

  submittedKernels.push_back(hKernel);
  hKernel->RefCount.retain();
}
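
To make the threshold adjustment above concrete: with the default threshold of 128, compaction first runs when the vector holds 129 entries. If the workload cycles over 16 unique kernels, 113 of those 129 entries are duplicates (about 87%, above the 75% mark), so the threshold halves toward (and here stays at) the 128 default. If instead all 129 entries are unique kernels, nothing is removed (0%, below the 10% mark) and the threshold doubles to 256, letting the vector grow further before the next compaction.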

unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp

Lines changed: 21 additions & 0 deletions
@@ -45,6 +45,24 @@ struct wait_list_view {
  }
};

// When recording submitted kernels, we only care about unique kernels. It's
// not important whether the kernel has been submitted to the queue just once
// or dozens of times. The number of unique kernels should be fairly low.
// So, in order to reduce the number of entries in the submitted kernels
// vector, we do a lookback at the 4 previous entries (to try to stay within a
// cacheline), and don't record a new kernel if it is already there.
static const size_t SUBMITTED_KERNELS_DUPE_CHECK_DEPTH = 4;

// In scenarios where queue synchronization happens rarely, the submitted
// kernel vector can grow unbounded. In order to avoid that, we go through the
// entire vector, eliminating any duplicates.
static const size_t SUBMITTED_KERNELS_DEFAULT_THRESHOLD = 128;

// If we reach this many unique kernels, the application is probably doing
// something incorrectly. The adapter will still function, just that compaction
// will happen more frequently.
static const size_t SUBMITTED_KERNELS_MAX_THRESHOLD = 65536;

struct ur_command_list_manager {
  ur_command_list_manager(ur_context_handle_t context,
                          ur_device_handle_t device,
@@ -254,6 +272,7 @@ struct ur_command_list_manager {
                          ur_command_t callerCommand);

  void recordSubmittedKernel(ur_kernel_handle_t hKernel);
  void compactSubmittedKernels();

  ze_event_handle_t getSignalEvent(ur_event_handle_t hUserEvent,
                                   ur_command_t commandType);
@@ -299,6 +318,8 @@ struct ur_command_list_manager {
  v2::raii::ur_device_handle_t hDevice;

  std::vector<ur_kernel_handle_t> submittedKernels;
  std::size_t compactionThreshold = SUBMITTED_KERNELS_DEFAULT_THRESHOLD;

  v2::raii::command_list_unique_handle zeCommandList;
  std::vector<ze_event_handle_t> waitList;
};
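
A note on the constants above: because the threshold moves only by factors of two between SUBMITTED_KERNELS_DEFAULT_THRESHOLD (128) and SUBMITTED_KERNELS_MAX_THRESHOLD (65536 = 128 x 2^9), at most nine consecutive low-removal compactions are needed to ramp from the default to the cap. Past the cap the threshold stops growing, so a workload with more than 65536 live unique kernels triggers a full sort-and-unique pass on nearly every new unique submission between synchronization points, which is the "will still function, just that compaction will happen more frequently" behavior the comment calls out.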
