diff --git a/sycl/test-e2e/Regression/queue_submitted_kernels_oom.cpp b/sycl/test-e2e/Regression/queue_submitted_kernels_oom.cpp
new file mode 100644
index 0000000000000..55134415c07e5
--- /dev/null
+++ b/sycl/test-e2e/Regression/queue_submitted_kernels_oom.cpp
@@ -0,0 +1,153 @@
+// Tests for memory use of kernel submission. Memory use should not grow
+// unbounded even with thousands of kernel submissions.
+// Only intended for the new L0v2 adapter.
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// REQUIRES: linux && level_zero_v2_adapter
+
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <iostream>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+#include <sys/resource.h>
+#include <sys/time.h>
+
+static long getRusageKbs() {
+  struct rusage r_usage;
+  if (getrusage(RUSAGE_SELF, &r_usage) == 0) {
+    return r_usage.ru_maxrss;
+  }
+  return -1;
+}
+
+// There's some variability in memory usage on the various
+// platforms when running kernels.
+static constexpr long MarginKb = 400;
+
+static bool withinMargin(long base, long current) {
+  if (base < 0 || current < 0)
+    return false;
+  // Theoretically, memory use can shrink after e.g. wait()...
+  long diff = (current > base) ? (current - base) : (base - current);
+  return diff <= MarginKb;
+}
+
+static constexpr size_t UniqueKernels = 256;
+static constexpr size_t ConsecutiveDupSubmissions =
+    100000; // same kernel over and over
+static constexpr size_t CyclicSubmissions = 100000; // cycle over a small subset
+static constexpr size_t CyclicSubset = 16; // size of the cycled kernel subset
+static constexpr size_t AllKernelsSubmissions = 100000; // run all kernels
+
+template <int ID> struct KernelTag;
+
+template <int ID> static void submitIncrement(sycl::queue &Q, int *accum) {
+  Q.submit([&](sycl::handler &CGH) {
+    CGH.single_task<KernelTag<ID>>([=]() {
+      // atomic_ref to avoid data races while we spam submissions.
+      sycl::atomic_ref<int, sycl::memory_order::relaxed,
+                       sycl::memory_scope::device>
+          ref(accum[ID]);
+      ref.fetch_add(1);
+    });
+  });
+}
+
+using SubmitFn = void (*)(sycl::queue &, int *);
+
+template <std::size_t... Is>
+static auto makeFnTable(std::index_sequence<Is...>) {
+  return std::array<SubmitFn, sizeof...(Is)>{&submitIncrement<Is>...};
+}
+
+int main() {
+  bool rusageUnsupported = getRusageKbs() == -1;
+  if (rusageUnsupported) {
+    return 1; // can't collect memory statistics, no point in running the test.
+  }
+
+  sycl::queue Q;
+
+  int *accum = sycl::malloc_shared<int>(UniqueKernels, Q);
+  assert(accum && "USM alloc failed");
+  for (std::size_t i = 0; i < UniqueKernels; ++i)
+    accum[i] = 0;
+
+  std::vector<size_t> expected(UniqueKernels, 0);
+
+  auto fns = makeFnTable(std::make_index_sequence<UniqueKernels>{});
+
+  // Submit the same kernel over and over again. The submitted kernel
+  // vector shouldn't grow at all, since we do a lookback over
+  // a few previous kernels.
+  auto runDuplicates = [&]() {
+    for (size_t i = 0; i < ConsecutiveDupSubmissions; ++i) {
+      fns[0](Q, accum);
+      expected[0]++;
+    }
+  };
+
+  // Run a small subset of kernels in a loop. Likely the most realistic
+  // scenario. Should be mostly absorbed by the lookback duplicate search and,
+  // possibly, compaction.
+  auto runCyclical = [&]() {
+    for (size_t i = 0; i < CyclicSubmissions; ++i) {
+      size_t id = i % CyclicSubset;
+      fns[id](Q, accum);
+      expected[id]++;
+    }
+  };
+
+  // Run all kernels in a loop. Should dynamically adjust the
+  // threshold for submitted kernels.
+  auto runAll = [&]() {
+    for (size_t i = 0; i < AllKernelsSubmissions; ++i) {
+      size_t id = i % UniqueKernels;
+      fns[id](Q, accum);
+      expected[id]++;
+    }
+  };
+
+  runAll();
+  Q.wait(); // first run all the kernels, just to get all the caches warm.
+
+  long baseMemUsage = getRusageKbs();
+
+  // Run from a small kernel variety, to large, and back to small, to test
+  // dynamic threshold changes.
+  runDuplicates();
+  runCyclical();
+  runAll();
+
+  long afterRampup = getRusageKbs();
+
+  assert(withinMargin(baseMemUsage, afterRampup));
+
+  Q.wait(); // this clears the submitted kernels list, allowing the threshold
+            // to lower.
+  runAll();
+  runCyclical();
+  runDuplicates();
+
+  long afterRampdown = getRusageKbs();
+  assert(withinMargin(baseMemUsage, afterRampdown));
+
+  Q.wait(); // this clears the vector again, but memory usage should stay the
+            // same.
+  long afterCleanup = getRusageKbs();
+  assert(withinMargin(baseMemUsage, afterCleanup));
+
+  int ret = 0;
+  for (std::size_t i = 0; i < UniqueKernels; ++i) {
+    if (static_cast<size_t>(accum[i]) != expected[i]) {
+      ret = 1;
+      std::cout << "fail: " << accum[i] << " != " << expected[i] << "\n";
+    }
+  }
+
+  sycl::free(accum, Q);
+  return ret;
+}
diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp
index 94161a46edce8..89d694288e9b2 100644
--- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp
+++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp
@@ -1059,8 +1059,56 @@ ur_result_t ur_command_list_manager::appendNativeCommandExp(
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
 
+void ur_command_list_manager::compactSubmittedKernels() {
+  size_t beforeSize = submittedKernels.size();
+
+  std::sort(submittedKernels.begin(), submittedKernels.end());
+
+  // Remove all but one unique entry for each kernel. All removed entries
+  // need to have their refcounts decremented.
+  auto newEnd = std::unique(
+      submittedKernels.begin(), submittedKernels.end(), [](auto lhs, auto rhs) {
+        if (lhs == rhs) {
+          [[maybe_unused]] const bool lastEntry = rhs->RefCount.release();
+          assert(!lastEntry); // there should be at least one entry left.
+          return true;        // duplicate.
+        }
+        return false;
+      });
+
+  submittedKernels.erase(newEnd, submittedKernels.end());
+
+  // Adjust the compaction threshold.
+  size_t removed = beforeSize - submittedKernels.size();
+  size_t removedPct = beforeSize > 0 ? (removed * 100) / beforeSize : 0;
+  if (removedPct > 75) {
+    // We removed a lot of entries. Lower the threshold if possible.
+    compactionThreshold = std::max(
+        SUBMITTED_KERNELS_DEFAULT_THRESHOLD, compactionThreshold / 2);
+  } else if (removedPct < 10 &&
+             compactionThreshold < SUBMITTED_KERNELS_MAX_THRESHOLD) {
+    // Increase the threshold if we removed very few entries. This means
+    // there are many unique kernels, and we need to allow the vector to grow
+    // more.
+    compactionThreshold *= 2;
+  }
+}
+
 void ur_command_list_manager::recordSubmittedKernel(
     ur_kernel_handle_t hKernel) {
+  bool isDuplicate = std::any_of(
+      submittedKernels.end() -
+          std::min(SUBMITTED_KERNELS_DUPE_CHECK_DEPTH, submittedKernels.size()),
+      submittedKernels.end(), [hKernel](auto k) { return k == hKernel; });
+
+  if (isDuplicate) {
+    return;
+  }
+
+  if (submittedKernels.size() > compactionThreshold) {
+    compactSubmittedKernels();
+  }
+
   submittedKernels.push_back(hKernel);
   hKernel->RefCount.retain();
 }
diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp
index e9fa6ac978ef5..5131d53dccbc0 100644
--- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp
+++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp
@@ -45,6 +45,24 @@ struct wait_list_view {
   }
 };
 
+// When recording submitted kernels, we only care about unique kernels. It's
+// not important whether a kernel has been submitted to the queue just once or
+// dozens of times. The number of unique kernels should be fairly low. So, in
+// order to reduce the number of entries in the submitted kernels vector, we do
+// a lookback at the 4 previous entries (to try to stay within a cacheline) and
+// don't record a kernel that is already among them.
+static const size_t SUBMITTED_KERNELS_DUPE_CHECK_DEPTH = 4;
+
+// In scenarios where queue synchronization happens rarely, the submitted
+// kernels vector can grow unbounded. In order to avoid that, we go through the
+// entire vector, eliminating any duplicates.
+static const size_t SUBMITTED_KERNELS_DEFAULT_THRESHOLD = 128;
+
+// If we reach this many unique kernels, the application is probably doing
+// something incorrectly. The adapter will still function; compaction will just
+// happen more frequently.
+static const size_t SUBMITTED_KERNELS_MAX_THRESHOLD = 65536;
+
 struct ur_command_list_manager {
   ur_command_list_manager(ur_context_handle_t context,
                           ur_device_handle_t device,
@@ -254,6 +272,7 @@ struct ur_command_list_manager {
                            ur_command_t callerCommand);
 
   void recordSubmittedKernel(ur_kernel_handle_t hKernel);
+  void compactSubmittedKernels();
 
   ze_event_handle_t getSignalEvent(ur_event_handle_t hUserEvent,
                                    ur_command_t commandType);
@@ -299,6 +318,8 @@ struct ur_command_list_manager {
   v2::raii::ur_device_handle_t hDevice;
 
   std::vector<ur_kernel_handle_t> submittedKernels;
+  std::size_t compactionThreshold = SUBMITTED_KERNELS_DEFAULT_THRESHOLD;
+
   v2::raii::command_list_unique_handle zeCommandList;
   std::vector<ze_event_handle_t> waitList;
 };
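Note (not part of the patch): the bookkeeping above can be exercised in isolation. Below is a minimal, self-contained sketch of the same idea under simplifying assumptions: integer stand-ins replace kernel handles, ref-counting is omitted, and the names (SubmissionTracker, kDupeCheckDepth, kDefaultThreshold, kMaxThreshold) are illustrative only, not the adapter's real types or constants. It reproduces the lookback dedup, the sort+unique compaction, and the dynamic threshold adjustment.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

// Stand-ins for the adapter's constants; values mirror the patch.
static const std::size_t kDupeCheckDepth = 4;
static const std::size_t kDefaultThreshold = 128;
static const std::size_t kMaxThreshold = 65536;

struct SubmissionTracker {
  std::vector<int> submitted; // stand-in for kernel handles
  std::size_t threshold = kDefaultThreshold;

  void compact() {
    const std::size_t before = submitted.size();
    // Sort + unique removes all duplicate entries in one pass.
    std::sort(submitted.begin(), submitted.end());
    submitted.erase(std::unique(submitted.begin(), submitted.end()),
                    submitted.end());
    // Adjust the threshold based on how much the compaction recovered.
    const std::size_t removed = before - submitted.size();
    const std::size_t removedPct = before ? (removed * 100) / before : 0;
    if (removedPct > 75) {
      threshold = std::max(kDefaultThreshold, threshold / 2);
    } else if (removedPct < 10 && threshold < kMaxThreshold) {
      threshold *= 2;
    }
  }

  void record(int kernel) {
    // Cheap lookback over the last few entries catches back-to-back
    // submissions of the same kernel without scanning the whole vector.
    const std::size_t depth = std::min(kDupeCheckDepth, submitted.size());
    if (std::any_of(submitted.end() - depth, submitted.end(),
                    [kernel](int k) { return k == kernel; }))
      return;
    if (submitted.size() > threshold)
      compact();
    submitted.push_back(kernel);
  }
};

int main() {
  SubmissionTracker tracker;
  // Cycle over 16 "kernels" many times: the vector is periodically compacted
  // back to 16 entries, so it never grows far past the default threshold.
  for (int i = 0; i < 100000; ++i)
    tracker.record(i % 16);
  std::cout << "entries: " << tracker.submitted.size()
            << ", threshold: " << tracker.threshold << "\n";
  assert(tracker.submitted.size() <= tracker.threshold + 1);
  return 0;
}

Cycling over a 16-kernel subset in this sketch roughly mirrors the runCyclical phase of the e2e test above: the duplicate ratio stays high, so the threshold never needs to grow beyond its default.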