diff --git a/src/core/prefault.hh b/src/core/prefault.hh
index c84685e1954..a12f38a6c55 100644
--- a/src/core/prefault.hh
+++ b/src/core/prefault.hh
@@ -22,6 +22,7 @@
 #include
 #include
 
+#include <optional>
 #include
 #include
@@ -40,7 +41,7 @@ public:
     explicit memory_prefaulter(const resource::resources& res, memory::internal::numa_layout layout);
     ~memory_prefaulter();
 private:
-    void work(std::vector<memory::internal::memory_range>& ranges, size_t page_size);
+    void work(std::vector<memory::internal::memory_range>& ranges, size_t page_size, std::optional<size_t> huge_page_size_opt);
 };
 
diff --git a/src/core/smp.cc b/src/core/smp.cc
index c43b4919cfa..eb01d2718ac 100644
--- a/src/core/smp.cc
+++ b/src/core/smp.cc
@@ -25,6 +25,10 @@ module;
 #include
 #include
 #include
+#include
+#include
+#include
+#include
 
 #ifdef SEASTAR_MODULE
 module seastar;
@@ -35,6 +39,8 @@ module seastar;
 #include
 #include
 #include
+#include
+#include
 #include "prefault.hh"
 #endif
@@ -173,11 +179,32 @@ smp::setup_prefaulter(const seastar::resource::resources& res, seastar::memory::
 #endif
 }
 
+static
+std::optional<size_t>
+get_huge_page_size() {
+    auto meminfo_fd = file_desc::open("/proc/meminfo", O_RDONLY | O_CLOEXEC);
+    std::string meminfo;
+    char buf[4096];
+    while (auto size_opt = meminfo_fd.read(buf, sizeof(buf))) {
+        if (!*size_opt) {
+            break;
+        }
+        meminfo.append(buf, *size_opt);
+    }
+    static std::regex re(R"(Hugepagesize:\s*(\d+) kB)");
+    auto m = std::smatch{};
+    if (std::regex_search(meminfo, m, re)) {
+        return std::stoi(m[1]) * size_t(1024);
+    }
+    return std::nullopt;
+}
+
 internal::memory_prefaulter::memory_prefaulter(const resource::resources& res, memory::internal::numa_layout layout) {
     for (auto& range : layout.ranges) {
         _layout_by_node_id[range.numa_node_id].push_back(std::move(range));
     }
     auto page_size = getpagesize();
+    auto huge_page_size_opt = get_huge_page_size();
     for (auto& numa_node_id_and_ranges : _layout_by_node_id) {
         auto& numa_node_id = numa_node_id_and_ranges.first;
         auto& ranges = numa_node_id_and_ranges.second;
@@ -191,8 +218,8 @@ internal::memory_prefaulter::memory_prefaulter(const resource::resources& res, m
             }
             a.set(cpuset);
         }
-        _worker_threads.emplace_back(a, [this, &ranges, page_size] {
-            work(ranges, page_size);
+        _worker_threads.emplace_back(a, [this, &ranges, page_size, huge_page_size_opt] {
+            work(ranges, page_size, huge_page_size_opt);
         });
     }
 }
@@ -205,12 +232,33 @@ internal::memory_prefaulter::~memory_prefaulter() {
 }
 
 void
-internal::memory_prefaulter::work(std::vector<memory::internal::memory_range>& ranges, size_t page_size) {
+internal::memory_prefaulter::work(std::vector<memory::internal::memory_range>& ranges, size_t page_size,
+        std::optional<size_t> huge_page_size_opt) {
     sched_param param = { .sched_priority = 0 };
     // SCHED_IDLE doesn't work via thread attributes
     pthread_setschedparam(pthread_self(), SCHED_IDLE, &param);
     size_t current_range = 0;
-    const size_t batch_size = 512; // happens to match huge page size on x86, through not critical
+    const size_t batch_size = huge_page_size_opt.value_or(512*4096);
+    auto populate_memory_madvise = [works = true] (char* start, char* end) mutable {
+#ifdef MADV_POPULATE_WRITE
+        if (works) {
+            auto result = madvise(start, end - start, MADV_POPULATE_WRITE);
+            if (result == -1 && errno == EINVAL) {
+                // Unsupported by kernel
+                works = false;
+                return false;
+            }
+            // Ignore other errors. This is just an optimization anyway.
+#ifdef MADV_COLLAPSE
+            madvise(start, end - start, MADV_COLLAPSE);
+            // Also ignore problems with MADV_COLLAPSE, it's just an optimization.
+#endif
+            return true;
+        };
+#endif
+        (void)works; // suppress warning if block above is elided
+        return false;
+    };
     auto fault_in_memory = [] (char* p) {
         // Touch the page for write, but be sure not to modify anything
         // The compilers tend to optimize things, so prefer assembly
@@ -227,25 +275,29 @@ internal::memory_prefaulter::work(std::vector<memory::internal::memory_range>& r
         p1->fetch_or(0, std::memory_order_relaxed);
 #endif
     };
-    while (!_stop_request.load(std::memory_order_relaxed) && !ranges.empty()) {
-        auto& range = ranges[current_range];
-        // copy eveything into locals, or the optimizer will worry about them due
-        // to the cast below and not optimize anything
-        auto start = range.start;
-        auto end = range.end;
-        for (size_t i = 0; i < batch_size && start < end; ++i) {
+    auto populate_memory = [&] (char* start, char* end) {
+        if (populate_memory_madvise(start, end)) {
+            return;
+        }
+        while (start < end) {
             fault_in_memory(start);
             start += page_size;
         }
+    };
+    while (!_stop_request.load(std::memory_order_relaxed) && !ranges.empty()) {
+        auto& range = ranges[current_range];
+
+        auto batch_end = std::min(align_up(range.start + 1, batch_size), range.end);
+        populate_memory(range.start, batch_end);
+        range.start = batch_end;
+
         // An end-to-start scan for applications that manage two heaps that
         // grow towards each other.
-        for (size_t i = 0; i < batch_size && start < end; ++i) {
-            fault_in_memory(end - 1);
-            end -= page_size;
-        }
-        range.start = start;
-        range.end = end;
-        if (start >= end) {
+        auto batch_start = std::max(align_down(range.end - 1, batch_size), range.start);
+        populate_memory(batch_start, range.end);
+        range.end = batch_start;
+
+        if (range.start >= range.end) {
            ranges.erase(ranges.begin() + current_range);
            current_range = 0;
        }
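
For context: the patch speeds up prefaulting by asking the kernel to populate a whole batch at once with MADV_POPULATE_WRITE (falling back to touching pages one by one on kernels that reject it with EINVAL), and then hinting MADV_COLLAPSE so the populated range can be turned into transparent huge pages. Below is a minimal, self-contained sketch of that probe-and-fall-back pattern; the mmap()ed demo buffer, the populate_with_madvise() helper and the page-touch fallback are illustrative stand-ins, not Seastar APIs.

// Sketch only: probe MADV_POPULATE_WRITE once, remember whether the kernel
// supports it, and fall back to write-touching every page otherwise.
#include <sys/mman.h>
#include <unistd.h>
#include <cerrno>
#include <cstdio>

static bool populate_with_madvise(char* start, char* end, bool& works) {
#ifdef MADV_POPULATE_WRITE
    if (works) {
        if (madvise(start, end - start, MADV_POPULATE_WRITE) == -1 && errno == EINVAL) {
            works = false;                           // kernel too old; stop trying
            return false;
        }
        // Other errors are ignored: prefaulting is only an optimization.
#ifdef MADV_COLLAPSE
        madvise(start, end - start, MADV_COLLAPSE);  // best effort, errors ignored
#endif
        return true;
    }
#endif
    (void)works;
    return false;
}

int main() {
    const size_t len = 64 << 20;                     // 64 MiB anonymous mapping for the demo
    void* mem = mmap(nullptr, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (mem == MAP_FAILED) {
        return 1;
    }
    char* p = static_cast<char*>(mem);
    bool works = true;
    if (!populate_with_madvise(p, p + len, works)) {
        // Fallback, analogous to fault_in_memory(): write-touch one byte per
        // page without changing its value.
        const long page_size = sysconf(_SC_PAGESIZE);
        for (char* q = p; q < p + len; q += page_size) {
            __atomic_fetch_or(reinterpret_cast<unsigned char*>(q), 0, __ATOMIC_RELAXED);
        }
    }
    std::printf("populated %zu bytes\n", len);
    munmap(mem, len);
    return 0;
}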
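
The batch boundaries in the rewritten loop come from Seastar's align_up()/align_down() helpers; the "+ 1" and "- 1" offsets guarantee that the scan still advances by a full batch when range.start or range.end already falls on a batch boundary. A small sketch of that arithmetic, with free-standing align helpers standing in for the ones the patch uses, and made-up addresses for illustration:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Stand-ins for align_down()/align_up() (power-of-two alignment).
static uintptr_t align_down(uintptr_t v, uintptr_t a) { return v & ~(a - 1); }
static uintptr_t align_up(uintptr_t v, uintptr_t a) { return align_down(v + a - 1, a); }

int main() {
    const uintptr_t batch = 2u << 20;         // 2 MiB, e.g. the x86 huge page size
    uintptr_t start = 6 * batch;              // already batch-aligned
    uintptr_t end = 10 * batch + 4096;        // ends mid-batch

    // Front-to-back step: because of the "+ 1", an aligned start still maps to
    // the *next* boundary, so each iteration strips one full batch.
    uintptr_t batch_end = std::min(align_up(start + 1, batch), end);

    // Back-to-front step: the "- 1" peels the (possibly partial) last batch off the tail.
    uintptr_t batch_start = std::max(align_down(end - 1, batch), start);

    std::printf("front batch: [%#zx, %#zx)\n", (size_t)start, (size_t)batch_end);
    std::printf("back batch:  [%#zx, %#zx)\n", (size_t)batch_start, (size_t)end);
    return 0;
}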