From 329ea38b8cd5e71f518afdcfafd46955fdd6ed13 Mon Sep 17 00:00:00 2001
From: duke
Date: Fri, 25 Aug 2023 12:33:31 +0000
Subject: [PATCH] Backport 84b325b844c08809448a9c073a11443d9e3c3f8e

---
 src/hotspot/os/linux/globals_linux.hpp        |   9 +
 src/hotspot/os/linux/os_linux.cpp             |  58 ++++-
 .../os/THPsInThreadStackPreventionTest.java   | 241 ++++++++++++++++++
 3 files changed, 301 insertions(+), 7 deletions(-)
 create mode 100644 test/hotspot/jtreg/runtime/os/THPsInThreadStackPreventionTest.java

diff --git a/src/hotspot/os/linux/globals_linux.hpp b/src/hotspot/os/linux/globals_linux.hpp
index 91c14b207c8..a4201765eea 100644
--- a/src/hotspot/os/linux/globals_linux.hpp
+++ b/src/hotspot/os/linux/globals_linux.hpp
@@ -89,6 +89,15 @@
          "to disable both the override and the printouts."            \
          "See prctl(PR_SET_TIMERSLACK) for more info.")               \
                                                                       \
+  product(bool, DisableTHPStackMitigation, false, DIAGNOSTIC,         \
+          "If THPs are unconditionally enabled on the system (mode "  \
+          "\"always\"), the JVM will prevent THP from forming in "    \
+          "thread stacks. This switch disables that mitigation and "  \
+          "allows THPs to form in thread stacks.")                    \
+                                                                      \
+  develop(bool, DelayThreadStartALot, false,                          \
+          "Artificially delay thread starts randomly for testing.")   \
+                                                                      \
 // end of RUNTIME_OS_FLAGS

diff --git a/src/hotspot/os/linux/os_linux.cpp b/src/hotspot/os/linux/os_linux.cpp
index 1ea989281c2..9037e54fe61 100644
--- a/src/hotspot/os/linux/os_linux.cpp
+++ b/src/hotspot/os/linux/os_linux.cpp
@@ -775,6 +775,10 @@ static void *thread_native_entry(Thread *thread) {

   assert(osthread->pthread_id() != 0, "pthread_id was not set as expected");

+  if (DelayThreadStartALot) {
+    os::naked_short_sleep(100);
+  }
+
   // call one more level start routine
   thread->call_run();

@@ -911,6 +915,7 @@ bool os::create_thread(Thread* thread, ThreadType thr_type,
   // Calculate stack size if it's not specified by caller.
   size_t stack_size = os::Posix::get_initial_stack_size(thr_type, req_stack_size);
   size_t guard_size = os::Linux::default_guard_size(thr_type);
+
   // Configure glibc guard page. Must happen before calling
   // get_static_tls_area_size(), which uses the guard_size.
   pthread_attr_setguardsize(&attr, guard_size);
@@ -931,13 +936,16 @@ bool os::create_thread(Thread* thread, ThreadType thr_type,
   }
   assert(is_aligned(stack_size, os::vm_page_size()), "stack_size not aligned");

-  // Add an additional page to the stack size to reduce its chances of getting large page aligned
-  // so that the stack does not get backed by a transparent huge page.
-  size_t default_large_page_size = HugePages::default_static_hugepage_size();
-  if (default_large_page_size != 0 &&
-      stack_size >= default_large_page_size &&
-      is_aligned(stack_size, default_large_page_size)) {
-    stack_size += os::vm_page_size();
+  if (!DisableTHPStackMitigation) {
+    // In addition to the glibc guard page that prevents inter-thread-stack hugepage
+    // coalescing (see comment in os::Linux::default_guard_size()), we also make
+    // sure the stack size itself is not huge-page-size aligned; that makes it much
+    // more likely for thread stack boundaries to be unaligned as well and hence
+    // protects thread stacks from being targeted by khugepaged.
+    if (HugePages::thp_pagesize() > 0 &&
+        is_aligned(stack_size, HugePages::thp_pagesize())) {
+      stack_size += os::vm_page_size();
+    }
   }

   int status = pthread_attr_setstacksize(&attr, stack_size);
@@ -3077,6 +3085,27 @@ bool os::Linux::libnuma_init() {
 }

 size_t os::Linux::default_guard_size(os::ThreadType thr_type) {
+
+  if (!DisableTHPStackMitigation) {
+    // If THPs are unconditionally enabled, the following scenario can lead to huge RSS:
+    // - parent thread spawns, in quick succession, multiple child threads
+    // - child threads are slow to start
+    // - thread stacks of future child threads are adjacent and get merged into one large VMA
+    //   by the kernel, and subsequently transformed into huge pages by khugepaged
+    // - child threads come up, place JVM guard pages, and thereby splinter the large VMA and
+    //   the huge pages into many (still paged-in) small pages.
+    // The result of that sequence is thread stacks that are fully paged in even though the
+    // threads have not even started yet.
+    // We prevent that by letting glibc allocate a guard page, which causes a VMA with different
+    // permission bits to separate two adjacent thread stacks and therefore prevents the stacks
+    // from being merged into one VMA.
+    //
+    // Yes, this means we have two guard sections - the glibc one and the JVM one - per thread. But the
+    // cost of that one extra protected page is dwarfed by the large win in performance and memory
+    // that avoiding interference by khugepaged buys us.
+    return os::vm_page_size();
+  }
+
   // Creating guard page is very expensive. Java thread has HotSpot
   // guard pages, only enable glibc guard page for non-Java threads.
   // (Remember: compiler thread is a Java thread, too!)
@@ -3740,6 +3769,21 @@ void os::large_page_init() {
   // Query OS information first.
   HugePages::initialize();

+  // If THPs are unconditionally enabled (THP mode "always"), khugepaged may attempt to
+  // coalesce small pages in thread stacks to huge pages. That costs a lot of memory and
+  // is usually unwanted for thread stacks. Therefore we attempt to prevent THP formation in
+  // thread stacks unless the user explicitly allows THPs in thread stacks by specifying
+  // -XX:+DisableTHPStackMitigation.
+  if (HugePages::thp_mode() == THPMode::always) {
+    if (DisableTHPStackMitigation) {
+      log_info(pagesize)("JVM will *not* prevent THPs in thread stacks. This may cause high RSS.");
+    } else {
+      log_info(pagesize)("JVM will attempt to prevent THPs in thread stacks.");
+    }
+  } else {
+    FLAG_SET_ERGO(DisableTHPStackMitigation, true); // Mitigation not needed
+  }
+
   // 1) Handle the case where we do not want to use huge pages
   if (!UseLargePages &&
       !UseTransparentHugePages &&
diff --git a/test/hotspot/jtreg/runtime/os/THPsInThreadStackPreventionTest.java b/test/hotspot/jtreg/runtime/os/THPsInThreadStackPreventionTest.java
new file mode 100644
index 00000000000..f16c5d66311
--- /dev/null
+++ b/test/hotspot/jtreg/runtime/os/THPsInThreadStackPreventionTest.java
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2023, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ * @test id=ENABLED
+ * @bug 8303215 8312182
+ * @summary On THP=always systems, we prevent THPs from forming within thread stacks
+ * @library /test/lib
+ * @requires os.family == "linux"
+ * @requires vm.debug
+ * @requires os.arch=="amd64" | os.arch=="x86_64" | os.arch=="aarch64"
+ * @modules java.base/jdk.internal.misc
+ *          java.management
+ * @run driver THPsInThreadStackPreventionTest PATCH-ENABLED
+ */
+
+/*
+ * @test id=DISABLED
+ * @bug 8303215 8312182
+ * @summary On THP=always systems, we prevent THPs from forming within thread stacks (negative test)
+ * @library /test/lib
+ * @requires os.family == "linux"
+ * @requires vm.debug
+ * @requires os.arch=="amd64" | os.arch=="x86_64" | os.arch=="aarch64"
+ * @modules java.base/jdk.internal.misc
+ *          java.management
+ * @run main/manual THPsInThreadStackPreventionTest PATCH-DISABLED
+ */
+import jdk.test.lib.process.OutputAnalyzer;
+import jdk.test.lib.process.ProcessTools;
+import jtreg.SkippedException;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Objects;
+import java.util.concurrent.BrokenBarrierException;
+import java.util.concurrent.CyclicBarrier;
+
+public class THPsInThreadStackPreventionTest {
+
+    // We test the mitigation for "huge RSS with THP=always" introduced with JDK-8312182 and JDK-8303215:
+    //
+    // We start a program that spawns a ton of threads with a stack size close to the THP page size. The threads
+    // are idle and should not build up a lot of stack. The threads are started with an artificial delay
+    // between thread start and stack guard page creation, which exacerbates the RSS bloat (for an explanation
+    // please see 8312182).
+    //
+    // We then observe the RSS of that program. We expect it to stay below a reasonable maximum. The unpatched
+    // version should show an RSS of ~2 GB (paying for the fully paged-in thread stacks). The fixed variant should
+    // cost only ~200-400 MB.
+
+    static final int numThreads = 1000;
+    static final long threadStackSizeMB = 2; // must be 2M
+    static final long heapSizeMB = 64;
+    static final long basicRSSOverheadMB = heapSizeMB + 150;
+    // A successful completion of this test should show no more than about 128 KB of RSS per thread stack (limit below is in bytes).
+    static final long acceptableRSSPerThreadStack = 128 * 1024;
+    static final long acceptableRSSForAllThreadStacks = numThreads * acceptableRSSPerThreadStack;
+    static final long acceptableRSSLimitMB = (acceptableRSSForAllThreadStacks / (1024 * 1024)) + basicRSSOverheadMB;
+
+    private static class TestMain {
+
+        static class Sleeper extends Thread {
+            CyclicBarrier barrier;
+            public Sleeper(CyclicBarrier barrier) {
+                this.barrier = barrier;
+            }
+            @Override
+            public void run() {
+                try {
+                    barrier.await(); // wait for all siblings
+                    barrier.await(); // wait for the main thread to print status
+                } catch (InterruptedException | BrokenBarrierException e) {
+                    e.printStackTrace();
+                }
+            }
+        }
+
+        public static void main(String[] args) throws BrokenBarrierException, InterruptedException {
+
+            // Fire up 1000 threads with a 2M stack size each.
+            Sleeper[] threads = new Sleeper[numThreads];
+            CyclicBarrier barrier = new CyclicBarrier(numThreads + 1);
+
+            for (int i = 0; i < numThreads; i++) {
+                threads[i] = new Sleeper(barrier);
+                threads[i].start();
+            }
+
+            // Wait for all threads to come up
+            barrier.await();
+
+            // print status
+            String file = "/proc/self/status";
+            try (FileReader fr = new FileReader(file);
+                 BufferedReader reader = new BufferedReader(fr)) {
+                String line;
+                while ((line = reader.readLine()) != null) {
+                    System.out.println(line);
+                }
+            } catch (IOException | NumberFormatException e) { /* ignored */ }
+
+            // Signal threads to stop
+            barrier.await();
+
+        }
+    }
+
+    static class ProcSelfStatus {
+
+        public long rssMB;
+        public long swapMB;
+        public int numLifeThreads;
+
+        // Parse output from /proc/self/status
+        public static ProcSelfStatus parse(OutputAnalyzer o) {
+            ProcSelfStatus status = new ProcSelfStatus();
+            String s = o.firstMatch("Threads:\\s*(\\d+)", 1);
+            Objects.requireNonNull(s);
+            status.numLifeThreads = Integer.parseInt(s);
+            s = o.firstMatch("VmRSS:\\s*(\\d+) kB", 1);
+            Objects.requireNonNull(s);
+            status.rssMB = Long.parseLong(s) / 1024;
+            s = o.firstMatch("VmSwap:\\s*(\\d+) kB", 1);
+            Objects.requireNonNull(s);
+            status.swapMB = Long.parseLong(s) / 1024;
+            return status;
+        }
+    }
+
+    public static void main(String[] args) throws Exception {
+
+        HugePageConfiguration config = HugePageConfiguration.readFromOS();
+        // This issue is bound to THP=always
+        if (config.getThpMode() != HugePageConfiguration.THPMode.always) {
+            throw new SkippedException("Test only makes sense in THP \"always\" mode");
+        }
+
+        String[] defaultArgs = {
+            "-Xlog:pagesize",
+            "-Xmx" + heapSizeMB + "m", "-Xms" + heapSizeMB + "m", "-XX:+AlwaysPreTouch", // stabilize RSS
+            "-Xss" + threadStackSizeMB + "m",
+            "-XX:-CreateCoredumpOnCrash",
+            // This will delay the child threads before they create guard pages, thereby greatly increasing the
+            // chance of large VMA formation + hugepage coalescing; see JDK-8312182
+            "-XX:+DelayThreadStartALot"
+        };
+        ArrayList<String> finalargs = new ArrayList<>(Arrays.asList(defaultArgs));
+
+        switch (args[0]) {
+            case "PATCH-ENABLED": {
+                finalargs.add(TestMain.class.getName());
+                ProcessBuilder pb = ProcessTools.createJavaProcessBuilder(finalargs);
+
+                OutputAnalyzer output = new OutputAnalyzer(pb.start());
+                output.shouldHaveExitValue(0);
+
+                // This line indicates the mitigation is active:
+                output.shouldContain("[pagesize] JVM will attempt to prevent THPs in thread stacks.");
+
+                ProcSelfStatus status = ProcSelfStatus.parse(output);
+                if (status.numLifeThreads < numThreads) {
+                    throw new RuntimeException("Number of live threads lower than expected: " + status.numLifeThreads + ", expected " + numThreads);
+                } else {
+                    System.out.println("Found " + status.numLifeThreads + " threads to be alive. Ok.");
+                }
+
+                long rssPlusSwapMB = status.swapMB + status.rssMB;
+
+                if (rssPlusSwapMB > acceptableRSSLimitMB) {
+                    throw new RuntimeException("RSS+Swap larger than expected: " + rssPlusSwapMB + "m, expected at most " + acceptableRSSLimitMB + "m");
+                } else {
+                    if (rssPlusSwapMB < heapSizeMB) { // we pretouch the java heap, so we expect to see at least that much:
+                        throw new RuntimeException("RSS+Swap suspiciously low: " + rssPlusSwapMB + "m, expected at least " + heapSizeMB + "m");
+                    }
+                    System.out.println("Okay: RSS+Swap=" + rssPlusSwapMB + ", within acceptable limit of " + acceptableRSSLimitMB);
+                }
+            }
+            break;
+
+            case "PATCH-DISABLED": {
+
+                // Only execute manually! This will allocate ~2 GB of memory!
+
+                // Explicitly disable the THP-in-thread-stack mitigation:
+                finalargs.add("-XX:+UnlockDiagnosticVMOptions");
+                finalargs.add("-XX:+DisableTHPStackMitigation");
+
+                finalargs.add(TestMain.class.getName());
+                ProcessBuilder pb = ProcessTools.createJavaProcessBuilder(finalargs);
+                OutputAnalyzer output = new OutputAnalyzer(pb.start());
+
+                output.shouldHaveExitValue(0);
+
+                // We deliberately switched off the mitigation, so the VM should tell us:
+                output.shouldContain("[pagesize] JVM will *not* prevent THPs in thread stacks. This may cause high RSS.");
+
+                // Parse output from /proc/self/status
+                ProcSelfStatus status = ProcSelfStatus.parse(output);
+                if (status.numLifeThreads < numThreads) {
+                    throw new RuntimeException("Number of live threads lower than expected (" + status.numLifeThreads + ", expected " + numThreads + ")");
+                } else {
+                    System.out.println("Found " + status.numLifeThreads + " threads to be alive. Ok.");
+                }
+
+                long rssPlusSwapMB = status.swapMB + status.rssMB;
+
+                if (rssPlusSwapMB < acceptableRSSLimitMB) {
+                    throw new RuntimeException("RSS+Swap lower than expected: " + rssPlusSwapMB + "m, expected more than " + acceptableRSSLimitMB + "m");
+                }
+                break;
+            }
+
+            default: throw new RuntimeException("Bad argument: " + args[0]);
+        }
+    }
+}
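
Reviewer note (not part of the patch): the test gates on the system THP mode, which the jtreg helper HugePageConfiguration reads from the kernel. To check a machine by hand, the kernel exposes the active mode in /sys/kernel/mm/transparent_hugepage/enabled, marking it with brackets (e.g. "always [madvise] never"). The following is a minimal standalone sketch of that check; the class name ThpModeCheck is hypothetical and is not part of this change.

    import java.io.IOException;
    import java.nio.file.Files;
    import java.nio.file.Path;

    // Hypothetical helper, not part of the patch: prints the active THP mode of the running kernel.
    public class ThpModeCheck {
        public static void main(String[] args) throws IOException {
            Path p = Path.of("/sys/kernel/mm/transparent_hugepage/enabled");
            if (!Files.exists(p)) {
                System.out.println("THP not supported by this kernel");
                return;
            }
            // File content looks like "always [madvise] never"; the bracketed token is the active mode.
            String line = Files.readString(p).trim();
            for (String token : line.split("\\s+")) {
                if (token.startsWith("[") && token.endsWith("]")) {
                    System.out.println("THP mode: " + token.substring(1, token.length() - 1));
                    return;
                }
            }
            System.out.println("Could not parse THP mode from: " + line);
        }
    }

The mitigation (and the ENABLED test configuration) is only relevant when this reports "always"; in "madvise" or "never" mode the JVM sets DisableTHPStackMitigation ergonomically and the test skips itself.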