
Commit 6468dae

Hi all,

please review these changes to implement the `UseGCOverheadLimit` functionality for G1 (and make the Parallel GC implementation produce similar output).

The `UseGCOverheadLimit` feature prematurely returns `null` from a GC if the GC CPU usage limit and the heap usage limit have been exceeded for some time. This avoids the VM limping along in an endless cycle of garbage collections until a "real" OOME is thrown.

What is important here is how this works (derived from the Parallel GC implementation):

* Check the overheads at the end of the (initial) garbage collection (before upgrading) to see whether we have been over the limits for a number of successive GCs.
* Keep doing GCs without actually allocating memory for the allocation request, to keep on measuring GC CPU usage. This is important for measuring the correct CPU usage in case the application is able to free memory on the OOME.

Testing: tier1-5 without any OOMEs due to this feature, test case

Thanks,
  Thomas
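
As background: the point of the premature null is that the application gets an OutOfMemoryError while the VM is still functional, so it can exit gracefully or shed memory and continue. A minimal, purely illustrative Java sketch of the shedding case (the cache and all names here are hypothetical, not part of this patch):

import java.util.ArrayList;
import java.util.List;

// Illustration only: an application that reacts to OutOfMemoryError by dropping
// cached data and retrying, so GC CPU usage and heap occupancy can fall back
// below the limits and the VM's overhead counter resets.
class EaseOffOnOome {
    static final List<byte[]> cache = new ArrayList<>();

    static byte[] allocate(int size) {
        try {
            return new byte[size];
        } catch (OutOfMemoryError e) {
            cache.clear();          // free memory asap instead of exiting
            return new byte[size];  // retry; succeeds if enough was released
        }
    }

    public static void main(String[] args) {
        for (int i = 0; i < 100_000; i++) {
            cache.add(allocate(64 * 1024));  // grow until the heap is tight
        }
    }
}
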
1 parent 23057ab commit 6468dae

7 files changed: +198 -33 lines changed


src/hotspot/share/gc/g1/g1CollectedHeap.cpp

Lines changed: 80 additions & 25 deletions

@@ -118,6 +118,7 @@
 #include "utilities/globalDefinitions.hpp"
 #include "utilities/stack.inline.hpp"
 
+uintx G1CollectedHeap::_gc_overhead_counter = 0;
 size_t G1CollectedHeap::_humongous_object_threshold_in_words = 0;
 
 // INVARIANTS/NOTES
@@ -467,8 +468,20 @@ HeapWord* G1CollectedHeap::attempt_allocation_slow(uint node_index, size_t word_
       log_trace(gc, alloc)("%s: Unsuccessfully scheduled collection allocating %zu words",
                            Thread::current()->name(), word_size);
 
+      if (is_shutting_down()) {
+        stall_for_vm_shutdown();
+        return nullptr;
+      }
+
+      // Was the gc-overhead reached inside the safepoint? If so, this mutator
+      // should return null even when unsuccessfully scheduling a collection,
+      // for global consistency.
+      if (gc_overhead_limit_exceeded()) {
+        return nullptr;
+      }
+
       // We can reach here if we were unsuccessful in scheduling a collection (because
-      // another thread beat us to it). In this case immeditealy retry the allocation
+      // another thread beat us to it). In this case immediately retry the allocation
       // attempt because another thread successfully performed a collection and possibly
       // reclaimed enough space. The first attempt (without holding the Heap_lock) is
       // here and the follow-on attempt will be at the start of the next loop
@@ -485,11 +498,6 @@ HeapWord* G1CollectedHeap::attempt_allocation_slow(uint node_index, size_t word_
       log_warning(gc, alloc)("%s: Retried allocation %u times for %zu words",
                              Thread::current()->name(), try_count, word_size);
     }
-
-    if (is_shutting_down()) {
-      stall_for_vm_shutdown();
-      return nullptr;
-    }
   }
 
   ShouldNotReachHere();
@@ -714,6 +722,17 @@ HeapWord* G1CollectedHeap::attempt_allocation_humongous(size_t word_size) {
       log_trace(gc, alloc)("%s: Unsuccessfully scheduled collection allocating %zu",
                            Thread::current()->name(), word_size);
 
+      if (is_shutting_down()) {
+        stall_for_vm_shutdown();
+        return nullptr;
+      }
+
+      // Was the gc-overhead reached inside the safepoint? If so, this mutator
+      // should return null as well for global consistency.
+      if (gc_overhead_limit_exceeded()) {
+        return nullptr;
+      }
+
       // We can reach here if we were unsuccessful in scheduling a collection (because
       // another thread beat us to it).
       // Humongous object allocation always needs a lock, so we wait for the retry
@@ -725,11 +744,6 @@ HeapWord* G1CollectedHeap::attempt_allocation_humongous(size_t word_size) {
       log_warning(gc, alloc)("%s: Retried allocation %u times for %zu words",
                              Thread::current()->name(), try_count, word_size);
     }
-
-    if (is_shutting_down()) {
-      stall_for_vm_shutdown();
-      return nullptr;
-    }
   }
 
   ShouldNotReachHere();
@@ -955,25 +969,58 @@ void G1CollectedHeap::resize_heap_after_young_collection(size_t allocation_word_
   phase_times()->record_resize_heap_time((Ticks::now() - start).seconds() * 1000.0);
 }
 
+void G1CollectedHeap::update_gc_overhead_limit_exceeded() {
+  assert(SafepointSynchronize::is_at_safepoint(), "precondition");
+
+  if (UseGCOverheadLimit) {
+    bool little_mutator_time = (_policy->analytics()->long_term_gc_time_ratio() * 100) >= GCTimeLimit;
+    double free_space_percent = percent_of(num_available_regions() * G1HeapRegion::GrainBytes, max_capacity());
+    bool little_free_space = free_space_percent < GCHeapFreeLimit;
+
+    log_debug(gc)("GC Overhead Limit: GC Time %f Free Space %f Counter %zu",
+                  (_policy->analytics()->long_term_gc_time_ratio() * 100),
+                  free_space_percent,
+                  _gc_overhead_counter);
+
+    if (little_mutator_time && little_free_space) {
+      _gc_overhead_counter++;
+      return;
+    } else {
+      _gc_overhead_counter = 0;
+    }
+  }
+}
+
+bool G1CollectedHeap::gc_overhead_limit_exceeded() {
+  return _gc_overhead_counter >= GCOverheadLimitThreshold;
+}
+
 HeapWord* G1CollectedHeap::satisfy_failed_allocation_helper(size_t word_size,
                                                             bool do_gc,
                                                             bool maximal_compaction,
                                                             bool expect_null_mutator_alloc_region) {
-  // Let's attempt the allocation first.
-  HeapWord* result =
-    attempt_allocation_at_safepoint(word_size,
-                                    expect_null_mutator_alloc_region);
-  if (result != nullptr) {
-    return result;
-  }
+  // Skip allocation if GC overhead has been exceeded to let the mutator run into
+  // an OOME. It can either exit "gracefully" or try to free up memory asap.
+  // For the latter situation, keep running GCs. If the mutator frees up enough
+  // memory quickly enough, the overhead(s) will go below the threshold(s) again
+  // and the VM may continue running.
+  if (!gc_overhead_limit_exceeded()) {
+    // Let's attempt the allocation first.
+    HeapWord* result =
+      attempt_allocation_at_safepoint(word_size,
+                                      expect_null_mutator_alloc_region);
+    if (result != nullptr) {
+      return result;
+    }
 
-  // In a G1 heap, we're supposed to keep allocation from failing by
-  // incremental pauses. Therefore, at least for now, we'll favor
-  // expansion over collection. (This might change in the future if we can
-  // do something smarter than full collection to satisfy a failed alloc.)
-  result = expand_and_allocate(word_size);
-  if (result != nullptr) {
-    return result;
+    // In a G1 heap, we're supposed to keep allocation from failing by
+    // incremental pauses. Therefore, at least for now, we'll favor
+    // expansion over collection. (This might change in the future if we can
+    // do something smarter than full collection to satisfy a failed alloc.)
+    result = expand_and_allocate(word_size);
+    if (result != nullptr) {
+      return result;
+    }
   }
 
   if (do_gc) {
@@ -997,6 +1044,10 @@ HeapWord* G1CollectedHeap::satisfy_failed_allocation_helper(size_t word_size,
 HeapWord* G1CollectedHeap::satisfy_failed_allocation(size_t word_size) {
   assert_at_safepoint_on_vm_thread();
 
+  // Update GC overhead limits after the initial garbage collection leading to this
+  // allocation attempt.
+  update_gc_overhead_limit_exceeded();
+
   // Attempts to allocate followed by Full GC.
   HeapWord* result =
     satisfy_failed_allocation_helper(word_size,
@@ -1028,6 +1079,10 @@ HeapWord* G1CollectedHeap::satisfy_failed_allocation(size_t word_size) {
     return result;
   }
 
+  if (gc_overhead_limit_exceeded()) {
+    log_info(gc)("GC Overhead Limit exceeded too often (%zu).", GCOverheadLimitThreshold);
+  }
+
   // What else? We might try synchronous finalization later. If the total
   // space available is large enough for the allocation, then a more
   // complete compaction phase than we've tried so far might be
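
As an aside on the two new functions above: the bookkeeping is a simple streak counter, so only a run of successive GCs that are over both limits trips the feature, and any GC below either limit resets it. A standalone Java sketch of that logic (not HotSpot code; the constants mirror the GCTimeLimit, GCHeapFreeLimit and GCOverheadLimitThreshold flags, with values I assume to be the product defaults):

// Plain-Java model of the G1 over-limit bookkeeping above; not HotSpot code.
class GcOverheadLimitModel {
    static final double GC_TIME_LIMIT = 98;      // assumed default of -XX:GCTimeLimit (percent)
    static final double GC_HEAP_FREE_LIMIT = 2;  // assumed default of -XX:GCHeapFreeLimit (percent)
    static final long THRESHOLD = 5;             // assumed default of GCOverheadLimitThreshold

    static long gcOverheadCounter = 0;

    // Analogous to update_gc_overhead_limit_exceeded(): called once per initial GC.
    static void update(double gcTimePercent, double freeSpacePercent) {
        boolean littleMutatorTime = gcTimePercent >= GC_TIME_LIMIT;
        boolean littleFreeSpace = freeSpacePercent < GC_HEAP_FREE_LIMIT;
        if (littleMutatorTime && littleFreeSpace) {
            gcOverheadCounter++;    // over both limits: extend the streak
        } else {
            gcOverheadCounter = 0;  // any relief resets the streak
        }
    }

    // Analogous to gc_overhead_limit_exceeded().
    static boolean exceeded() {
        return gcOverheadCounter >= THRESHOLD;
    }

    public static void main(String[] args) {
        for (int i = 0; i < 6; i++) {
            update(99.0, 1.0);           // six consecutive over-limit GCs
        }
        System.out.println(exceeded());  // true: streak reached the threshold
    }
}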

src/hotspot/share/gc/g1/g1CollectedHeap.hpp

Lines changed: 11 additions & 0 deletions

@@ -169,6 +169,17 @@ class G1CollectedHeap : public CollectedHeap {
   friend class G1CheckRegionAttrTableClosure;
 
 private:
+  // GC Overhead Limit functionality related members.
+  //
+  // The goal is to return null for allocations prematurely (before really going
+  // OOME) in case GC CPU usage is high (>= GCTimeLimit) and there is not much
+  // available free memory (<= GCHeapFreeLimit), so that applications can exit
+  // gracefully or try to keep running by easing off memory.
+  static uintx _gc_overhead_counter; // The number of successive times we were over the limits.
+
+  void update_gc_overhead_limit_exceeded();
+  static bool gc_overhead_limit_exceeded();
+
   G1ServiceThread* _service_thread;
   G1ServiceTask* _periodic_gc_task;
   G1MonotonicArenaFreeMemoryTask* _free_arena_memory_task;

src/hotspot/share/gc/parallel/parallelArguments.cpp

Lines changed: 0 additions & 5 deletions

@@ -66,11 +66,6 @@ void ParallelArguments::initialize() {
     }
   }
 
-  // True in product build, since tests using debug build often stress GC
-  if (FLAG_IS_DEFAULT(UseGCOverheadLimit)) {
-    FLAG_SET_DEFAULT(UseGCOverheadLimit, trueInProduct);
-  }
-
   if (InitialSurvivorRatio < MinSurvivorRatio) {
     if (FLAG_IS_CMDLINE(InitialSurvivorRatio)) {
       if (FLAG_IS_CMDLINE(MinSurvivorRatio)) {

src/hotspot/share/gc/parallel/parallelScavengeHeap.cpp

Lines changed: 8 additions & 1 deletion

@@ -367,6 +367,13 @@ bool ParallelScavengeHeap::check_gc_overhead_limit() {
   bool little_mutator_time = _size_policy->mutator_time_percent() * 100 < (100 - GCTimeLimit);
   bool little_free_space = check_gc_heap_free_limit(_young_gen->free_in_bytes(), _young_gen->capacity_in_bytes())
                            && check_gc_heap_free_limit(_old_gen->free_in_bytes(), _old_gen->capacity_in_bytes());
+
+  log_debug(gc)("GC Overhead Limit: GC Time %f Free Space Young %f Old %f Counter %zu",
+                (100 - _size_policy->mutator_time_percent()),
+                percent_of(_young_gen->free_in_bytes(), _young_gen->capacity_in_bytes()),
+                percent_of(_old_gen->free_in_bytes(), _old_gen->capacity_in_bytes()),
+                _gc_overhead_counter);
+
   if (little_mutator_time && little_free_space) {
     _gc_overhead_counter++;
     if (_gc_overhead_counter >= GCOverheadLimitThreshold) {

@@ -419,7 +426,7 @@ HeapWord* ParallelScavengeHeap::satisfy_failed_allocation(size_t size, bool is_t
   }
 
   if (check_gc_overhead_limit()) {
-    log_info(gc)("GCOverheadLimitThreshold %zu reached.", GCOverheadLimitThreshold);
+    log_info(gc)("GC Overhead Limit exceeded too often (%zu).", GCOverheadLimitThreshold);
     return nullptr;
   }
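
For comparison with the G1 code earlier in this commit: Parallel derives "little mutator time" from mutator_time_percent() and checks free space per generation, while G1 uses the long-term GC time ratio and free space over the whole heap. A plain-Java sketch of the two predicates (not HotSpot code; names are mine, percentages in [0, 100], constants assumed to be the product defaults):

// Side-by-side model of the two over-limit predicates.
class OverLimitPredicates {
    static final double GC_TIME_LIMIT = 98;      // assumed -XX:GCTimeLimit default
    static final double GC_HEAP_FREE_LIMIT = 2;  // assumed -XX:GCHeapFreeLimit default

    // Parallel GC: the mutator got almost no CPU time, and *both* generations
    // are nearly full.
    static boolean parallelOverLimit(double mutatorTimePct, double youngFreePct, double oldFreePct) {
        return mutatorTimePct < (100 - GC_TIME_LIMIT)
            && youngFreePct < GC_HEAP_FREE_LIMIT
            && oldFreePct < GC_HEAP_FREE_LIMIT;
    }

    // G1: the long-term GC time ratio is at or above the limit, and free space
    // over the whole heap is below the limit.
    static boolean g1OverLimit(double longTermGcTimePct, double heapFreePct) {
        return longTermGcTimePct >= GC_TIME_LIMIT
            && heapFreePct < GC_HEAP_FREE_LIMIT;
    }
}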

src/hotspot/share/gc/parallel/parallelScavengeHeap.hpp

Lines changed: 1 addition & 1 deletion

@@ -85,7 +85,7 @@ class ParallelScavengeHeap : public CollectedHeap {
 
   WorkerThreads _workers;
 
-  uint _gc_overhead_counter;
+  uintx _gc_overhead_counter;
 
   bool _is_heap_almost_full;

src/hotspot/share/gc/shared/gc_globals.hpp

Lines changed: 1 addition & 1 deletion

@@ -357,7 +357,7 @@
           "Initial ratio of young generation/survivor space size")         \
           range(3, max_uintx)                                              \
                                                                            \
-  product(bool, UseGCOverheadLimit, true,                                  \
+  product(bool, UseGCOverheadLimit, falseInDebug,                          \
           "Use policy to limit of proportion of time spent in GC "        \
           "before an OutOfMemory error is thrown")                        \
                                                                            \
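
Since the default is now falseInDebug, whether the limit machinery is active depends on the build flavor. As an aside (this uses existing JDK API, and is not part of this change), the effective value can be confirmed at runtime via HotSpotDiagnosticMXBean:

import java.lang.management.ManagementFactory;

import com.sun.management.HotSpotDiagnosticMXBean;

// Prints the effective value and origin of the flag, e.g. to confirm that a
// debug build now defaults to false while a product build defaults to true.
class PrintUseGCOverheadLimit {
    public static void main(String[] args) {
        HotSpotDiagnosticMXBean diag =
            ManagementFactory.getPlatformMXBean(HotSpotDiagnosticMXBean.class);
        System.out.println(diag.getVMOption("UseGCOverheadLimit"));
    }
}
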
test/hotspot/jtreg/gc/TestUseGCOverheadLimit.java (new file; exact path inferred from the test's package and jtreg conventions)

Lines changed: 97 additions & 0 deletions

@@ -0,0 +1,97 @@
/*
 * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package gc;

/*
 * @test id=Parallel
 * @requires vm.gc.Parallel
 * @requires !vm.debug
 * @summary Verifies that the UseGCOverheadLimit functionality works in Parallel GC.
 * @library /test/lib
 * @run driver gc.TestUseGCOverheadLimit Parallel
 */

/*
 * @test id=G1
 * @requires vm.gc.G1
 * @requires !vm.debug
 * @summary Verifies that the UseGCOverheadLimit functionality works in G1 GC.
 * @library /test/lib
 * @run driver gc.TestUseGCOverheadLimit G1
 */

import java.util.Arrays;
import java.util.stream.Stream;

import jdk.test.lib.Asserts;
import jdk.test.lib.process.OutputAnalyzer;
import jdk.test.lib.process.ProcessTools;

public class TestUseGCOverheadLimit {
    public static void main(String args[]) throws Exception {
        String[] parallelArgs = {
            "-XX:+UseParallelGC",
            "-XX:NewSize=20m",
            "-XX:SurvivorRatio=5",
            "-XX:GCHeapFreeLimit=30" // Parallel GC needs to eat up all of young gen.
        };
        String[] g1Args = {
            "-XX:+UseG1GC",
            "-XX:GCHeapFreeLimit=5"
        };

        String[] selectedArgs = args[0].equals("G1") ? g1Args : parallelArgs;

        final String[] commonArgs = {
            "-XX:-UseCompactObjectHeaders", // Object sizes are calculated such that the heap is tight.
            "-XX:ParallelGCThreads=1", // Make GCs take longer.
            "-XX:+UseGCOverheadLimit",
            "-Xlog:gc=debug",
            "-XX:GCTimeLimit=90", // Ease the CPU requirement a little.
            "-Xmx128m",
            Allocating.class.getName()
        };

        String[] vmArgs = Stream.concat(Arrays.stream(selectedArgs), Arrays.stream(commonArgs)).toArray(String[]::new);
        OutputAnalyzer output = ProcessTools.executeLimitedTestJava(vmArgs);
        output.shouldNotHaveExitValue(0);

        System.out.println(output.getStdout());

        Asserts.assertTrue(output.getStdout().indexOf("GC Overhead Limit exceeded too often (5).") != -1,
                           "Could not find indication that we failed because of GC overhead limit.");
    }

    static class Allocating {
        public static void main(String[] args) {
            Object[] cache = new Object[1024 * 1024 * 2];

            // Allocate random objects, keeping around most of the data.
            for (int i = 0; i < 1024 * 1024 * 30; i++) {
                Object[] obj = new Object[10];
                cache[i % cache.length] = obj;
            }
        }
    }
}
