
Commit 6468dae

Hi all,

please review these changes to implement the `UseGCOverheadLimit` functionality for G1 (and make the Parallel GC implementation produce similar output).

The `UseGCOverheadLimit` feature prematurely returns `null` from a GC if the GC CPU usage limit and the heap usage limit have been exceeded for some time. This avoids the VM limping along in an endless cycle of garbage collections until a "real" OOME is thrown.

What is important here is how this works (derived from the Parallel GC implementation):

* Check the overheads at the end of the (initial) garbage collection (before upgrading) to see whether we have been over the limits for a number of successive GCs.
* Keep doing GCs without actually allocating memory for the allocation request, to keep on measuring GC CPU usage. This is important for measuring the correct CPU usage in case the application is able to free memory on the OOME.

Testing: tier1-5 without any OOMEs due to this feature, test case

Thanks,
  Thomas
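
As background: the point of the premature null is that the application gets an OutOfMemoryError while the VM is still functional, so it can exit gracefully or shed memory and continue. A minimal, purely illustrative Java sketch of the shedding case (the cache and all names here are hypothetical, not part of this patch):

import java.util.ArrayList;
import java.util.List;

// Illustration only: an application that reacts to OutOfMemoryError by dropping
// cached data and retrying, so GC CPU usage and heap occupancy can fall back
// below the limits and the VM's overhead counter resets.
class EaseOffOnOome {
    static final List<byte[]> cache = new ArrayList<>();

    static byte[] allocate(int size) {
        try {
            return new byte[size];
        } catch (OutOfMemoryError e) {
            cache.clear();          // free memory asap instead of exiting
            return new byte[size];  // retry; succeeds if enough was released
        }
    }

    public static void main(String[] args) {
        for (int i = 0; i < 100_000; i++) {
            cache.add(allocate(64 * 1024));  // grow until the heap is tight
        }
    }
}
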
1 parent 23057ab commit 6468dae

7 files changed: +198 -33 lines changed


src/hotspot/share/gc/g1/g1CollectedHeap.cpp

Lines changed: 80 additions & 25 deletions

@@ -118,6 +118,7 @@
 #include "utilities/globalDefinitions.hpp"
 #include "utilities/stack.inline.hpp"
 
+uintx G1CollectedHeap::_gc_overhead_counter = 0;
 size_t G1CollectedHeap::_humongous_object_threshold_in_words = 0;
 
 // INVARIANTS/NOTES
@@ -467,8 +468,20 @@ HeapWord* G1CollectedHeap::attempt_allocation_slow(uint node_index, size_t word_
       log_trace(gc, alloc)("%s: Unsuccessfully scheduled collection allocating %zu words",
                            Thread::current()->name(), word_size);
 
+      if (is_shutting_down()) {
+        stall_for_vm_shutdown();
+        return nullptr;
+      }
+
+      // Was the gc-overhead reached inside the safepoint? If so, this mutator
+      // should return null even when unsuccessfully scheduling a collection,
+      // for global consistency.
+      if (gc_overhead_limit_exceeded()) {
+        return nullptr;
+      }
+
       // We can reach here if we were unsuccessful in scheduling a collection (because
-      // another thread beat us to it). In this case immeditealy retry the allocation
+      // another thread beat us to it). In this case immediately retry the allocation
       // attempt because another thread successfully performed a collection and possibly
       // reclaimed enough space. The first attempt (without holding the Heap_lock) is
       // here and the follow-on attempt will be at the start of the next loop
@@ -485,11 +498,6 @@ HeapWord* G1CollectedHeap::attempt_allocation_slow(uint node_index, size_t word_
       log_warning(gc, alloc)("%s: Retried allocation %u times for %zu words",
                              Thread::current()->name(), try_count, word_size);
     }
-
-    if (is_shutting_down()) {
-      stall_for_vm_shutdown();
-      return nullptr;
-    }
   }
 
   ShouldNotReachHere();
@@ -714,6 +722,17 @@ HeapWord* G1CollectedHeap::attempt_allocation_humongous(size_t word_size) {
       log_trace(gc, alloc)("%s: Unsuccessfully scheduled collection allocating %zu",
                            Thread::current()->name(), word_size);
 
+      if (is_shutting_down()) {
+        stall_for_vm_shutdown();
+        return nullptr;
+      }
+
+      // Was the gc-overhead reached inside the safepoint? If so, this mutator
+      // should return null as well for global consistency.
+      if (gc_overhead_limit_exceeded()) {
+        return nullptr;
+      }
+
       // We can reach here if we were unsuccessful in scheduling a collection (because
       // another thread beat us to it).
       // Humongous object allocation always needs a lock, so we wait for the retry
@@ -725,11 +744,6 @@ HeapWord* G1CollectedHeap::attempt_allocation_humongous(size_t word_size) {
       log_warning(gc, alloc)("%s: Retried allocation %u times for %zu words",
                              Thread::current()->name(), try_count, word_size);
     }
-
-    if (is_shutting_down()) {
-      stall_for_vm_shutdown();
-      return nullptr;
-    }
   }
 
   ShouldNotReachHere();
@@ -955,25 +969,58 @@ void G1CollectedHeap::resize_heap_after_young_collection(size_t allocation_word_
   phase_times()->record_resize_heap_time((Ticks::now() - start).seconds() * 1000.0);
 }
 
+void G1CollectedHeap::update_gc_overhead_limit_exceeded() {
+  assert(SafepointSynchronize::is_at_safepoint(), "precondition");
+
+  if (UseGCOverheadLimit) {
+    bool little_mutator_time = (_policy->analytics()->long_term_gc_time_ratio() * 100) >= GCTimeLimit;
+    double free_space_percent = percent_of(num_available_regions() * G1HeapRegion::GrainBytes, max_capacity());
+    bool little_free_space = free_space_percent < GCHeapFreeLimit;
+
+    log_debug(gc)("GC Overhead Limit: GC Time %f Free Space %f Counter %zu",
+                  (_policy->analytics()->long_term_gc_time_ratio() * 100),
+                  free_space_percent,
+                  _gc_overhead_counter);
+
+    if (little_mutator_time && little_free_space) {
+      _gc_overhead_counter++;
+      return;
+    } else {
+      _gc_overhead_counter = 0;
+    }
+  }
+}
+
+bool G1CollectedHeap::gc_overhead_limit_exceeded() {
+  return _gc_overhead_counter >= GCOverheadLimitThreshold;
+}
+
 HeapWord* G1CollectedHeap::satisfy_failed_allocation_helper(size_t word_size,
                                                             bool do_gc,
                                                             bool maximal_compaction,
                                                             bool expect_null_mutator_alloc_region) {
-  // Let's attempt the allocation first.
-  HeapWord* result =
-    attempt_allocation_at_safepoint(word_size,
-                                    expect_null_mutator_alloc_region);
-  if (result != nullptr) {
-    return result;
-  }
+  // Skip allocation if GC overhead has been exceeded to let the mutator run into
+  // an OOME. It can either exit "gracefully" or try to free up memory asap.
+  // For the latter situation, keep running GCs. If the mutator frees up enough
+  // memory quickly enough, the overhead(s) will go below the threshold(s) again
+  // and the VM may continue running.
+  if (!gc_overhead_limit_exceeded()) {
+    // Let's attempt the allocation first.
+    HeapWord* result =
+      attempt_allocation_at_safepoint(word_size,
+                                      expect_null_mutator_alloc_region);
+    if (result != nullptr) {
+      return result;
+    }
 
-  // In a G1 heap, we're supposed to keep allocation from failing by
-  // incremental pauses. Therefore, at least for now, we'll favor
-  // expansion over collection. (This might change in the future if we can
-  // do something smarter than full collection to satisfy a failed alloc.)
-  result = expand_and_allocate(word_size);
-  if (result != nullptr) {
-    return result;
+    // In a G1 heap, we're supposed to keep allocation from failing by
+    // incremental pauses. Therefore, at least for now, we'll favor
+    // expansion over collection. (This might change in the future if we can
+    // do something smarter than full collection to satisfy a failed alloc.)
+    result = expand_and_allocate(word_size);
+    if (result != nullptr) {
+      return result;
+    }
   }
 
   if (do_gc) {
@@ -997,6 +1044,10 @@ HeapWord* G1CollectedHeap::satisfy_failed_allocation_helper(size_t word_size,
 HeapWord* G1CollectedHeap::satisfy_failed_allocation(size_t word_size) {
   assert_at_safepoint_on_vm_thread();
 
+  // Update GC overhead limits after the initial garbage collection leading to this
+  // allocation attempt.
+  update_gc_overhead_limit_exceeded();
+
   // Attempts to allocate followed by Full GC.
   HeapWord* result =
     satisfy_failed_allocation_helper(word_size,
@@ -1028,6 +1079,10 @@ HeapWord* G1CollectedHeap::satisfy_failed_allocation(size_t word_size) {
     return result;
   }
 
+  if (gc_overhead_limit_exceeded()) {
+    log_info(gc)("GC Overhead Limit exceeded too often (%zu).", GCOverheadLimitThreshold);
+  }
+
   // What else? We might try synchronous finalization later. If the total
   // space available is large enough for the allocation, then a more
   // complete compaction phase than we've tried so far might be
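
As an aside on the two new functions above: the bookkeeping is a simple streak counter, so only a run of successive GCs that are over both limits trips the feature, and any GC below either limit resets it. A standalone Java sketch of that logic (not HotSpot code; the constants mirror the GCTimeLimit, GCHeapFreeLimit and GCOverheadLimitThreshold flags, with values I assume to be the product defaults):

// Plain-Java model of the G1 over-limit bookkeeping above; not HotSpot code.
class GcOverheadLimitModel {
    static final double GC_TIME_LIMIT = 98;      // assumed default of -XX:GCTimeLimit (percent)
    static final double GC_HEAP_FREE_LIMIT = 2;  // assumed default of -XX:GCHeapFreeLimit (percent)
    static final long THRESHOLD = 5;             // assumed default of GCOverheadLimitThreshold

    static long gcOverheadCounter = 0;

    // Analogous to update_gc_overhead_limit_exceeded(): called once per initial GC.
    static void update(double gcTimePercent, double freeSpacePercent) {
        boolean littleMutatorTime = gcTimePercent >= GC_TIME_LIMIT;
        boolean littleFreeSpace = freeSpacePercent < GC_HEAP_FREE_LIMIT;
        if (littleMutatorTime && littleFreeSpace) {
            gcOverheadCounter++;    // over both limits: extend the streak
        } else {
            gcOverheadCounter = 0;  // any relief resets the streak
        }
    }

    // Analogous to gc_overhead_limit_exceeded().
    static boolean exceeded() {
        return gcOverheadCounter >= THRESHOLD;
    }

    public static void main(String[] args) {
        for (int i = 0; i < 6; i++) {
            update(99.0, 1.0);           // six consecutive over-limit GCs
        }
        System.out.println(exceeded());  // true: streak reached the threshold
    }
}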

src/hotspot/share/gc/g1/g1CollectedHeap.hpp

Lines changed: 11 additions & 0 deletions

@@ -169,6 +169,17 @@ class G1CollectedHeap : public CollectedHeap {
   friend class G1CheckRegionAttrTableClosure;
 
 private:
+  // GC Overhead Limit functionality related members.
+  //
+  // The goal is to return null for allocations prematurely (before really going
+  // OOME) in case GC CPU usage is high (>= GCTimeLimit) and there is not much
+  // available free memory (<= GCHeapFreeLimit), so that applications can exit
+  // gracefully or try to keep running by easing off memory.
+  static uintx _gc_overhead_counter; // The number of successive times we were over the limits.
+
+  void update_gc_overhead_limit_exceeded();
+  static bool gc_overhead_limit_exceeded();
+
   G1ServiceThread* _service_thread;
   G1ServiceTask* _periodic_gc_task;
   G1MonotonicArenaFreeMemoryTask* _free_arena_memory_task;

src/hotspot/share/gc/parallel/parallelArguments.cpp

Lines changed: 0 additions & 5 deletions

@@ -66,11 +66,6 @@ void ParallelArguments::initialize() {
     }
   }
 
-  // True in product build, since tests using debug build often stress GC
-  if (FLAG_IS_DEFAULT(UseGCOverheadLimit)) {
-    FLAG_SET_DEFAULT(UseGCOverheadLimit, trueInProduct);
-  }
-
   if (InitialSurvivorRatio < MinSurvivorRatio) {
     if (FLAG_IS_CMDLINE(InitialSurvivorRatio)) {
       if (FLAG_IS_CMDLINE(MinSurvivorRatio)) {

src/hotspot/share/gc/parallel/parallelScavengeHeap.cpp

Lines changed: 8 additions & 1 deletion

@@ -367,6 +367,13 @@ bool ParallelScavengeHeap::check_gc_overhead_limit() {
   bool little_mutator_time = _size_policy->mutator_time_percent() * 100 < (100 - GCTimeLimit);
   bool little_free_space = check_gc_heap_free_limit(_young_gen->free_in_bytes(), _young_gen->capacity_in_bytes())
                            && check_gc_heap_free_limit(_old_gen->free_in_bytes(), _old_gen->capacity_in_bytes());
+
+  log_debug(gc)("GC Overhead Limit: GC Time %f Free Space Young %f Old %f Counter %zu",
+                (100 - _size_policy->mutator_time_percent()),
+                percent_of(_young_gen->free_in_bytes(), _young_gen->capacity_in_bytes()),
+                percent_of(_old_gen->free_in_bytes(), _old_gen->capacity_in_bytes()),
+                _gc_overhead_counter);
+
   if (little_mutator_time && little_free_space) {
     _gc_overhead_counter++;
     if (_gc_overhead_counter >= GCOverheadLimitThreshold) {

@@ -419,7 +426,7 @@ HeapWord* ParallelScavengeHeap::satisfy_failed_allocation(size_t size, bool is_t
   }
 
   if (check_gc_overhead_limit()) {
-    log_info(gc)("GCOverheadLimitThreshold %zu reached.", GCOverheadLimitThreshold);
+    log_info(gc)("GC Overhead Limit exceeded too often (%zu).", GCOverheadLimitThreshold);
     return nullptr;
   }
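
For comparison with the G1 code earlier in this commit: Parallel derives "little mutator time" from mutator_time_percent() and checks free space per generation, while G1 uses the long-term GC time ratio and free space over the whole heap. A plain-Java sketch of the two predicates (not HotSpot code; names are mine, percentages in [0, 100], constants assumed to be the product defaults):

// Side-by-side model of the two over-limit predicates.
class OverLimitPredicates {
    static final double GC_TIME_LIMIT = 98;      // assumed -XX:GCTimeLimit default
    static final double GC_HEAP_FREE_LIMIT = 2;  // assumed -XX:GCHeapFreeLimit default

    // Parallel GC: the mutator got almost no CPU time, and *both* generations
    // are nearly full.
    static boolean parallelOverLimit(double mutatorTimePct, double youngFreePct, double oldFreePct) {
        return mutatorTimePct < (100 - GC_TIME_LIMIT)
            && youngFreePct < GC_HEAP_FREE_LIMIT
            && oldFreePct < GC_HEAP_FREE_LIMIT;
    }

    // G1: the long-term GC time ratio is at or above the limit, and free space
    // over the whole heap is below the limit.
    static boolean g1OverLimit(double longTermGcTimePct, double heapFreePct) {
        return longTermGcTimePct >= GC_TIME_LIMIT
            && heapFreePct < GC_HEAP_FREE_LIMIT;
    }
}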

src/hotspot/share/gc/parallel/parallelScavengeHeap.hpp

Lines changed: 1 addition & 1 deletion

@@ -85,7 +85,7 @@ class ParallelScavengeHeap : public CollectedHeap {
 
   WorkerThreads _workers;
 
-  uint _gc_overhead_counter;
+  uintx _gc_overhead_counter;
 
   bool _is_heap_almost_full;

src/hotspot/share/gc/shared/gc_globals.hpp

Lines changed: 1 addition & 1 deletion

@@ -357,7 +357,7 @@
           "Initial ratio of young generation/survivor space size")         \
           range(3, max_uintx)                                              \
                                                                            \
-  product(bool, UseGCOverheadLimit, true,                                  \
+  product(bool, UseGCOverheadLimit, falseInDebug,                          \
           "Use policy to limit of proportion of time spent in GC "        \
           "before an OutOfMemory error is thrown")                        \
                                                                            \
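
Since the default is now falseInDebug, whether the limit machinery is active depends on the build flavor. As an aside (this uses existing JDK API, and is not part of this change), the effective value can be confirmed at runtime via HotSpotDiagnosticMXBean:

import java.lang.management.ManagementFactory;

import com.sun.management.HotSpotDiagnosticMXBean;

// Prints the effective value and origin of the flag, e.g. to confirm that a
// debug build now defaults to false while a product build defaults to true.
class PrintUseGCOverheadLimit {
    public static void main(String[] args) {
        HotSpotDiagnosticMXBean diag =
            ManagementFactory.getPlatformMXBean(HotSpotDiagnosticMXBean.class);
        System.out.println(diag.getVMOption("UseGCOverheadLimit"));
    }
}
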
test/hotspot/jtreg/gc/TestUseGCOverheadLimit.java (new file; exact path inferred from the test's package and jtreg conventions)

Lines changed: 97 additions & 0 deletions

@@ -0,0 +1,97 @@
/*
 * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package gc;

/*
 * @test id=Parallel
 * @requires vm.gc.Parallel
 * @requires !vm.debug
 * @summary Verifies that the UseGCOverheadLimit functionality works in Parallel GC.
 * @library /test/lib
 * @run driver gc.TestUseGCOverheadLimit Parallel
 */

/*
 * @test id=G1
 * @requires vm.gc.G1
 * @requires !vm.debug
 * @summary Verifies that the UseGCOverheadLimit functionality works in G1 GC.
 * @library /test/lib
 * @run driver gc.TestUseGCOverheadLimit G1
 */

import java.util.Arrays;
import java.util.stream.Stream;

import jdk.test.lib.Asserts;
import jdk.test.lib.process.OutputAnalyzer;
import jdk.test.lib.process.ProcessTools;

public class TestUseGCOverheadLimit {
    public static void main(String args[]) throws Exception {
        String[] parallelArgs = {
            "-XX:+UseParallelGC",
            "-XX:NewSize=20m",
            "-XX:SurvivorRatio=5",
            "-XX:GCHeapFreeLimit=30" // Parallel GC needs to eat up all of young gen.
        };
        String[] g1Args = {
            "-XX:+UseG1GC",
            "-XX:GCHeapFreeLimit=5"
        };

        String[] selectedArgs = args[0].equals("G1") ? g1Args : parallelArgs;

        final String[] commonArgs = {
            "-XX:-UseCompactObjectHeaders", // Object sizes are calculated such that the heap is tight.
            "-XX:ParallelGCThreads=1", // Make GCs take longer.
            "-XX:+UseGCOverheadLimit",
            "-Xlog:gc=debug",
            "-XX:GCTimeLimit=90", // Ease the CPU requirement a little.
            "-Xmx128m",
            Allocating.class.getName()
        };

        String[] vmArgs = Stream.concat(Arrays.stream(selectedArgs), Arrays.stream(commonArgs)).toArray(String[]::new);
        OutputAnalyzer output = ProcessTools.executeLimitedTestJava(vmArgs);
        output.shouldNotHaveExitValue(0);

        System.out.println(output.getStdout());

        Asserts.assertTrue(output.getStdout().indexOf("GC Overhead Limit exceeded too often (5).") != -1,
                           "Could not find indication that we failed because of GC overhead limit.");
    }

    static class Allocating {
        public static void main(String[] args) {
            Object[] cache = new Object[1024 * 1024 * 2];

            // Allocate random objects, keeping around most of the data.
            for (int i = 0; i < 1024 * 1024 * 30; i++) {
                Object[] obj = new Object[10];
                cache[i % cache.length] = obj;
            }
        }
    }
}
