Skip to content

Commit

Permalink
gc: add some guard rails and refinements
Browse files Browse the repository at this point in the history
Some transients cause the MemBalancer heuristics to compute odd values.
Since there is not a background thread monitoring these rates, smooth
out these transients at each interval and add a sequence of hard limits
that iteratively refines these estimates.

Some principles here:
 - Hard limits should always be applied as a MIN or MAX, not just
 applied directly (which might go too far or in the wrong direction).
 - The max heap should alter the allocation heuristics regime to change
 from the time based balancer to a proportional limit. The
overallocation function is accordingly changed from a simple sqrt,
which tends to start off too fast and end up too slow, to a low-power
polynomial (taken from array.c).
  • Loading branch information
vtjnash committed Nov 16, 2023
1 parent 96e70e6 commit 10aa815
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 40 deletions.
10 changes: 4 additions & 6 deletions src/gc-debug.c
Original file line number Diff line number Diff line change
Expand Up @@ -979,27 +979,25 @@ void gc_time_summary(int sweep_full, uint64_t start, uint64_t end,

void gc_heuristics_summary(
uint64_t old_alloc_diff, uint64_t alloc_mem,
uint64_t old_nongc_time, uint64_t nongc_time,
uint64_t old_mut_time, uint64_t alloc_time,
uint64_t old_freed_diff, uint64_t gc_mem,
uint64_t old_pause_time, uint64_t gc_time,
int thrash_counter,
int thrash_counter, const char *reason,
uint64_t current_heap, uint64_t target_heap)
{
jl_safe_printf("Estimates: alloc_diff=%" PRIu64 "kB (%" PRIu64 ")"
" nongc_time=%" PRIu64 "ns (%" PRIu64 ")"
//" nongc_time=%" PRIu64 "ns (%" PRIu64 ")"
" mut_time=%" PRIu64 "ns (%" PRIu64 ")"
" freed_diff=%" PRIu64 "kB (%" PRIu64 ")"
" pause_time=%" PRIu64 "ns (%" PRIu64 ")"
" thrash_counter=%d"
" thrash_counter=%d%s"
" current_heap=%" PRIu64 " MB"
" target_heap=%" PRIu64 " MB\n",
old_alloc_diff/1024, alloc_mem/1024,
old_nongc_time/1000, nongc_time/1000,
old_mut_time/1000, alloc_time/1000,
old_freed_diff/1024, gc_mem/1024,
old_pause_time/1000, gc_time/1000,
thrash_counter,
thrash_counter, reason,
current_heap/1024/1024, target_heap/1024/1024);
}
#endif
Expand Down
124 changes: 94 additions & 30 deletions src/gc.c
Original file line number Diff line number Diff line change
Expand Up @@ -694,7 +694,6 @@ static memsize_t max_total_memory = (memsize_t) MAX32HEAP;
// start with values that are in the target ranges to reduce transient hiccups at startup
static uint64_t old_pause_time = 1e7; // 10 ms
static uint64_t old_mut_time = 1e9; // 1 second
static uint64_t old_nongc_time = 1e9; // 1 second
static uint64_t old_heap_size = 0;
static uint64_t old_alloc_diff = default_collect_interval;
static uint64_t old_freed_diff = default_collect_interval;
Expand Down Expand Up @@ -3250,6 +3249,29 @@ uint64_t jl_gc_smooth(uint64_t old_val, uint64_t new_val, double factor)
return est;
}

// an overallocation curve inspired by array allocations
// grows very fast initially, then much slower at large heaps
//
// Computes the increment to add on top of `val` given the previous size
// `old_val`, capped so the result never grows by more than 10% of `max_val`
// once it would exceed `max_val`.
static uint64_t overallocation(uint64_t old_val, uint64_t val, uint64_t max_val)
{
    // compute maxsize = maxsize + 8*maxsize^(7/8) + maxsize/4
    // for small n, we grow much faster than O(n)
    // for large n, we grow at O(n/4)
    // and as we reach O(memory) for memory>>1MB,
    // this means we end by adding about 20% of memory each time at most
    if (old_val == 0)
        old_val = 1; // __builtin_clzll(0) is undefined behavior
    // exp2 = index of the highest set bit of old_val, plus one (1..64)
    // old_val is always uint64_t, so use the 64-bit builtin on every target
    // (the 32-bit __builtin_clz would silently truncate the argument)
    int exp2 = 64 - __builtin_clzll(old_val);
    // shift in uint64_t: exp2*7/8 can be up to 56, which would overflow a
    // 32-bit size_t shift
    uint64_t inc = ((uint64_t)1 << (exp2 * 7 / 8)) * 8 + old_val / 4;
    // once overallocation would exceed max_val, grow by no more than 10% of max_val
    if (inc + val > max_val)
        if (inc > max_val / 10)
            return max_val / 10;
    return inc;
}

size_t jl_maxrss(void);

// Only one thread should be running in this function
Expand All @@ -3264,7 +3286,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
jl_gc_markqueue_t *mq = &ptls->mark_queue;

uint64_t gc_start_time = jl_hrtime();
uint64_t mutator_time = gc_end_time == 0 ? old_nongc_time : gc_start_time - gc_end_time;
uint64_t mutator_time = gc_end_time == 0 ? old_mut_time : gc_start_time - gc_end_time;
uint64_t before_free_heap_size = jl_atomic_load_relaxed(&gc_heap_stats.heap_size);
int64_t last_perm_scanned_bytes = perm_scanned_bytes;
uint64_t start_mark_time = jl_hrtime();
Expand Down Expand Up @@ -3439,58 +3461,101 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
gc_num.last_incremental_sweep = gc_end_time;
}

// update the estimator of the GC performance and program behavior
size_t heap_size = jl_atomic_load_relaxed(&gc_heap_stats.heap_size);
double target_allocs = 0.0;
uint64_t min_interval = default_collect_interval;
uint64_t user_max = max_total_memory * 0.8;
uint64_t alloc_diff = before_free_heap_size - old_heap_size;
uint64_t freed_diff = before_free_heap_size - heap_size;
uint64_t target_heap;
const char *reason = ""; (void)reason; // for GC_TIME output stats
old_heap_size = heap_size; // TODO: Update these values dynamically instead of just during the GC
if (collection == JL_GC_AUTO) {
// do not update any heuristics when the user forces GC
// update any heuristics only when the user does not force the GC
// but still update the timings, since GC was run and reset, even if it was too early
uint64_t target_allocs = 0.0;
double alloc_smooth_factor = 0.95;
double collect_smooth_factor = 0.5;
double tuning_factor = 0.03;
double tuning_factor = 1e5;
uint64_t alloc_mem = jl_gc_smooth(old_alloc_diff, alloc_diff, alloc_smooth_factor);
uint64_t nongc_time = jl_gc_smooth(old_nongc_time, mutator_time + sweep_time, alloc_smooth_factor); // Charge sweeping to the mutator
uint64_t alloc_time = jl_gc_smooth(old_mut_time, mutator_time, alloc_smooth_factor); // TODO: subtract estimated finalizer time?
uint64_t gc_mem = jl_gc_smooth(old_freed_diff, freed_diff, collect_smooth_factor);
uint64_t gc_time = jl_gc_smooth(old_pause_time, pause - sweep_time, collect_smooth_factor);
old_alloc_diff = alloc_mem;
old_nongc_time = nongc_time;
old_mut_time = alloc_time;
old_freed_diff = gc_mem;
old_pause_time = gc_time;
if (gc_time > alloc_time && !(thrash_counter < 4)) // thrashing if GC marking more than 50% of the runtime
// thrashing estimator: if GC time more than 50% of the runtime
if (pause > mutator_time && !(thrash_counter < 4))
thrash_counter += 1;
else if (thrash_counter > 0)
thrash_counter -= 1;
if (alloc_mem != 0 && alloc_time != 0 && gc_mem != 0 && gc_time != 0 ) {
if (alloc_mem != 0 && alloc_time != 0 && gc_mem != 0 && gc_time != 0) {
double alloc_rate = (double)alloc_mem/alloc_time;
double gc_rate = (double)gc_mem/gc_time;
target_allocs = sqrt(((double)heap_size/min_interval * alloc_rate)/(gc_rate * tuning_factor)); // work on multiples of min interval
target_allocs = sqrt((double)heap_size * alloc_rate / gc_rate) * tuning_factor;
}

if (thrashing == 0 && thrash_counter >= 3) {
// require 3 consecutive thrashing cycles to force the default allocator rate
thrashing = 1;
// and require 4 default allocations to clear
thrash_counter = 6;
}
else if (thrashing == 1 && thrash_counter <= 2) {
thrashing = 0; // maybe we should report this to the user or error out?
}
old_heap_size = heap_size; // TODO: Update these values dynamically instead of just during the GC
}
if (thrashing == 0 && thrash_counter >= 3)
thrashing = 1;
else if (thrashing == 1 && thrash_counter <= 2)
thrashing = 0; // maybe we should report this to the user or error out?

int bad_result = (target_allocs*min_interval + heap_size) > 2 * jl_atomic_load_relaxed(&gc_heap_stats.heap_target); // Don't follow through on a bad decision
if (target_allocs == 0.0 || thrashing || bad_result) // If we are thrashing go back to default
target_allocs = 2*sqrt((double)heap_size/min_interval);
uint64_t target_heap = (uint64_t)target_allocs*min_interval + heap_size;
if (target_heap > max_total_memory && !thrashing) // Allow it to go over if we are thrashing if we die we die
target_heap = max_total_memory;
else if (target_heap < default_collect_interval)
target_heap = default_collect_interval;
jl_atomic_store_relaxed(&gc_heap_stats.heap_target, target_heap);
target_heap = target_allocs + heap_size;
// optionally smooth this:
// target_heap = jl_gc_smooth(jl_atomic_load_relaxed(&gc_heap_stats.heap_target), target_heap, alloc_smooth_factor);

// compute some guardrails values
uint64_t min_target_allocs = heap_size / 10; // minimum 10% of current heap
if (min_target_allocs < default_collect_interval / 8) // unless the heap is small
min_target_allocs = default_collect_interval / 8;
uint64_t max_target_allocs = overallocation(before_free_heap_size, heap_size, user_max);
if (max_target_allocs < min_target_allocs)
max_target_allocs = min_target_allocs;
// respect max_total_memory first
if (target_heap > user_max) {
target_allocs = heap_size < user_max ? user_max - heap_size : 1;
reason = " user limit";
}
// If we are thrashing use a default only (an average) for a couple collections
if (thrashing) {
uint64_t thrashing_allocs = sqrt((double)min_target_allocs * max_target_allocs);
if (target_allocs < thrashing_allocs) {
target_allocs = thrashing_allocs;
reason = " thrashing";
}
}
// then add the guardrails for transient issues
if (target_allocs > max_target_allocs) {
target_allocs = max_target_allocs;
reason = " rate limit max";
}
else if (target_allocs < min_target_allocs) {
target_allocs = min_target_allocs;
reason = " min limit";
}
// and set the heap detection threshold
target_heap = target_allocs + heap_size;
if (target_heap < default_collect_interval) {
target_heap = default_collect_interval;
reason = " min heap";
}
jl_atomic_store_relaxed(&gc_heap_stats.heap_target, target_heap);
}
else {
target_heap = jl_atomic_load_relaxed(&gc_heap_stats.heap_target);
}

double old_ratio = (double)promoted_bytes/(double)heap_size;
if (heap_size > max_total_memory * 0.8 || old_ratio > 0.15)
if (heap_size > user_max || old_ratio > 0.15)
next_sweep_full = 1;
else
next_sweep_full = 0;
if (heap_size > max_total_memory * 0.8 || thrashing)
if (heap_size > user_max || thrashing)
under_pressure = 1;
// sweeping is over
// 7. if it is a quick sweep, put back the remembered objects in queued state
Expand Down Expand Up @@ -3543,11 +3608,10 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
if (collection == JL_GC_AUTO) {
gc_heuristics_summary(
old_alloc_diff, alloc_diff,
old_nongc_time, mutator_time + sweep_time,
old_mut_time, mutator_time,
old_freed_diff, freed_diff,
old_pause_time, pause - sweep_time,
thrash_counter,
thrash_counter, reason,
heap_size, target_heap);
}

Expand Down
6 changes: 2 additions & 4 deletions src/gc.h
Original file line number Diff line number Diff line change
Expand Up @@ -521,11 +521,10 @@ void gc_time_summary(int sweep_full, uint64_t start, uint64_t end,
uint64_t sweep);
void gc_heuristics_summary(
uint64_t old_alloc_diff, uint64_t alloc_mem,
uint64_t old_nongc_time, uint64_t nongc_time,
uint64_t old_mut_time, uint64_t alloc_time,
uint64_t old_freed_diff, uint64_t gc_mem,
uint64_t old_pause_time, uint64_t gc_time,
int thrash_counter,
int thrash_counter, const char *reason,
uint64_t current_heap, uint64_t target_heap);
#else
#define gc_time_pool_start()
Expand Down Expand Up @@ -556,11 +555,10 @@ STATIC_INLINE void gc_time_count_mallocd_memory(int bits) JL_NOTSAFEPOINT
interval, pause, ttsp, mark, sweep)
#define gc_heuristics_summary( \
old_alloc_diff, alloc_mem, \
old_nongc_time, nongc_time, \
old_mut_time, alloc_time, \
old_freed_diff, gc_mem, \
old_pause_time, gc_time, \
thrash_counter, \
thrash_counter, reason, \
current_heap, target_heap)
#endif

Expand Down

0 comments on commit 10aa815

Please sign in to comment.