Skip to content

Commit 997f09c

Browse files
authored
[mono][interp] Reduce false pinning from interp stack (#100400)
* [mono][interp] Reduce false pinning from interp stack Interpreter opcodes operate on the interp stack, an area of memory separately allocated. Each interp var will have an allocated stack offset in the current interpreter stack frame. When we allocate the storage for an interp var we can take into account the var type. If the type can represent a potential ref to an object or an interior ref then we mark the pointer slot as potentially containing refs, for the method that is being compiled. During GC, we used to conservatively scan the entire interp stack space used by each thread. After this change, in the first stage, we do a stack walk where we detect slots in each interp frame where no refs can reside. We mark these slots in a bit array. Afterwards we conservatively scan the interp stack of the thread, while ignoring slots that were previously marked as not containing any refs. The System.Runtime.Tests suite was used for testing the effectiveness of the change, by computing the cumulative number of pinned objects throughout all GCs (about 1100): minijit - avg 702000 pinned objects; old-interp - avg 641000 pinned objects; precise-interp - avg 578000 pinned objects. This resulted in a 10% reduction in the number of pinned objects during collection. This change is meant to reduce memory usage of apps by making objects die earlier. We could further improve by being more precise. For example, for call sites we could reuse liveness information to precisely know which slots actually contain refs. This is a bit more complex to implement and it is unclear yet how impactful it would be. * [mono][interp] Add option to disable precise scanning of stack * [mono][interp] Fix pushing of byrefs on execution stack A lot of times, when we were pushing a byref type on the stack during compilation, we would first get the mint_type which would be MINT_TYPE_I4/I8. From the mint_type we would then obtain the STACK_TYPE_I4/I8, losing information because it should have been STACK_TYPE_MP. 
Because of this, the underlying interp var would end up being created as MONO_TYPE_I4/I8 instead of MONO_TYPE_I. Add another method for directly pushing a MonoType, with fewer confusing indirections. Code around here could be refactored further. This is only relevant for GC stack scanning, since we would want to scan only slots containing MONO_TYPE_I.
1 parent a971763 commit 997f09c

File tree

7 files changed

+203
-16
lines changed

7 files changed

+203
-16
lines changed

src/mono/mono/metadata/class-getters.h

+1
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ MONO_CLASS_GETTER(m_class_is_delegate, gboolean, , MonoClass, delegate)
3939
MONO_CLASS_GETTER(m_class_is_gc_descr_inited, gboolean, , MonoClass, gc_descr_inited)
4040
MONO_CLASS_GETTER(m_class_has_cctor, gboolean, , MonoClass, has_cctor)
4141
MONO_CLASS_GETTER(m_class_has_references, gboolean, , MonoClass, has_references)
42+
MONO_CLASS_GETTER(m_class_has_ref_fields, gboolean, , MonoClass, has_ref_fields)
4243
MONO_CLASS_GETTER(m_class_has_static_refs, gboolean, , MonoClass, has_static_refs)
4344
MONO_CLASS_GETTER(m_class_has_no_special_static_fields, gboolean, , MonoClass, no_special_static_fields)
4445
MONO_CLASS_GETTER(m_class_is_nested_classes_inited, gboolean, , MonoClass, nested_classes_inited)

src/mono/mono/mini/interp/interp-internals.h

+4
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ struct InterpMethod {
145145
MonoFtnDesc *ftndesc_unbox;
146146
MonoDelegateTrampInfo *del_info;
147147

148+
/* locals_size is equal to the offset of the param_area */
148149
guint32 locals_size;
149150
guint32 alloca_size;
150151
int num_clauses; // clauses
@@ -153,6 +154,7 @@ struct InterpMethod {
153154
unsigned int hasthis; // boolean
154155
MonoProfilerCallInstrumentationFlags prof_flags;
155156
InterpMethodCodeType code_type;
157+
MonoBitSet *ref_slots;
156158
#ifdef ENABLE_EXPERIMENT_TIERED
157159
MiniTieredCounter tiered_counter;
158160
#endif
@@ -268,6 +270,8 @@ typedef struct {
268270
guchar *stack_pointer;
269271
/* Used for allocation of localloc regions */
270272
FrameDataAllocator data_stack;
273+
/* If bit n is set, it means that the n-th stack slot (pointer sized) from stack_start doesn't contain any refs */
274+
guint8 *no_ref_slots;
271275
} ThreadContext;
272276

273277
typedef struct {

src/mono/mono/mini/interp/interp.c

+70-3
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,9 @@ get_context (void)
412412
if (context == NULL) {
413413
context = g_new0 (ThreadContext, 1);
414414
context->stack_start = (guchar*)mono_valloc_aligned (INTERP_STACK_SIZE, MINT_STACK_ALIGNMENT, MONO_MMAP_READ | MONO_MMAP_WRITE, MONO_MEM_ACCOUNT_INTERP_STACK);
415+
// A bit for every pointer sized slot in the stack. FIXME don't allocate whole bit array
416+
if (mono_interp_opt & INTERP_OPT_PRECISE_GC)
417+
context->no_ref_slots = (guchar*)mono_valloc (NULL, INTERP_STACK_SIZE / (8 * sizeof (gpointer)), MONO_MMAP_READ | MONO_MMAP_WRITE, MONO_MEM_ACCOUNT_INTERP_STACK);
415418
context->stack_end = context->stack_start + INTERP_STACK_SIZE - INTERP_REDZONE_SIZE;
416419
context->stack_real_end = context->stack_start + INTERP_STACK_SIZE;
417420
/* We reserve a stack slot at the top of the interp stack to make temp objects visible to GC */
@@ -8011,6 +8014,8 @@ interp_parse_options (const char *options)
80118014
#endif
80128015
else if (strncmp (arg, "ssa", 3) == 0)
80138016
opt = INTERP_OPT_SSA;
8017+
else if (strncmp (arg, "precise", 7) == 0)
8018+
opt = INTERP_OPT_PRECISE_GC;
80148019
else if (strncmp (arg, "all", 3) == 0)
80158020
opt = ~INTERP_OPT_NONE;
80168021

@@ -8473,6 +8478,57 @@ interp_stop_single_stepping (void)
84738478
ss_enabled = FALSE;
84748479
}
84758480

8481+
8482+
/*
 * interp_mark_frame_no_ref_slots:
 *
 *   Visit the pointer sized slots of FRAME, starting at frame->stack and ending at
 * frame->stack + imethod->alloca_size (or at TOP_LIMIT, when it is non-NULL and lower).
 * For every slot whose bit is clear in imethod->ref_slots, set the matching bit in
 * CONTEXT->no_ref_slots. Bits in no_ref_slots are indexed by the slot's position
 * relative to context->stack_start, 8 slots per byte.
 */
static void
8483+
interp_mark_frame_no_ref_slots (ThreadContext *context, InterpFrame *frame, gpointer *top_limit)
8484+
{
8485+
InterpMethod *imethod = frame->imethod;
8486+
gpointer *frame_stack = (gpointer*)frame->stack;
8487+
gpointer *frame_stack_end = (gpointer*)((guchar*)frame->stack + imethod->alloca_size);
8488+
// The way interpreter implements calls is by moving arguments to the param area, at the
8489+
// top of the stack and then proceed with the call. Up to the moment of the call these slots
8490+
// are owned by the calling frame. Once we do the call, the stack pointer of the called
8491+
// frame will point inside the param area of the calling frame.
8492+
//
8493+
// We mark no ref slots from top to bottom and we use the top limit to ignore slots
8494+
// that were already handled in the called frame.
8495+
if (top_limit && top_limit < frame_stack_end)
8496+
frame_stack_end = top_limit;
8497+
8498+
for (gpointer *current = frame_stack; current < frame_stack_end; current++) {
8499+
// Index of the slot relative to this frame, used to query the per-method ref_slots bitset
gsize slot_index = current - frame_stack;
8500+
if (!mono_bitset_test_fast (imethod->ref_slots, slot_index)) {
8501+
// Index of the same slot relative to the start of the thread's interp stack
gsize global_slot_index = current - (gpointer*)context->stack_start;
8502+
// no_ref_slots is a byte array: pick the byte, then the bit within that byte
gsize table_index = global_slot_index / 8;
8503+
int bit_index = global_slot_index % 8;
8504+
context->no_ref_slots [table_index] |= 1 << bit_index;
8505+
}
8506+
}
8507+
}
8508+
8509+
/*
 * interp_mark_no_ref_slots:
 *
 *   Recompute CONTEXT->no_ref_slots. The part of the bit array that covers the stack
 * between stack_start and stack_pointer is cleared first. Then the LMF chain starting
 * at LMF is walked: for every entry that has the value 2 set in previous_lmf and whose
 * MonoLMFExt kind is MONO_LMFEXT_INTERP_EXIT or MONO_LMFEXT_INTERP_EXIT_WITH_CTX, the
 * InterpFrame chain reachable from interp_exit_data through frame->parent is visited and
 * interp_mark_frame_no_ref_slots is called for each frame that has an imethod.
 */
static void
8510+
interp_mark_no_ref_slots (ThreadContext *context, MonoLMF* lmf)
8511+
{
8512+
// One bit per pointer sized slot. The division truncates, so the extra byte covers a partial trailing byte.
memset (context->no_ref_slots, 0, (context->stack_pointer - context->stack_start) / (8 * sizeof (gpointer)) + 1);
8513+
while (lmf) {
8514+
// NOTE(review): the value 2 in previous_lmf appears to tag the entry as a MonoLMFExt - confirm against the LMF definition
if ((gsize)lmf->previous_lmf & 2) {
8515+
MonoLMFExt *lmf_ext = (MonoLMFExt*) lmf;
8516+
if (lmf_ext->kind == MONO_LMFEXT_INTERP_EXIT || lmf_ext->kind == MONO_LMFEXT_INTERP_EXIT_WITH_CTX) {
8517+
InterpFrame *frame = (InterpFrame*)lmf_ext->interp_exit_data;
8518+
// No limit for the first frame visited; reset for every interp exit LMF entry
gpointer *top_limit = NULL;
8519+
while (frame) {
8520+
if (frame->imethod) {
8521+
interp_mark_frame_no_ref_slots (context, frame, top_limit);
8522+
// The parent frame must not mark slots at or above the start of this frame's stack
top_limit = (gpointer*)frame->stack;
8523+
}
8524+
frame = frame->parent;
8525+
}
8526+
}
8527+
}
8528+
// Mask off the two low bits of previous_lmf to obtain the pointer to the next entry
lmf = (MonoLMF*)((gsize)lmf->previous_lmf & ~3);
8529+
}
8530+
}
8531+
84768532
/*
84778533
* interp_mark_stack:
84788534
*
@@ -8505,9 +8561,20 @@ interp_mark_stack (gpointer thread_data, GcScanFunc func, gpointer gc_data, gboo
85058561
if (!context || !context->stack_start)
85068562
return;
85078563

8508-
// FIXME: Scan the whole area with 1 call
8509-
for (gpointer *p = (gpointer*)context->stack_start; p < (gpointer*)context->stack_pointer; p++)
8510-
func (p, gc_data);
8564+
if (mono_interp_opt & INTERP_OPT_PRECISE_GC) {
8565+
MonoLMF **lmf_addr = (MonoLMF**)info->tls [TLS_KEY_LMF_ADDR];
8566+
if (lmf_addr)
8567+
interp_mark_no_ref_slots (context, *lmf_addr);
8568+
}
8569+
8570+
int slot_index = 0;
8571+
for (gpointer *p = (gpointer*)context->stack_start; p < (gpointer*)context->stack_pointer; p++) {
8572+
if (context->no_ref_slots && (context->no_ref_slots [slot_index / 8] & (1 << (slot_index % 8))))
8573+
;// This slot is marked as no ref, we don't scan it
8574+
else
8575+
func (p, gc_data);
8576+
slot_index++;
8577+
}
85118578

85128579
FrameDataFragment *frag;
85138580
for (frag = context->data_stack.first; frag; frag = frag->next) {

src/mono/mono/mini/interp/interp.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,8 @@ enum {
4242
INTERP_OPT_JITERPRETER = 64,
4343
#endif
4444
INTERP_OPT_SSA = 128,
45-
INTERP_OPT_DEFAULT = INTERP_OPT_INLINE | INTERP_OPT_CPROP | INTERP_OPT_SUPER_INSTRUCTIONS | INTERP_OPT_BBLOCKS | INTERP_OPT_TIERING | INTERP_OPT_SIMD | INTERP_OPT_SSA
45+
INTERP_OPT_PRECISE_GC = 256,
46+
INTERP_OPT_DEFAULT = INTERP_OPT_INLINE | INTERP_OPT_CPROP | INTERP_OPT_SUPER_INSTRUCTIONS | INTERP_OPT_BBLOCKS | INTERP_OPT_TIERING | INTERP_OPT_SIMD | INTERP_OPT_SSA | INTERP_OPT_PRECISE_GC
4647
#if HOST_BROWSER
4748
| INTERP_OPT_JITERPRETER
4849
#endif

src/mono/mono/mini/interp/transform-opt.c

+6-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,9 @@ alloc_var_offset (TransformData *td, int local, gint32 *ptos)
3232
int
3333
interp_alloc_global_var_offset (TransformData *td, int var)
3434
{
35-
return alloc_var_offset (td, var, &td->total_locals_size);
35+
int offset = alloc_var_offset (td, var, &td->total_locals_size);
36+
interp_mark_ref_slots_for_var (td, var);
37+
return offset;
3638
}
3739

3840
static void
@@ -464,6 +466,8 @@ interp_alloc_offsets (TransformData *td)
464466
add_active_call (td, &ac, td->vars [var].call);
465467
} else if (!td->vars [var].global && td->vars [var].offset == -1) {
466468
alloc_var_offset (td, var, &current_offset);
469+
interp_mark_ref_slots_for_var (td, var);
470+
467471
if (current_offset > final_total_locals_size)
468472
final_total_locals_size = current_offset;
469473

@@ -492,6 +496,7 @@ interp_alloc_offsets (TransformData *td)
492496
// These are allocated separately at the end of the stack
493497
if (td->vars [i].call_args) {
494498
td->vars [i].offset += td->param_area_offset;
499+
interp_mark_ref_slots_for_var (td, i);
495500
final_total_locals_size = MAX (td->vars [i].offset + td->vars [i].size, final_total_locals_size);
496501
}
497502
}

0 commit comments

Comments
 (0)