Skip to content

Commit 08a4289

Browse files
committed
[mono][interp] Reduce false pinning from interp stack
Interpreter opcodes operate on the interp stack, an area of memory separately allocated. Each interp var will have an allocated stack offset in the current interpreter stack frame. When we allocate the storage for an interp var we can take into account the var type. If the type can represent a potential ref to an object or an interior ref then we mark the pointer slot as potentially containing refs, for the method that is being compiled. During GC, we used to conservatively scan the entire interp stack space used by each thread. After this change, in the first stage, we do a stack walk where we detect slots in each interp frame where no refs can reside. We mark these slots in a bit array. Afterwards we conservatively scan the interp stack of the thread, while ignoring slots that were previously marked as not containing any refs. The System.Runtime.Tests suite was used for testing the effectiveness of the change, by computing the cumulative number of pinned objects throughout all GCs (about 1100): minijit - avg 702000 pinned objects; old-interp - avg 641000 pinned objects; precise-interp - avg 578000 pinned objects. This resulted in a 10% reduction in the number of pinned objects during collection. This change is meant to reduce memory usage of apps by making objects die earlier. We could further improve by being more precise. For example, for call sites we could reuse liveness information to precisely know which slots actually contain refs. This is a bit more complex to implement and it is unclear yet how impactful it would be.
1 parent 71e989d commit 08a4289

File tree

6 files changed

+167
-4
lines changed

6 files changed

+167
-4
lines changed

src/mono/mono/metadata/class-getters.h

+1
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ MONO_CLASS_GETTER(m_class_is_delegate, gboolean, , MonoClass, delegate)
3939
MONO_CLASS_GETTER(m_class_is_gc_descr_inited, gboolean, , MonoClass, gc_descr_inited)
4040
MONO_CLASS_GETTER(m_class_has_cctor, gboolean, , MonoClass, has_cctor)
4141
MONO_CLASS_GETTER(m_class_has_references, gboolean, , MonoClass, has_references)
42+
MONO_CLASS_GETTER(m_class_has_ref_fields, gboolean, , MonoClass, has_ref_fields)
4243
MONO_CLASS_GETTER(m_class_has_static_refs, gboolean, , MonoClass, has_static_refs)
4344
MONO_CLASS_GETTER(m_class_has_no_special_static_fields, gboolean, , MonoClass, no_special_static_fields)
4445
MONO_CLASS_GETTER(m_class_is_nested_classes_inited, gboolean, , MonoClass, nested_classes_inited)

src/mono/mono/mini/interp/interp-internals.h

+4
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ struct InterpMethod {
145145
MonoFtnDesc *ftndesc_unbox;
146146
MonoDelegateTrampInfo *del_info;
147147

148+
/* locals_size is equal to the offset of the param_area */
148149
guint32 locals_size;
149150
guint32 alloca_size;
150151
int num_clauses; // clauses
@@ -153,6 +154,7 @@ struct InterpMethod {
153154
unsigned int hasthis; // boolean
154155
MonoProfilerCallInstrumentationFlags prof_flags;
155156
InterpMethodCodeType code_type;
157+
MonoBitSet *ref_slots;
156158
#ifdef ENABLE_EXPERIMENT_TIERED
157159
MiniTieredCounter tiered_counter;
158160
#endif
@@ -268,6 +270,8 @@ typedef struct {
268270
guchar *stack_pointer;
269271
/* Used for allocation of localloc regions */
270272
FrameDataAllocator data_stack;
273+
/* If bit n is set, it means that the n-th stack slot (pointer sized) from stack_start doesn't contain any refs */
274+
guint8 *no_ref_slots;
271275
} ThreadContext;
272276

273277
typedef struct {

src/mono/mono/mini/interp/interp.c

+65-3
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,8 @@ get_context (void)
412412
if (context == NULL) {
413413
context = g_new0 (ThreadContext, 1);
414414
context->stack_start = (guchar*)mono_valloc_aligned (INTERP_STACK_SIZE, MINT_STACK_ALIGNMENT, MONO_MMAP_READ | MONO_MMAP_WRITE, MONO_MEM_ACCOUNT_INTERP_STACK);
415+
// A bit for every pointer sized slot in the stack. FIXME don't allocate whole bit array
416+
context->no_ref_slots = (guchar*)mono_valloc (NULL, INTERP_STACK_SIZE / (8 * sizeof (gpointer)), MONO_MMAP_READ | MONO_MMAP_WRITE, MONO_MEM_ACCOUNT_INTERP_STACK);
415417
context->stack_end = context->stack_start + INTERP_STACK_SIZE - INTERP_REDZONE_SIZE;
416418
context->stack_real_end = context->stack_start + INTERP_STACK_SIZE;
417419
/* We reserve a stack slot at the top of the interp stack to make temp objects visible to GC */
@@ -8473,6 +8475,57 @@ interp_stop_single_stepping (void)
84738475
ss_enabled = FALSE;
84748476
}
84758477

8478+
8479+
static void
8480+
interp_mark_frame_no_ref_slots (ThreadContext *context, InterpFrame *frame, gpointer *top_limit)
8481+
{
8482+
InterpMethod *imethod = frame->imethod;
8483+
gpointer *frame_stack = (gpointer*)frame->stack;
8484+
gpointer *frame_stack_end = (gpointer*)((guchar*)frame->stack + imethod->alloca_size);
8485+
// The way interpreter implements calls is by moving arguments to the param area, at the
8486+
// top of the stack and then proceed with the call. Up to the moment of the call these slots
8487+
// are owned by the calling frame. Once we do the call, the stack pointer of the called
8488+
// frame will point inside the param area of the calling frame.
8489+
//
8490+
// We mark no ref slots from top to bottom and we use the top limit to ignore slots
8491+
// that were already handled in the called frame.
8492+
if (top_limit && top_limit < frame_stack_end)
8493+
frame_stack_end = top_limit;
8494+
8495+
for (gpointer *current = frame_stack; current < frame_stack_end; current++) {
8496+
gsize slot_index = current - frame_stack;
8497+
if (!mono_bitset_test_fast (imethod->ref_slots, slot_index)) {
8498+
gsize global_slot_index = current - (gpointer*)context->stack_start;
8499+
gsize table_index = global_slot_index / 8;
8500+
int bit_index = global_slot_index % 8;
8501+
context->no_ref_slots [table_index] |= 1 << bit_index;
8502+
}
8503+
}
8504+
}
8505+
8506+
static void
8507+
interp_mark_no_ref_slots (ThreadContext *context, MonoLMF* lmf)
8508+
{
8509+
memset (context->no_ref_slots, 0, (context->stack_pointer - context->stack_start) / (8 * sizeof (gpointer)) + 1);
8510+
while (lmf) {
8511+
if ((gsize)lmf->previous_lmf & 2) {
8512+
MonoLMFExt *lmf_ext = (MonoLMFExt*) lmf;
8513+
if (lmf_ext->kind == MONO_LMFEXT_INTERP_EXIT || lmf_ext->kind == MONO_LMFEXT_INTERP_EXIT_WITH_CTX) {
8514+
InterpFrame *frame = (InterpFrame*)lmf_ext->interp_exit_data;
8515+
gpointer *top_limit = NULL;
8516+
while (frame) {
8517+
if (frame->imethod) {
8518+
interp_mark_frame_no_ref_slots (context, frame, top_limit);
8519+
top_limit = (gpointer*)frame->stack;
8520+
}
8521+
frame = frame->parent;
8522+
}
8523+
}
8524+
}
8525+
lmf = (MonoLMF*)((gsize)lmf->previous_lmf & ~3);
8526+
}
8527+
}
8528+
84768529
/*
84778530
* interp_mark_stack:
84788531
*
@@ -8505,9 +8558,18 @@ interp_mark_stack (gpointer thread_data, GcScanFunc func, gpointer gc_data, gboo
85058558
if (!context || !context->stack_start)
85068559
return;
85078560

8508-
// FIXME: Scan the whole area with 1 call
8509-
for (gpointer *p = (gpointer*)context->stack_start; p < (gpointer*)context->stack_pointer; p++)
8510-
func (p, gc_data);
8561+
MonoLMF **lmf_addr = (MonoLMF**)info->tls [TLS_KEY_LMF_ADDR];
8562+
if (lmf_addr)
8563+
interp_mark_no_ref_slots (context, *lmf_addr);
8564+
8565+
int slot_index = 0;
8566+
for (gpointer *p = (gpointer*)context->stack_start; p < (gpointer*)context->stack_pointer; p++) {
8567+
if (context->no_ref_slots [slot_index / 8] & (1 << (slot_index % 8)))
8568+
;// This slot is marked as no ref, we don't scan it
8569+
else
8570+
func (p, gc_data);
8571+
slot_index++;
8572+
}
85118573

85128574
FrameDataFragment *frag;
85138575
for (frag = context->data_stack.first; frag; frag = frag->next) {

src/mono/mono/mini/interp/transform-opt.c

+6-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,9 @@ alloc_var_offset (TransformData *td, int local, gint32 *ptos)
3232
int
3333
interp_alloc_global_var_offset (TransformData *td, int var)
3434
{
35-
return alloc_var_offset (td, var, &td->total_locals_size);
35+
int offset = alloc_var_offset (td, var, &td->total_locals_size);
36+
interp_mark_ref_slots_for_var (td, var);
37+
return offset;
3638
}
3739

3840
static void
@@ -464,6 +466,8 @@ interp_alloc_offsets (TransformData *td)
464466
add_active_call (td, &ac, td->vars [var].call);
465467
} else if (!td->vars [var].global && td->vars [var].offset == -1) {
466468
alloc_var_offset (td, var, &current_offset);
469+
interp_mark_ref_slots_for_var (td, var);
470+
467471
if (current_offset > final_total_locals_size)
468472
final_total_locals_size = current_offset;
469473

@@ -492,6 +496,7 @@ interp_alloc_offsets (TransformData *td)
492496
// These are allocated separately at the end of the stack
493497
if (td->vars [i].call_args) {
494498
td->vars [i].offset += td->param_area_offset;
499+
interp_mark_ref_slots_for_var (td, i);
495500
final_total_locals_size = MAX (td->vars [i].offset + td->vars [i].size, final_total_locals_size);
496501
}
497502
}

src/mono/mono/mini/interp/transform.c

+87
Original file line numberDiff line numberDiff line change
@@ -4346,6 +4346,7 @@ interp_method_compute_offsets (TransformData *td, InterpMethod *imethod, MonoMet
43464346
td->vars [i].size = size;
43474347
offset = ALIGN_TO (offset, align);
43484348
td->vars [i].offset = offset;
4349+
interp_mark_ref_slots_for_var (td, i);
43494350
offset += size;
43504351
}
43514352
offset = ALIGN_TO (offset, MINT_STACK_ALIGNMENT);
@@ -4371,6 +4372,7 @@ interp_method_compute_offsets (TransformData *td, InterpMethod *imethod, MonoMet
43714372
td->vars [index].mt = mono_mint_type (header->locals [i]);
43724373
td->vars [index].ext_index = -1;
43734374
td->vars [index].size = size;
4375+
interp_mark_ref_slots_for_var (td, index);
43744376
// Every local takes a MINT_STACK_SLOT_SIZE so IL locals have same behavior as execution locals
43754377
offset += size;
43764378
}
@@ -8507,6 +8509,75 @@ get_short_brop (int opcode)
85078509
return opcode;
85088510
}
85098511

8512+
static void
8513+
interp_mark_ref_slots_for_vt (TransformData *td, int base_offset, MonoClass *klass)
8514+
{
8515+
if (!m_class_has_references (klass) && !m_class_has_ref_fields (klass))
8516+
return;
8517+
8518+
gpointer iter = NULL;
8519+
MonoClassField *field;
8520+
while ((field = mono_class_get_fields_internal (klass, &iter))) {
8521+
MonoType *ftype = mono_field_get_type_internal (field);
8522+
if (ftype->attrs & FIELD_ATTRIBUTE_STATIC)
8523+
continue;
8524+
int offset = base_offset + m_field_get_offset (field) - MONO_ABI_SIZEOF (MonoObject);
8525+
retry:
8526+
if (mini_type_is_reference (ftype) || ftype->type == MONO_TYPE_I || ftype->type == MONO_TYPE_U || m_type_is_byref (ftype)) {
8527+
int index = offset / sizeof (gpointer);
8528+
mono_bitset_set_fast (td->ref_slots, index);
8529+
if (td->verbose_level)
8530+
g_print ("Stack ref slot vt field at off %d\n", offset);
8531+
} else if (ftype->type == MONO_TYPE_VALUETYPE || ftype->type == MONO_TYPE_GENERICINST) {
8532+
interp_mark_ref_slots_for_vt (td, offset, mono_class_from_mono_type_internal (ftype));
8533+
}
8534+
8535+
if (m_class_is_inlinearray (klass)) {
8536+
int max_offset = base_offset + m_class_get_instance_size (klass) - MONO_ABI_SIZEOF (MonoObject);
8537+
int align;
8538+
int field_size = mono_type_size (ftype, &align);
8539+
offset += field_size;
8540+
offset = ALIGN_TO (offset, align);
8541+
if (offset < max_offset)
8542+
goto retry;
8543+
}
8544+
}
8545+
}
8546+
8547+
void
8548+
interp_mark_ref_slots_for_var (TransformData *td, int var)
8549+
{
8550+
g_assert (td->vars [var].offset != -1);
8551+
8552+
gsize max_index = (td->vars [var].offset + td->vars [var].size) / sizeof (gpointer);
8553+
8554+
if (!td->ref_slots || max_index >= td->ref_slots->size) {
8555+
guint32 old_size = td->ref_slots ? (guint32)td->ref_slots->size : 0;
8556+
guint32 new_size = old_size ? old_size * 2 : 32;
8557+
8558+
gpointer mem = mono_mempool_alloc0 (td->mempool, mono_bitset_alloc_size (new_size, 0));
8559+
MonoBitSet *new_ref_slots = mono_bitset_mem_new (mem, new_size, 0);
8560+
8561+
if (old_size)
8562+
memcpy (&new_ref_slots->data, &td->ref_slots->data, old_size / 8);
8563+
td->ref_slots = new_ref_slots;
8564+
}
8565+
8566+
MonoType *type = td->vars [var].type;
8567+
if (td->vars [var].mt == MINT_TYPE_VT) {
8568+
MonoClass *klass = mono_class_from_mono_type_internal (type);
8569+
interp_mark_ref_slots_for_vt (td, td->vars [var].offset, klass);
8570+
} else {
8571+
// Managed pointers in interp are normally MONO_TYPE_I
8572+
if (mini_type_is_reference (type) || type->type == MONO_TYPE_I || type->type == MONO_TYPE_U || m_type_is_byref (type)) {
8573+
int index = td->vars [var].offset / sizeof (gpointer);
8574+
mono_bitset_set_fast (td->ref_slots, index);
8575+
if (td->verbose_level)
8576+
g_print ("Stack ref slot at off %d for var %d\n", index * sizeof (gpointer), var);
8577+
}
8578+
}
8579+
}
8580+
85108581
static int
85118582
get_var_offset (TransformData *td, int var)
85128583
{
@@ -8526,6 +8597,7 @@ get_var_offset (TransformData *td, int var)
85268597
g_assert (td->vars [var].execution_stack);
85278598

85288599
td->vars [var].offset = td->total_locals_size + td->vars [var].stack_offset;
8600+
interp_mark_ref_slots_for_var (td, var);
85298601
return td->vars [var].offset;
85308602
}
85318603

@@ -9155,6 +9227,21 @@ generate (MonoMethod *method, MonoMethodHeader *header, InterpMethod *rtm, MonoG
91559227
mono_interp_register_imethod_data_items (rtm->data_items, td->imethod_items);
91569228
rtm->patchpoint_data = td->patchpoint_data;
91579229

9230+
if (td->ref_slots) {
9231+
gpointer ref_slots_mem = mono_mem_manager_alloc0 (td->mem_manager, mono_bitset_alloc_size (rtm->alloca_size / sizeof (gpointer), 0));
9232+
rtm->ref_slots = mono_bitset_mem_new (ref_slots_mem, rtm->alloca_size / sizeof (gpointer), 0);
9233+
gsize copy_size = rtm->ref_slots->size;
9234+
if (td->ref_slots->size < copy_size)
9235+
copy_size = td->ref_slots->size;
9236+
memcpy (&rtm->ref_slots->data, &td->ref_slots->data, copy_size / 8);
9237+
if (!td->optimized) {
9238+
// Unoptimized code can have some stack slot moving patterns as part of calls.
9239+
// Just conservatively mark all these slots as potentially containing refs.
9240+
for (guint32 offset = rtm->locals_size; offset < rtm->alloca_size; offset += sizeof (gpointer))
9241+
mono_bitset_set (rtm->ref_slots, offset / sizeof (gpointer));
9242+
}
9243+
}
9244+
91589245
/* Save debug info */
91599246
interp_save_debug_info (rtm, header, td, td->line_numbers);
91609247

src/mono/mono/mini/interp/transform.h

+4
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,8 @@ typedef struct
340340
int inline_depth;
341341
int patchpoint_data_n;
342342
int *patchpoint_data;
343+
// This marks each stack slot offset that might contain refs throughout the execution of this method
344+
MonoBitSet *ref_slots;
343345
guint has_localloc : 1;
344346
// If method compilation fails due to certain limits being exceeded, we disable inlining
345347
// and retry compilation.
@@ -543,6 +545,8 @@ interp_foreach_ins_var (TransformData *td, InterpInst *ins, gpointer data, void
543545
void
544546
interp_foreach_ins_svar (TransformData *td, InterpInst *ins, gpointer data, void (*callback)(TransformData*, int*, gpointer));
545547

548+
void
549+
interp_mark_ref_slots_for_var (TransformData *td, int var);
546550

547551
/* Forward definitions for simd methods */
548552
static gboolean

0 commit comments

Comments
 (0)