Skip to content

Commit

Permalink
[mono][interp] Reduce false pinning from interp stack (dotnet#100400)
Browse files Browse the repository at this point in the history
* [mono][interp] Reduce false pinning from interp stack

Interpreter opcodes operate on the interp stack, an area of memory separately allocated. Each interp var will have an allocated stack offset in the current interpreter stack frame. When we allocate the storage for an interp var we can take into account the var type. If the type can represent a potential ref to an object or an interior ref then we mark the pointer slot as potentially containing refs, for the method that is being compiled.

During GC, we used to conservatively scan the entire interp stack space used by each thread. After this change, in the first stage, we do a stack walkwhere we detect slots in each interp frame where no refs can reside. We mark these slots in a bit array. Afterwards we conservatively scan the interp stack of the thread, while ignoring slots that were previously marked as not containing any refs.

System.Runtime.Tests suite was used for testing the effectiveness of the change, by computing the cumulative number of pinned objects throughout all GCs (about 1100).
minijit		- avg 702000 pinned objects
old-interp	- avg 641000 pinned objects
precise-interp	- avg 578000 pinned objects

This resulted in 10% reduction in the number of pinned objects during collection. This change is meant to reduce memory usage of apps by making objects die earlier. We could further improve by being more precise. For example, for call sites we could reuse liveness information to precisely know which slots actually contain refs. This is a bit more complex to implement and it is unclear yet how impactful it would be.

* [mono][interp] Add option to disable precise scanning of stack

* [mono][interp] Fix pushing of byrefs on execution stack

A lot of times, when we were pushing a byref type on the stack during compilation, we would first get the mint_type which would be MINT_TYPE_I4/I8. From the mint_type we would then obtain the STACK_TYPE_I4/I8, losing information because it should have been STACK_TYPE_MP. Because of this, the underlying interp var would end up being created as MONO_TYPE_I4/I8 instead of MONO_TYPE_I. Add another method for pushing directly a MonoType, with less confusing indirections. Code around here could further be refactored. This is only relevant for GC stack scanning, since we would want to scan only slots containing MONO_TYPE_I.
  • Loading branch information
BrzVlad authored and matouskozak committed Apr 30, 2024
1 parent b301e02 commit 29b4361
Show file tree
Hide file tree
Showing 7 changed files with 203 additions and 16 deletions.
1 change: 1 addition & 0 deletions src/mono/mono/metadata/class-getters.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ MONO_CLASS_GETTER(m_class_is_delegate, gboolean, , MonoClass, delegate)
MONO_CLASS_GETTER(m_class_is_gc_descr_inited, gboolean, , MonoClass, gc_descr_inited)
MONO_CLASS_GETTER(m_class_has_cctor, gboolean, , MonoClass, has_cctor)
MONO_CLASS_GETTER(m_class_has_references, gboolean, , MonoClass, has_references)
MONO_CLASS_GETTER(m_class_has_ref_fields, gboolean, , MonoClass, has_ref_fields)
MONO_CLASS_GETTER(m_class_has_static_refs, gboolean, , MonoClass, has_static_refs)
MONO_CLASS_GETTER(m_class_has_no_special_static_fields, gboolean, , MonoClass, no_special_static_fields)
MONO_CLASS_GETTER(m_class_is_nested_classes_inited, gboolean, , MonoClass, nested_classes_inited)
Expand Down
4 changes: 4 additions & 0 deletions src/mono/mono/mini/interp/interp-internals.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ struct InterpMethod {
MonoFtnDesc *ftndesc_unbox;
MonoDelegateTrampInfo *del_info;

/* locals_size is equal to the offset of the param_area */
guint32 locals_size;
guint32 alloca_size;
int num_clauses; // clauses
Expand All @@ -153,6 +154,7 @@ struct InterpMethod {
unsigned int hasthis; // boolean
MonoProfilerCallInstrumentationFlags prof_flags;
InterpMethodCodeType code_type;
MonoBitSet *ref_slots;
#ifdef ENABLE_EXPERIMENT_TIERED
MiniTieredCounter tiered_counter;
#endif
Expand Down Expand Up @@ -268,6 +270,8 @@ typedef struct {
guchar *stack_pointer;
/* Used for allocation of localloc regions */
FrameDataAllocator data_stack;
/* If bit n is set, it means that the n-th stack slot (pointer sized) from stack_start doesn't contain any refs */
guint8 *no_ref_slots;
} ThreadContext;

typedef struct {
Expand Down
73 changes: 70 additions & 3 deletions src/mono/mono/mini/interp/interp.c
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,9 @@ get_context (void)
if (context == NULL) {
context = g_new0 (ThreadContext, 1);
context->stack_start = (guchar*)mono_valloc_aligned (INTERP_STACK_SIZE, MINT_STACK_ALIGNMENT, MONO_MMAP_READ | MONO_MMAP_WRITE, MONO_MEM_ACCOUNT_INTERP_STACK);
// A bit for every pointer sized slot in the stack. FIXME don't allocate whole bit array
if (mono_interp_opt & INTERP_OPT_PRECISE_GC)
context->no_ref_slots = (guchar*)mono_valloc (NULL, INTERP_STACK_SIZE / (8 * sizeof (gpointer)), MONO_MMAP_READ | MONO_MMAP_WRITE, MONO_MEM_ACCOUNT_INTERP_STACK);
context->stack_end = context->stack_start + INTERP_STACK_SIZE - INTERP_REDZONE_SIZE;
context->stack_real_end = context->stack_start + INTERP_STACK_SIZE;
/* We reserve a stack slot at the top of the interp stack to make temp objects visible to GC */
Expand Down Expand Up @@ -8011,6 +8014,8 @@ interp_parse_options (const char *options)
#endif
else if (strncmp (arg, "ssa", 3) == 0)
opt = INTERP_OPT_SSA;
else if (strncmp (arg, "precise", 7) == 0)
opt = INTERP_OPT_PRECISE_GC;
else if (strncmp (arg, "all", 3) == 0)
opt = ~INTERP_OPT_NONE;

Expand Down Expand Up @@ -8473,6 +8478,57 @@ interp_stop_single_stepping (void)
ss_enabled = FALSE;
}


static void
interp_mark_frame_no_ref_slots (ThreadContext *context, InterpFrame *frame, gpointer *top_limit)
{
InterpMethod *imethod = frame->imethod;
gpointer *frame_stack = (gpointer*)frame->stack;
gpointer *frame_stack_end = (gpointer*)((guchar*)frame->stack + imethod->alloca_size);
// The way interpreter implements calls is by moving arguments to the param area, at the
// top of the stack and then proceed with the call. Up to the moment of the call these slots
// are owned by the calling frame. Once we do the call, the stack pointer of the called
// frame will point inside the param area of the calling frame.
//
// We mark no ref slots from top to bottom and we use the top limit to ignore slots
// that were already handled in the called frame.
if (top_limit && top_limit < frame_stack_end)
frame_stack_end = top_limit;

for (gpointer *current = frame_stack; current < frame_stack_end; current++) {
gsize slot_index = current - frame_stack;
if (!mono_bitset_test_fast (imethod->ref_slots, slot_index)) {
gsize global_slot_index = current - (gpointer*)context->stack_start;
gsize table_index = global_slot_index / 8;
int bit_index = global_slot_index % 8;
context->no_ref_slots [table_index] |= 1 << bit_index;
}
}
}

static void
interp_mark_no_ref_slots (ThreadContext *context, MonoLMF* lmf)
{
memset (context->no_ref_slots, 0, (context->stack_pointer - context->stack_start) / (8 * sizeof (gpointer)) + 1);
while (lmf) {
if ((gsize)lmf->previous_lmf & 2) {
MonoLMFExt *lmf_ext = (MonoLMFExt*) lmf;
if (lmf_ext->kind == MONO_LMFEXT_INTERP_EXIT || lmf_ext->kind == MONO_LMFEXT_INTERP_EXIT_WITH_CTX) {
InterpFrame *frame = (InterpFrame*)lmf_ext->interp_exit_data;
gpointer *top_limit = NULL;
while (frame) {
if (frame->imethod) {
interp_mark_frame_no_ref_slots (context, frame, top_limit);
top_limit = (gpointer*)frame->stack;
}
frame = frame->parent;
}
}
}
lmf = (MonoLMF*)((gsize)lmf->previous_lmf & ~3);
}
}

/*
* interp_mark_stack:
*
Expand Down Expand Up @@ -8505,9 +8561,20 @@ interp_mark_stack (gpointer thread_data, GcScanFunc func, gpointer gc_data, gboo
if (!context || !context->stack_start)
return;

// FIXME: Scan the whole area with 1 call
for (gpointer *p = (gpointer*)context->stack_start; p < (gpointer*)context->stack_pointer; p++)
func (p, gc_data);
if (mono_interp_opt & INTERP_OPT_PRECISE_GC) {
MonoLMF **lmf_addr = (MonoLMF**)info->tls [TLS_KEY_LMF_ADDR];
if (lmf_addr)
interp_mark_no_ref_slots (context, *lmf_addr);
}

int slot_index = 0;
for (gpointer *p = (gpointer*)context->stack_start; p < (gpointer*)context->stack_pointer; p++) {
if (context->no_ref_slots && (context->no_ref_slots [slot_index / 8] & (1 << (slot_index % 8))))
;// This slot is marked as no ref, we don't scan it
else
func (p, gc_data);
slot_index++;
}

FrameDataFragment *frag;
for (frag = context->data_stack.first; frag; frag = frag->next) {
Expand Down
3 changes: 2 additions & 1 deletion src/mono/mono/mini/interp/interp.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ enum {
INTERP_OPT_JITERPRETER = 64,
#endif
INTERP_OPT_SSA = 128,
INTERP_OPT_DEFAULT = INTERP_OPT_INLINE | INTERP_OPT_CPROP | INTERP_OPT_SUPER_INSTRUCTIONS | INTERP_OPT_BBLOCKS | INTERP_OPT_TIERING | INTERP_OPT_SIMD | INTERP_OPT_SSA
INTERP_OPT_PRECISE_GC = 256,
INTERP_OPT_DEFAULT = INTERP_OPT_INLINE | INTERP_OPT_CPROP | INTERP_OPT_SUPER_INSTRUCTIONS | INTERP_OPT_BBLOCKS | INTERP_OPT_TIERING | INTERP_OPT_SIMD | INTERP_OPT_SSA | INTERP_OPT_PRECISE_GC
#if HOST_BROWSER
| INTERP_OPT_JITERPRETER
#endif
Expand Down
7 changes: 6 additions & 1 deletion src/mono/mono/mini/interp/transform-opt.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ alloc_var_offset (TransformData *td, int local, gint32 *ptos)
int
interp_alloc_global_var_offset (TransformData *td, int var)
{
return alloc_var_offset (td, var, &td->total_locals_size);
int offset = alloc_var_offset (td, var, &td->total_locals_size);
interp_mark_ref_slots_for_var (td, var);
return offset;
}

static void
Expand Down Expand Up @@ -464,6 +466,8 @@ interp_alloc_offsets (TransformData *td)
add_active_call (td, &ac, td->vars [var].call);
} else if (!td->vars [var].global && td->vars [var].offset == -1) {
alloc_var_offset (td, var, &current_offset);
interp_mark_ref_slots_for_var (td, var);

if (current_offset > final_total_locals_size)
final_total_locals_size = current_offset;

Expand Down Expand Up @@ -492,6 +496,7 @@ interp_alloc_offsets (TransformData *td)
// These are allocated separately at the end of the stack
if (td->vars [i].call_args) {
td->vars [i].offset += td->param_area_offset;
interp_mark_ref_slots_for_var (td, i);
final_total_locals_size = MAX (td->vars [i].offset + td->vars [i].size, final_total_locals_size);
}
}
Expand Down
Loading

0 comments on commit 29b4361

Please sign in to comment.