Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[mono][interp] Reduce false pinning from interp stack #100400

Merged
merged 3 commits into from
Apr 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/mono/mono/metadata/class-getters.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ MONO_CLASS_GETTER(m_class_is_delegate, gboolean, , MonoClass, delegate)
MONO_CLASS_GETTER(m_class_is_gc_descr_inited, gboolean, , MonoClass, gc_descr_inited)
MONO_CLASS_GETTER(m_class_has_cctor, gboolean, , MonoClass, has_cctor)
MONO_CLASS_GETTER(m_class_has_references, gboolean, , MonoClass, has_references)
MONO_CLASS_GETTER(m_class_has_ref_fields, gboolean, , MonoClass, has_ref_fields)
MONO_CLASS_GETTER(m_class_has_static_refs, gboolean, , MonoClass, has_static_refs)
MONO_CLASS_GETTER(m_class_has_no_special_static_fields, gboolean, , MonoClass, no_special_static_fields)
MONO_CLASS_GETTER(m_class_is_nested_classes_inited, gboolean, , MonoClass, nested_classes_inited)
Expand Down
4 changes: 4 additions & 0 deletions src/mono/mono/mini/interp/interp-internals.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ struct InterpMethod {
MonoFtnDesc *ftndesc_unbox;
MonoDelegateTrampInfo *del_info;

/* locals_size is equal to the offset of the param_area */
guint32 locals_size;
guint32 alloca_size;
int num_clauses; // clauses
Expand All @@ -153,6 +154,7 @@ struct InterpMethod {
unsigned int hasthis; // boolean
MonoProfilerCallInstrumentationFlags prof_flags;
InterpMethodCodeType code_type;
MonoBitSet *ref_slots;
#ifdef ENABLE_EXPERIMENT_TIERED
MiniTieredCounter tiered_counter;
#endif
Expand Down Expand Up @@ -268,6 +270,8 @@ typedef struct {
guchar *stack_pointer;
/* Used for allocation of localloc regions */
FrameDataAllocator data_stack;
/* If bit n is set, it means that the n-th stack slot (pointer sized) from stack_start doesn't contain any refs */
guint8 *no_ref_slots;
} ThreadContext;

typedef struct {
Expand Down
73 changes: 70 additions & 3 deletions src/mono/mono/mini/interp/interp.c
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,9 @@ get_context (void)
if (context == NULL) {
context = g_new0 (ThreadContext, 1);
context->stack_start = (guchar*)mono_valloc_aligned (INTERP_STACK_SIZE, MINT_STACK_ALIGNMENT, MONO_MMAP_READ | MONO_MMAP_WRITE, MONO_MEM_ACCOUNT_INTERP_STACK);
// A bit for every pointer sized slot in the stack. FIXME don't allocate whole bit array
if (mono_interp_opt & INTERP_OPT_PRECISE_GC)
context->no_ref_slots = (guchar*)mono_valloc (NULL, INTERP_STACK_SIZE / (8 * sizeof (gpointer)), MONO_MMAP_READ | MONO_MMAP_WRITE, MONO_MEM_ACCOUNT_INTERP_STACK);
context->stack_end = context->stack_start + INTERP_STACK_SIZE - INTERP_REDZONE_SIZE;
context->stack_real_end = context->stack_start + INTERP_STACK_SIZE;
/* We reserve a stack slot at the top of the interp stack to make temp objects visible to GC */
Expand Down Expand Up @@ -8011,6 +8014,8 @@ interp_parse_options (const char *options)
#endif
else if (strncmp (arg, "ssa", 3) == 0)
opt = INTERP_OPT_SSA;
else if (strncmp (arg, "precise", 7) == 0)
opt = INTERP_OPT_PRECISE_GC;
else if (strncmp (arg, "all", 3) == 0)
opt = ~INTERP_OPT_NONE;

Expand Down Expand Up @@ -8473,6 +8478,57 @@ interp_stop_single_stepping (void)
ss_enabled = FALSE;
}


static void
interp_mark_frame_no_ref_slots (ThreadContext *context, InterpFrame *frame, gpointer *top_limit)
{
InterpMethod *imethod = frame->imethod;
gpointer *frame_stack = (gpointer*)frame->stack;
gpointer *frame_stack_end = (gpointer*)((guchar*)frame->stack + imethod->alloca_size);
// The way interpreter implements calls is by moving arguments to the param area, at the
// top of the stack and then proceed with the call. Up to the moment of the call these slots
// are owned by the calling frame. Once we do the call, the stack pointer of the called
// frame will point inside the param area of the calling frame.
//
// We mark no ref slots from top to bottom and we use the top limit to ignore slots
// that were already handled in the called frame.
if (top_limit && top_limit < frame_stack_end)
frame_stack_end = top_limit;

for (gpointer *current = frame_stack; current < frame_stack_end; current++) {
gsize slot_index = current - frame_stack;
if (!mono_bitset_test_fast (imethod->ref_slots, slot_index)) {
gsize global_slot_index = current - (gpointer*)context->stack_start;
gsize table_index = global_slot_index / 8;
int bit_index = global_slot_index % 8;
context->no_ref_slots [table_index] |= 1 << bit_index;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we use mono_bitset_set_fast here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could, but the use case was simple enough that I went with the manual implementation.

}
}
}

static void
interp_mark_no_ref_slots (ThreadContext *context, MonoLMF* lmf)
{
memset (context->no_ref_slots, 0, (context->stack_pointer - context->stack_start) / (8 * sizeof (gpointer)) + 1);
while (lmf) {
if ((gsize)lmf->previous_lmf & 2) {
MonoLMFExt *lmf_ext = (MonoLMFExt*) lmf;
if (lmf_ext->kind == MONO_LMFEXT_INTERP_EXIT || lmf_ext->kind == MONO_LMFEXT_INTERP_EXIT_WITH_CTX) {
InterpFrame *frame = (InterpFrame*)lmf_ext->interp_exit_data;
gpointer *top_limit = NULL;
while (frame) {
if (frame->imethod) {
interp_mark_frame_no_ref_slots (context, frame, top_limit);
top_limit = (gpointer*)frame->stack;
}
frame = frame->parent;
}
}
}
lmf = (MonoLMF*)((gsize)lmf->previous_lmf & ~3);
}
}

/*
* interp_mark_stack:
*
Expand Down Expand Up @@ -8505,9 +8561,20 @@ interp_mark_stack (gpointer thread_data, GcScanFunc func, gpointer gc_data, gboo
if (!context || !context->stack_start)
return;

// FIXME: Scan the whole area with 1 call
for (gpointer *p = (gpointer*)context->stack_start; p < (gpointer*)context->stack_pointer; p++)
func (p, gc_data);
if (mono_interp_opt & INTERP_OPT_PRECISE_GC) {
MonoLMF **lmf_addr = (MonoLMF**)info->tls [TLS_KEY_LMF_ADDR];
if (lmf_addr)
interp_mark_no_ref_slots (context, *lmf_addr);
}

int slot_index = 0;
for (gpointer *p = (gpointer*)context->stack_start; p < (gpointer*)context->stack_pointer; p++) {
if (context->no_ref_slots && (context->no_ref_slots [slot_index / 8] & (1 << (slot_index % 8))))
;// This slot is marked as no ref, we don't scan it
else
func (p, gc_data);
slot_index++;
Comment on lines +8569 to +8576
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just summarizing so I understand how this all works.

The way this works is that sgen calls interp_mark_stack twice: once during the conservative phase and once during the precise phase (if the precise phase is enabled). This code runs during the conservative phase. The no_ref_slots bits are conservative: we're either sure there's definitely no managed pointer in the slot, or we're not sure what's in the slot. This is because sometimes we push managed pointers with MONO_TYPE_I.

So if we're sure the slot isn't a pointer, we don't scan it at all. otherwise if we're not sure - we scan it, and possibly create false pinning.

So it's not important that this PR precisely tracks every single managed pointer opcode. But if we see a slot that can't possibly contain a managed pointer, we may mark it and potentially avoid some false pinning.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

/cc @cshung FYI

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would add for clarity that the ambiguity of MONO_TYPE_I is not really that relevant to the pinning story. Let's say we have two vars, one int32 and one of type object that are allocated at the same offset, since the liveness doesn't interesect. Because we mark these ref bits for the whole scope of the method, the offset will be marked as potentially containing a ref. At the moment of suspend, in theory, we could tell exactly whether the int32 var or the object var is alive at the moment of execution, or maybe none of them. This would probably reduce most of the remaining false pinning.

}

FrameDataFragment *frag;
for (frag = context->data_stack.first; frag; frag = frag->next) {
Expand Down
3 changes: 2 additions & 1 deletion src/mono/mono/mini/interp/interp.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ enum {
INTERP_OPT_JITERPRETER = 64,
#endif
INTERP_OPT_SSA = 128,
INTERP_OPT_DEFAULT = INTERP_OPT_INLINE | INTERP_OPT_CPROP | INTERP_OPT_SUPER_INSTRUCTIONS | INTERP_OPT_BBLOCKS | INTERP_OPT_TIERING | INTERP_OPT_SIMD | INTERP_OPT_SSA
INTERP_OPT_PRECISE_GC = 256,
INTERP_OPT_DEFAULT = INTERP_OPT_INLINE | INTERP_OPT_CPROP | INTERP_OPT_SUPER_INSTRUCTIONS | INTERP_OPT_BBLOCKS | INTERP_OPT_TIERING | INTERP_OPT_SIMD | INTERP_OPT_SSA | INTERP_OPT_PRECISE_GC
#if HOST_BROWSER
| INTERP_OPT_JITERPRETER
#endif
Expand Down
7 changes: 6 additions & 1 deletion src/mono/mono/mini/interp/transform-opt.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ alloc_var_offset (TransformData *td, int local, gint32 *ptos)
int
interp_alloc_global_var_offset (TransformData *td, int var)
{
return alloc_var_offset (td, var, &td->total_locals_size);
int offset = alloc_var_offset (td, var, &td->total_locals_size);
interp_mark_ref_slots_for_var (td, var);
return offset;
}

static void
Expand Down Expand Up @@ -464,6 +466,8 @@ interp_alloc_offsets (TransformData *td)
add_active_call (td, &ac, td->vars [var].call);
} else if (!td->vars [var].global && td->vars [var].offset == -1) {
alloc_var_offset (td, var, &current_offset);
interp_mark_ref_slots_for_var (td, var);

if (current_offset > final_total_locals_size)
final_total_locals_size = current_offset;

Expand Down Expand Up @@ -492,6 +496,7 @@ interp_alloc_offsets (TransformData *td)
// These are allocated separately at the end of the stack
if (td->vars [i].call_args) {
td->vars [i].offset += td->param_area_offset;
interp_mark_ref_slots_for_var (td, i);
final_total_locals_size = MAX (td->vars [i].offset + td->vars [i].size, final_total_locals_size);
}
}
Expand Down
Loading
Loading