[mono][interp] Reduce false pinning from interp stack

BrzVlad · BrzVlad · commit 08a42895e707 · 2024-04-04T11:46:32.000+03:00
Interpreter opcodes operate on the interp stack, an area of memory separately allocated. Each interp var will have an allocated stack offset in the current interpreter stack frame. When we allocate the storage for an interp var we can take into account the var type. If the type can represent a potential ref to an object or an interior ref then we mark the pointer slot as potentially containing refs, for the method that is being compiled.

During GC, we used to conservatively scan the entire interp stack space used by each thread. After this change, in the first stage, we do a stack walk where we detect slots in each interp frame where no refs can reside. We mark these slots in a bit array. Afterwards we conservatively scan the interp stack of the thread, while ignoring slots that were previously marked as not containing any refs.

System.Runtime.Tests suite was used for testing the effectiveness of the change, by computing the cumulative number of pinned objects throughout all GCs (about 1100).
minijit		- avg 702000 pinned objects
old-interp	- avg 641000 pinned objects
precise-interp	- avg 578000 pinned objects

This resulted in a 10% reduction in the number of pinned objects during collection. This change is meant to reduce memory usage of apps by making objects die earlier. We could further improve by being more precise. For example, for call sites we could reuse liveness information to precisely know which slots actually contain refs. This is a bit more complex to implement and it is unclear yet how impactful it would be.
diff --git a/src/mono/mono/metadata/class-getters.h b/src/mono/mono/metadata/class-getters.h
@@ -39,6 +39,7 @@ MONO_CLASS_GETTER(m_class_is_delegate, gboolean, , MonoClass, delegate)
 MONO_CLASS_GETTER(m_class_is_gc_descr_inited, gboolean, , MonoClass, gc_descr_inited)
 MONO_CLASS_GETTER(m_class_has_cctor, gboolean,  , MonoClass, has_cctor)
 MONO_CLASS_GETTER(m_class_has_references, gboolean, , MonoClass, has_references)
+MONO_CLASS_GETTER(m_class_has_ref_fields, gboolean, , MonoClass, has_ref_fields)
 MONO_CLASS_GETTER(m_class_has_static_refs, gboolean, , MonoClass, has_static_refs)
 MONO_CLASS_GETTER(m_class_has_no_special_static_fields, gboolean, , MonoClass, no_special_static_fields)
 MONO_CLASS_GETTER(m_class_is_nested_classes_inited, gboolean, , MonoClass, nested_classes_inited)
diff --git a/src/mono/mono/mini/interp/interp-internals.h b/src/mono/mono/mini/interp/interp-internals.h
@@ -145,6 +145,7 @@ struct InterpMethod {
 	MonoFtnDesc *ftndesc_unbox;
 	MonoDelegateTrampInfo *del_info;
 
+	/* locals_size is equal to the offset of the param_area */
 	guint32 locals_size;
 	guint32 alloca_size;
 	int num_clauses; // clauses
@@ -153,6 +154,7 @@ struct InterpMethod {
 	unsigned int hasthis; // boolean
 	MonoProfilerCallInstrumentationFlags prof_flags;
 	InterpMethodCodeType code_type;
+	MonoBitSet *ref_slots;
 #ifdef ENABLE_EXPERIMENT_TIERED
 	MiniTieredCounter tiered_counter;
 #endif
@@ -268,6 +270,8 @@ typedef struct {
 	guchar *stack_pointer;
 	/* Used for allocation of localloc regions */
 	FrameDataAllocator data_stack;
+	/* If bit n is set, it means that the n-th stack slot (pointer sized) from stack_start doesn't contain any refs */
+	guint8 *no_ref_slots;
 } ThreadContext;
 
 typedef struct {
diff --git a/src/mono/mono/mini/interp/interp.c b/src/mono/mono/mini/interp/interp.c
@@ -412,6 +412,8 @@ get_context (void)
 	if (context == NULL) {
 		context = g_new0 (ThreadContext, 1);
 		context->stack_start = (guchar*)mono_valloc_aligned (INTERP_STACK_SIZE, MINT_STACK_ALIGNMENT, MONO_MMAP_READ | MONO_MMAP_WRITE, MONO_MEM_ACCOUNT_INTERP_STACK);
+		// A bit for every pointer sized slot in the stack. FIXME don't allocate whole bit array
+		context->no_ref_slots = (guchar*)mono_valloc (NULL, INTERP_STACK_SIZE / (8 * sizeof (gpointer)), MONO_MMAP_READ | MONO_MMAP_WRITE, MONO_MEM_ACCOUNT_INTERP_STACK);
 		context->stack_end = context->stack_start + INTERP_STACK_SIZE - INTERP_REDZONE_SIZE;
 		context->stack_real_end = context->stack_start + INTERP_STACK_SIZE;
 		/* We reserve a stack slot at the top of the interp stack to make temp objects visible to GC */
@@ -8473,6 +8475,57 @@ interp_stop_single_stepping (void)
 	ss_enabled = FALSE;
 }
 
+
+/*
+ * interp_mark_frame_no_ref_slots:
+ *
+ *   Walks the pointer-sized slots of FRAME, from frame->stack up to
+ * frame->stack + imethod->alloca_size (or up to TOP_LIMIT, when that is lower),
+ * and sets the matching bit in context->no_ref_slots for every slot that is not
+ * flagged in the method's ref_slots bitset. Bits in context->no_ref_slots are
+ * indexed by slot position relative to context->stack_start.
+ */
+static void
+interp_mark_frame_no_ref_slots (ThreadContext *context, InterpFrame *frame, gpointer *top_limit)
+{
+	InterpMethod *method = frame->imethod;
+	gpointer *stack_base = (gpointer*)context->stack_start;
+	gpointer *first_slot = (gpointer*)frame->stack;
+	gpointer *limit = (gpointer*)((guchar*)frame->stack + method->alloca_size);
+
+	// Calls are implemented by copying the arguments into the param area at the top of
+	// the caller's frame; until the call happens those slots belong to the caller. After
+	// the call, the callee's stack pointer points inside the caller's param area.
+	//
+	// Frames are processed from the top of the stack towards the bottom, so top_limit
+	// (the start of the callee's frame) cuts off the slots the callee already handled.
+	if (top_limit && top_limit < limit)
+		limit = top_limit;
+
+	for (gpointer *slot = first_slot; slot < limit; slot++) {
+		gsize local_index = slot - first_slot;
+		// Slots that may hold a ref stay unmarked so they get scanned
+		if (mono_bitset_test_fast (method->ref_slots, local_index))
+			continue;
+
+		gsize global_index = slot - stack_base;
+		int bit = global_index % 8;
+		context->no_ref_slots [global_index / 8] |= 1 << bit;
+	}
+}
+
+/*
+ * interp_mark_no_ref_slots:
+ *
+ *   Clears the part of context->no_ref_slots that covers the currently used
+ * interp stack (stack_start .. stack_pointer), then walks the LMF chain starting
+ * at LMF. For every interp-exit LMF entry, the chain of interp frames reachable
+ * through frame->parent is visited and the no-ref slots of each frame that has an
+ * imethod are recorded.
+ */
+static void
+interp_mark_no_ref_slots (ThreadContext *context, MonoLMF* lmf)
+{
+	memset (context->no_ref_slots, 0, (context->stack_pointer - context->stack_start) / (8 * sizeof (gpointer)) + 1);
+
+	// The low two bits of previous_lmf are tag bits and are masked off to get the next entry
+	for (; lmf; lmf = (MonoLMF*)((gsize)lmf->previous_lmf & ~3)) {
+		// Only entries with bit 1 set are treated as MonoLMFExt
+		if (!((gsize)lmf->previous_lmf & 2))
+			continue;
+
+		MonoLMFExt *ext = (MonoLMFExt*) lmf;
+		if (ext->kind != MONO_LMFEXT_INTERP_EXIT && ext->kind != MONO_LMFEXT_INTERP_EXIT_WITH_CTX)
+			continue;
+
+		// Start of the previously processed (callee) frame; NULL for the first frame
+		gpointer *top_limit = NULL;
+		for (InterpFrame *frame = (InterpFrame*)ext->interp_exit_data; frame; frame = frame->parent) {
+			if (!frame->imethod)
+				continue;
+			interp_mark_frame_no_ref_slots (context, frame, top_limit);
+			top_limit = (gpointer*)frame->stack;
+		}
+	}
+}
+
 /*
  * interp_mark_stack:
  *
@@ -8505,9 +8558,18 @@ interp_mark_stack (gpointer thread_data, GcScanFunc func, gpointer gc_data, gboo
 	if (!context || !context->stack_start)
 		return;
 
-	// FIXME: Scan the whole area with 1 call
-	for (gpointer *p = (gpointer*)context->stack_start; p < (gpointer*)context->stack_pointer; p++)
-		func (p, gc_data);
+	MonoLMF **lmf_addr = (MonoLMF**)info->tls [TLS_KEY_LMF_ADDR];
+	if (lmf_addr)
+		interp_mark_no_ref_slots (context, *lmf_addr);
+
+	int slot_index = 0;
+	for (gpointer *p = (gpointer*)context->stack_start; p < (gpointer*)context->stack_pointer; p++) {
+		if (context->no_ref_slots [slot_index / 8] & (1 << (slot_index % 8)))
+			;// This slot is marked as no ref, we don't scan it
+		else
+			func (p, gc_data);
+		slot_index++;
+	}
 
 	FrameDataFragment *frag;
 	for (frag = context->data_stack.first; frag; frag = frag->next) {
diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c
@@ -32,7 +32,9 @@ alloc_var_offset (TransformData *td, int local, gint32 *ptos)
 int
 interp_alloc_global_var_offset (TransformData *td, int var)
 {
-	return alloc_var_offset (td, var, &td->total_locals_size);
+	int offset = alloc_var_offset (td, var, &td->total_locals_size);
+	interp_mark_ref_slots_for_var (td, var);
+	return offset;
 }
 
 static void
@@ -464,6 +466,8 @@ interp_alloc_offsets (TransformData *td)
 					add_active_call (td, &ac, td->vars [var].call);
 				} else if (!td->vars [var].global && td->vars [var].offset == -1) {
 					alloc_var_offset (td, var, &current_offset);
+					interp_mark_ref_slots_for_var (td, var);
+
 					if (current_offset > final_total_locals_size)
 						final_total_locals_size = current_offset;
 
@@ -492,6 +496,7 @@ interp_alloc_offsets (TransformData *td)
 		// These are allocated separately at the end of the stack
 		if (td->vars [i].call_args) {
 			td->vars [i].offset += td->param_area_offset;
+			interp_mark_ref_slots_for_var (td, i);
 			final_total_locals_size = MAX (td->vars [i].offset + td->vars [i].size, final_total_locals_size);
 		}
 	}
diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c
@@ -4346,6 +4346,7 @@ interp_method_compute_offsets (TransformData *td, InterpMethod *imethod, MonoMet
 		td->vars [i].size = size;
 		offset = ALIGN_TO (offset, align);
 		td->vars [i].offset = offset;
+		interp_mark_ref_slots_for_var (td, i);
 		offset += size;
 	}
 	offset = ALIGN_TO (offset, MINT_STACK_ALIGNMENT);
@@ -4371,6 +4372,7 @@ interp_method_compute_offsets (TransformData *td, InterpMethod *imethod, MonoMet
 		td->vars [index].mt = mono_mint_type (header->locals [i]);
 		td->vars [index].ext_index = -1;
 		td->vars [index].size = size;
+		interp_mark_ref_slots_for_var (td, index);
 		// Every local takes a MINT_STACK_SLOT_SIZE so IL locals have same behavior as execution locals
 		offset += size;
 	}
@@ -8507,6 +8509,75 @@ get_short_brop (int opcode)
 	return opcode;
 }
 
+/*
+ * interp_mark_ref_slots_for_vt:
+ *
+ *   Sets in td->ref_slots the slot of every non-static field of KLASS (a valuetype
+ * stored at BASE_OFFSET on the interp stack) whose type is a reference, a native
+ * int (MONO_TYPE_I / MONO_TYPE_U) or a byref. Nested valuetype / generic instance
+ * fields are handled recursively. For inline array classes the field is visited
+ * repeatedly, advancing by the field size, until the end of the instance is reached.
+ */
+static void
+interp_mark_ref_slots_for_vt (TransformData *td, int base_offset, MonoClass *klass)
+{
+	// Nothing to mark if the class has neither object references nor ref fields
+	if (!m_class_has_references (klass) && !m_class_has_ref_fields (klass))
+		return;
+
+	MonoClassField *field;
+	gpointer iter = NULL;
+	while ((field = mono_class_get_fields_internal (klass, &iter))) {
+		MonoType *ftype = mono_field_get_type_internal (field);
+		if (ftype->attrs & FIELD_ATTRIBUTE_STATIC)
+			continue;
+
+		// NOTE(review): presumably field offsets include the object header, hence the subtraction - confirm
+		int offset = base_offset + m_field_get_offset (field) - MONO_ABI_SIZEOF (MonoObject);
+
+		// Runs once for regular fields, once per element for inline arrays
+		for (;;) {
+			gboolean maybe_ref = mini_type_is_reference (ftype) || ftype->type == MONO_TYPE_I || ftype->type == MONO_TYPE_U || m_type_is_byref (ftype);
+			if (maybe_ref) {
+				int index = offset / sizeof (gpointer);
+				mono_bitset_set_fast (td->ref_slots, index);
+				if (td->verbose_level)
+					g_print ("Stack ref slot vt field at off %d\n", offset);
+			} else if (ftype->type == MONO_TYPE_VALUETYPE || ftype->type == MONO_TYPE_GENERICINST) {
+				interp_mark_ref_slots_for_vt (td, offset, mono_class_from_mono_type_internal (ftype));
+			}
+
+			if (!m_class_is_inlinearray (klass))
+				break;
+
+			int max_offset = base_offset + m_class_get_instance_size (klass) - MONO_ABI_SIZEOF (MonoObject);
+			int align;
+			int field_size = mono_type_size (ftype, &align);
+			offset += field_size;
+			offset = ALIGN_TO (offset, align);
+			if (offset >= max_offset)
+				break;
+		}
+	}
+}
+
+/*
+ * interp_mark_ref_slots_for_var:
+ *
+ *   Records in td->ref_slots which pointer-sized stack slots of VAR may hold an
+ * object reference or an interior pointer. VAR must already have an offset
+ * assigned. The bitset is allocated lazily from td->mempool and grown as needed
+ * so that it covers every slot up to the end of VAR.
+ *
+ * Valuetype vars (MINT_TYPE_VT) are marked field by field; any other var gets
+ * its single slot marked when its type is a reference, a native int
+ * (MONO_TYPE_I / MONO_TYPE_U) or a byref.
+ */
+void
+interp_mark_ref_slots_for_var (TransformData *td, int var)
+{
+	g_assert (td->vars [var].offset != -1);
+
+	// Index of the slot right past the end of the var; the bitset must be larger than this
+	gsize max_index = (td->vars [var].offset + td->vars [var].size) / sizeof (gpointer);
+
+	if (!td->ref_slots || max_index >= td->ref_slots->size) {
+		guint32 old_size = td->ref_slots ? (guint32)td->ref_slots->size : 0;
+		guint32 new_size = old_size ? old_size * 2 : 32;
+		// A single doubling is not enough when the var extends far past the current
+		// capacity (e.g. a large valuetype); keep doubling until max_index fits, since
+		// mono_bitset_set_fast is used below to set bits.
+		while (new_size <= max_index)
+			new_size *= 2;
+
+		gpointer mem = mono_mempool_alloc0 (td->mempool, mono_bitset_alloc_size (new_size, 0));
+		MonoBitSet *new_ref_slots = mono_bitset_mem_new (mem, new_size, 0);
+
+		// Carry over the bits that were already set; old_size is a bit count
+		if (old_size)
+			memcpy (&new_ref_slots->data, &td->ref_slots->data, old_size / 8);
+		td->ref_slots = new_ref_slots;
+	}
+
+	MonoType *type = td->vars [var].type;
+	if (td->vars [var].mt == MINT_TYPE_VT) {
+		MonoClass *klass = mono_class_from_mono_type_internal (type);
+		interp_mark_ref_slots_for_vt (td, td->vars [var].offset, klass);
+	} else {
+		// Managed pointers in interp are normally MONO_TYPE_I
+		if (mini_type_is_reference (type) || type->type == MONO_TYPE_I || type->type == MONO_TYPE_U || m_type_is_byref (type)) {
+			int index = td->vars [var].offset / sizeof (gpointer);
+			mono_bitset_set_fast (td->ref_slots, index);
+			// The product is a size_t; cast it so the argument matches the %d conversion
+			if (td->verbose_level)
+				g_print ("Stack ref slot at off %d for var %d\n", (int)(index * sizeof (gpointer)), var);
+		}
+	}
+}
+
 static int
 get_var_offset (TransformData *td, int var)
 {
@@ -8526,6 +8597,7 @@ get_var_offset (TransformData *td, int var)
 	g_assert (td->vars [var].execution_stack);
 
 	td->vars [var].offset = td->total_locals_size + td->vars [var].stack_offset;
+	interp_mark_ref_slots_for_var (td, var);
 	return td->vars [var].offset;
 }
 
@@ -9155,6 +9227,21 @@ generate (MonoMethod *method, MonoMethodHeader *header, InterpMethod *rtm, MonoG
 	mono_interp_register_imethod_data_items (rtm->data_items, td->imethod_items);
 	rtm->patchpoint_data = td->patchpoint_data;
 
+	if (td->ref_slots) {
+		gpointer ref_slots_mem = mono_mem_manager_alloc0 (td->mem_manager, mono_bitset_alloc_size (rtm->alloca_size / sizeof (gpointer), 0));
+		rtm->ref_slots = mono_bitset_mem_new (ref_slots_mem, rtm->alloca_size / sizeof (gpointer), 0);
+		gsize copy_size = rtm->ref_slots->size;
+		if (td->ref_slots->size < copy_size)
+			copy_size = td->ref_slots->size;
+		memcpy (&rtm->ref_slots->data, &td->ref_slots->data, copy_size / 8);
+		if (!td->optimized) {
+			// Unoptimized code can have some stack slot moving patterns as part of calls.
+			// Just conservatively mark all these slots as potentially containing refs.
+			for (guint32 offset = rtm->locals_size; offset < rtm->alloca_size; offset += sizeof (gpointer))
+				mono_bitset_set (rtm->ref_slots, offset / sizeof (gpointer));
+		}
+	}
+
 	/* Save debug info */
 	interp_save_debug_info (rtm, header, td, td->line_numbers);
 
diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h
@@ -340,6 +340,8 @@ typedef struct
 	int inline_depth;
 	int patchpoint_data_n;
 	int *patchpoint_data;
+	// This marks each stack slot offset that might contain refs throughout the execution of this method
+	MonoBitSet *ref_slots;
 	guint has_localloc : 1;
 	// If method compilation fails due to certain limits being exceeded, we disable inlining
 	// and retry compilation.
@@ -543,6 +545,8 @@ interp_foreach_ins_var (TransformData *td, InterpInst *ins, gpointer data, void
 void
 interp_foreach_ins_svar (TransformData *td, InterpInst *ins, gpointer data, void (*callback)(TransformData*, int*, gpointer));
 
+void
+interp_mark_ref_slots_for_var (TransformData *td, int var);
 
 /* Forward definitions for simd methods */
 static gboolean

Original file line number	Diff line number	Diff line change
`@@ -32,7 +32,9 @@ alloc_var_offset (TransformData td, int local, gint32 ptos)`
`32`	`32`	`int`
`33`	`33`	`interp_alloc_global_var_offset (TransformData *td, int var)`
`34`	`34`	`{`
`35`		`- return alloc_var_offset (td, var, &td->total_locals_size);`
	`35`	`+ int offset = alloc_var_offset (td, var, &td->total_locals_size);`
	`36`	`+ interp_mark_ref_slots_for_var (td, var);`
	`37`	`+ return offset;`
`36`	`38`	`}`
`37`	`39`
`38`	`40`	`static void`
`@@ -464,6 +466,8 @@ interp_alloc_offsets (TransformData *td)`
`464`	`466`	`add_active_call (td, &ac, td->vars [var].call);`
`465`	`467`	`} else if (!td->vars [var].global && td->vars [var].offset == -1) {`
`466`	`468`	`alloc_var_offset (td, var, &current_offset);`
	`469`	`+ interp_mark_ref_slots_for_var (td, var);`
	`470`	`+`
`467`	`471`	`if (current_offset > final_total_locals_size)`
`468`	`472`	`final_total_locals_size = current_offset;`
`469`	`473`
`@@ -492,6 +496,7 @@ interp_alloc_offsets (TransformData *td)`
`492`	`496`	`// These are allocated separately at the end of the stack`
`493`	`497`	`if (td->vars [i].call_args) {`
`494`	`498`	`td->vars [i].offset += td->param_area_offset;`
	`499`	`+ interp_mark_ref_slots_for_var (td, i);`
`495`	`500`	`final_total_locals_size = MAX (td->vars [i].offset + td->vars [i].size, final_total_locals_size);`
`496`	`501`	`}`
`497`	`502`	`}`