From 8d9bb2025e9fa88ea4f6605e60e936359b89377d Mon Sep 17 00:00:00 2001 From: Noah Falk Date: Sat, 13 Jul 2024 00:56:18 -0700 Subject: [PATCH] Add ee_alloc_context (NativeAOT) This change is some preparatory refactoring for the randomized allocation sampling feature. We need to add more state onto allocation context but we don't want to do a breaking change of the GC interface. The new state only needs to be visible to the EE but we want it physically near the existing alloc context state for good cache locality. To accomplish this we created a new ee_alloc_context struct which contains an instance of gc_alloc_context within it. The new ee_alloc_context::combined_limit should be used by fast allocation helpers to determine when to go down the slow path. Most of the time combined_limit has the same value as alloc_limit, but periodically we need to emit an allocation sampling event on an object that is somewhere in the middle of an AC. Using combined_limit rather than alloc_limit as the slow path trigger allows us to keep all the sampling event logic in the slow path. --- src/coreclr/nativeaot/Runtime/AsmOffsets.h | 21 ++++++----- .../nativeaot/Runtime/AsmOffsetsVerify.cpp | 2 +- src/coreclr/nativeaot/Runtime/DebugHeader.cpp | 5 ++- src/coreclr/nativeaot/Runtime/GCHelpers.cpp | 1 + .../nativeaot/Runtime/amd64/AllocFast.S | 6 +-- .../nativeaot/Runtime/amd64/AllocFast.asm | 6 +-- .../nativeaot/Runtime/amd64/AsmMacros.inc | 4 +- src/coreclr/nativeaot/Runtime/arm/AllocFast.S | 10 ++--- .../nativeaot/Runtime/arm64/AllocFast.S | 10 ++--- .../nativeaot/Runtime/arm64/AllocFast.asm | 6 +-- .../nativeaot/Runtime/arm64/AsmMacros.h | 4 +- src/coreclr/nativeaot/Runtime/gcenv.ee.cpp | 4 +- .../nativeaot/Runtime/i386/AllocFast.asm | 6 +-- .../nativeaot/Runtime/i386/AsmMacros.inc | 4 +- src/coreclr/nativeaot/Runtime/inc/rhbinder.h | 10 ++--- .../nativeaot/Runtime/loongarch64/AllocFast.S | 10 ++--- src/coreclr/nativeaot/Runtime/portable.cpp | 25 ++++++++----- src/coreclr/nativeaot/Runtime/thread.cpp | 1 + src/coreclr/nativeaot/Runtime/thread.h | 37 ++++++++++++++++++- src/coreclr/nativeaot/Runtime/thread.inl | 37 ++++++++++++++++++- .../Runtime/unix/unixasmmacrosamd64.inc | 4 +- .../Runtime/unix/unixasmmacrosarm.inc | 4 +- 22 files changed, 151 insertions(+), 66 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/AsmOffsets.h b/src/coreclr/nativeaot/Runtime/AsmOffsets.h index 32abd406175e76..cb6bf8842e04b9 100644 --- a/src/coreclr/nativeaot/Runtime/AsmOffsets.h +++ b/src/coreclr/nativeaot/Runtime/AsmOffsets.h @@ -46,21 +46,24 @@ ASM_OFFSET( 0, 0, MethodTable, m_uFlags) ASM_OFFSET( 4, 4, MethodTable, m_uBaseSize) ASM_OFFSET( 14, 18, MethodTable, m_VTable) -ASM_OFFSET( 0, 0, Thread, m_rgbAllocContextBuffer) -ASM_OFFSET( 28, 38, Thread, m_ThreadStateFlags) -ASM_OFFSET( 2c, 40, Thread, m_pTransitionFrame) -ASM_OFFSET( 30, 48, Thread, m_pDeferredTransitionFrame) -ASM_OFFSET( 40, 68, Thread, m_ppvHijackedReturnAddressLocation) -ASM_OFFSET( 44, 70, Thread, m_pvHijackedReturnAddress) -ASM_OFFSET( 48, 78, Thread, m_uHijackedReturnValueFlags) -ASM_OFFSET( 4c, 80, Thread, m_pExInfoStackHead) -ASM_OFFSET( 50, 88, Thread, m_threadAbortException) +ASM_OFFSET( 0, 0, Thread, m_eeAllocContext) +ASM_OFFSET( 2c, 40, Thread, m_ThreadStateFlags) +ASM_OFFSET( 30, 48, Thread, m_pTransitionFrame) +ASM_OFFSET( 34, 50, Thread, m_pDeferredTransitionFrame) +ASM_OFFSET( 44, 70, Thread, m_ppvHijackedReturnAddressLocation) +ASM_OFFSET( 48, 78, Thread, m_pvHijackedReturnAddress) +ASM_OFFSET( 4c, 80, Thread, m_uHijackedReturnValueFlags) 
+ASM_OFFSET( 50, 88, Thread, m_pExInfoStackHead) +ASM_OFFSET( 54, 90, Thread, m_threadAbortException) ASM_SIZEOF( 14, 20, EHEnum) ASM_OFFSET( 0, 0, gc_alloc_context, alloc_ptr) ASM_OFFSET( 4, 8, gc_alloc_context, alloc_limit) +ASM_OFFSET( 0, 0, ee_alloc_context, combined_limit) +ASM_OFFSET( 4, 8, ee_alloc_context, m_rgbAllocContextBuffer) + #ifdef FEATURE_CACHED_INTERFACE_DISPATCH ASM_OFFSET( 4, 8, InterfaceDispatchCell, m_pCache) #ifdef INTERFACE_DISPATCH_CACHE_HAS_CELL_BACKPOINTER diff --git a/src/coreclr/nativeaot/Runtime/AsmOffsetsVerify.cpp b/src/coreclr/nativeaot/Runtime/AsmOffsetsVerify.cpp index b5520d739e871b..d27884dbdf1ff3 100644 --- a/src/coreclr/nativeaot/Runtime/AsmOffsetsVerify.cpp +++ b/src/coreclr/nativeaot/Runtime/AsmOffsetsVerify.cpp @@ -22,7 +22,7 @@ class AsmOffsets { - static_assert(sizeof(Thread::m_rgbAllocContextBuffer) >= sizeof(gc_alloc_context), "Thread::m_rgbAllocContextBuffer is not big enough to hold a gc_alloc_context"); + static_assert(sizeof(ee_alloc_context::m_rgbAllocContextBuffer) >= sizeof(gc_alloc_context), "ee_alloc_context::m_rgbAllocContextBuffer is not big enough to hold a gc_alloc_context"); // Some assembly helpers for arrays and strings are shared and use the fact that arrays and strings have similar layouts) static_assert(offsetof(Array, m_Length) == offsetof(String, m_Length), "The length field of String and Array have different offsets"); diff --git a/src/coreclr/nativeaot/Runtime/DebugHeader.cpp b/src/coreclr/nativeaot/Runtime/DebugHeader.cpp index 324e0f86f2aea8..e32956dde4ee28 100644 --- a/src/coreclr/nativeaot/Runtime/DebugHeader.cpp +++ b/src/coreclr/nativeaot/Runtime/DebugHeader.cpp @@ -163,6 +163,9 @@ extern "C" void PopulateDebugHeaders() MAKE_DEBUG_FIELD_ENTRY(dac_gc_heap, finalize_queue); MAKE_DEBUG_FIELD_ENTRY(dac_gc_heap, generation_table); + MAKE_SIZE_ENTRY(ee_alloc_context); + MAKE_DEBUG_FIELD_ENTRY(ee_alloc_context, m_rgbAllocContextBuffer); + MAKE_SIZE_ENTRY(gc_alloc_context); MAKE_DEBUG_FIELD_ENTRY(gc_alloc_context, alloc_ptr); MAKE_DEBUG_FIELD_ENTRY(gc_alloc_context, alloc_limit); @@ -194,7 +197,7 @@ extern "C" void PopulateDebugHeaders() MAKE_SIZE_ENTRY(RuntimeThreadLocals); MAKE_DEBUG_FIELD_ENTRY(RuntimeThreadLocals, m_pNext); - MAKE_DEBUG_FIELD_ENTRY(RuntimeThreadLocals, m_rgbAllocContextBuffer); + MAKE_DEBUG_FIELD_ENTRY(RuntimeThreadLocals, m_eeAllocContext); MAKE_DEBUG_FIELD_ENTRY(RuntimeThreadLocals, m_threadId); MAKE_DEBUG_FIELD_ENTRY(RuntimeThreadLocals, m_pThreadStressLog); MAKE_DEBUG_FIELD_ENTRY(RuntimeThreadLocals, m_pExInfoStackHead); diff --git a/src/coreclr/nativeaot/Runtime/GCHelpers.cpp b/src/coreclr/nativeaot/Runtime/GCHelpers.cpp index b038d9d33541bd..41b8aa8463c2ea 100644 --- a/src/coreclr/nativeaot/Runtime/GCHelpers.cpp +++ b/src/coreclr/nativeaot/Runtime/GCHelpers.cpp @@ -540,6 +540,7 @@ static Object* GcAllocInternal(MethodTable* pEEType, uint32_t uFlags, uintptr_t tls_pLastAllocationEEType = pEEType; Object* pObject = GCHeapUtilities::GetGCHeap()->Alloc(pThread->GetAllocContext(), cbSize, uFlags); + pThread->GetEEAllocContext()->UpdateCombinedLimit(); if (pObject == NULL) return NULL; diff --git a/src/coreclr/nativeaot/Runtime/amd64/AllocFast.S b/src/coreclr/nativeaot/Runtime/amd64/AllocFast.S index 6cb85bcc507a09..8923a7a4fbb64b 100644 --- a/src/coreclr/nativeaot/Runtime/amd64/AllocFast.S +++ b/src/coreclr/nativeaot/Runtime/amd64/AllocFast.S @@ -28,7 +28,7 @@ NESTED_ENTRY RhpNewFast, _TEXT, NoHandler mov rsi, [rax + OFFSETOF__Thread__m_alloc_context__alloc_ptr] add rdx, rsi - cmp rdx, [rax + 
OFFSETOF__Thread__m_alloc_context__alloc_limit] + cmp rdx, [rax + OFFSETOF__Thread__m_eeAllocContext__combined_limit] ja LOCAL_LABEL(RhpNewFast_RarePath) // set the new alloc pointer @@ -143,7 +143,7 @@ NESTED_ENTRY RhNewString, _TEXT, NoHandler // rcx == Thread* // rdx == string size // r12 == element count - cmp rax, [rcx + OFFSETOF__Thread__m_alloc_context__alloc_limit] + cmp rax, [rcx + OFFSETOF__Thread__m_eeAllocContext__combined_limit] ja LOCAL_LABEL(RhNewString_RarePath) mov [rcx + OFFSETOF__Thread__m_alloc_context__alloc_ptr], rax @@ -226,7 +226,7 @@ NESTED_ENTRY RhpNewArray, _TEXT, NoHandler // rcx == Thread* // rdx == array size // r12 == element count - cmp rax, [rcx + OFFSETOF__Thread__m_alloc_context__alloc_limit] + cmp rax, [rcx + OFFSETOF__Thread__m_eeAllocContext__combined_limit] ja LOCAL_LABEL(RhpNewArray_RarePath) mov [rcx + OFFSETOF__Thread__m_alloc_context__alloc_ptr], rax diff --git a/src/coreclr/nativeaot/Runtime/amd64/AllocFast.asm b/src/coreclr/nativeaot/Runtime/amd64/AllocFast.asm index 37be558c3cef1d..6ba69c0c141274 100644 --- a/src/coreclr/nativeaot/Runtime/amd64/AllocFast.asm +++ b/src/coreclr/nativeaot/Runtime/amd64/AllocFast.asm @@ -25,7 +25,7 @@ LEAF_ENTRY RhpNewFast, _TEXT mov rax, [rdx + OFFSETOF__Thread__m_alloc_context__alloc_ptr] add r8, rax - cmp r8, [rdx + OFFSETOF__Thread__m_alloc_context__alloc_limit] + cmp r8, [rdx + OFFSETOF__Thread__m_eeAllocContext__combined_limit] ja RhpNewFast_RarePath ;; set the new alloc pointer @@ -118,7 +118,7 @@ LEAF_ENTRY RhNewString, _TEXT ; rdx == element count ; r8 == array size ; r10 == thread - cmp rax, [r10 + OFFSETOF__Thread__m_alloc_context__alloc_limit] + cmp rax, [r10 + OFFSETOF__Thread__m_eeAllocContext__combined_limit] ja RhpNewArrayRare mov [r10 + OFFSETOF__Thread__m_alloc_context__alloc_ptr], rax @@ -179,7 +179,7 @@ LEAF_ENTRY RhpNewArray, _TEXT ; rdx == element count ; r8 == array size ; r10 == thread - cmp rax, [r10 + OFFSETOF__Thread__m_alloc_context__alloc_limit] + cmp rax, [r10 + OFFSETOF__Thread__m_eeAllocContext__combined_limit] ja RhpNewArrayRare mov [r10 + OFFSETOF__Thread__m_alloc_context__alloc_ptr], rax diff --git a/src/coreclr/nativeaot/Runtime/amd64/AsmMacros.inc b/src/coreclr/nativeaot/Runtime/amd64/AsmMacros.inc index 33089b6643d382..41c43252317d9a 100644 --- a/src/coreclr/nativeaot/Runtime/amd64/AsmMacros.inc +++ b/src/coreclr/nativeaot/Runtime/amd64/AsmMacros.inc @@ -336,8 +336,8 @@ TSF_DoNotTriggerGc equ 10h ;; ;; Rename fields of nested structs ;; -OFFSETOF__Thread__m_alloc_context__alloc_ptr equ OFFSETOF__Thread__m_rgbAllocContextBuffer + OFFSETOF__gc_alloc_context__alloc_ptr -OFFSETOF__Thread__m_alloc_context__alloc_limit equ OFFSETOF__Thread__m_rgbAllocContextBuffer + OFFSETOF__gc_alloc_context__alloc_limit +OFFSETOF__Thread__m_alloc_context__alloc_ptr equ OFFSETOF__Thread__m_eeAllocContext + OFFSETOF__ee_alloc_context__m_rgbAllocContextBuffer + OFFSETOF__gc_alloc_context__alloc_ptr +OFFSETOF__Thread__m_eeAllocContext__combined_limit equ OFFSETOF__Thread__m_eeAllocContext + OFFSETOF__ee_alloc_context__combined_limit diff --git a/src/coreclr/nativeaot/Runtime/arm/AllocFast.S b/src/coreclr/nativeaot/Runtime/arm/AllocFast.S index 31b54d1bca313a..76091303696546 100644 --- a/src/coreclr/nativeaot/Runtime/arm/AllocFast.S +++ b/src/coreclr/nativeaot/Runtime/arm/AllocFast.S @@ -26,7 +26,7 @@ LEAF_ENTRY RhpNewFast, _TEXT ldr r3, [r0, #OFFSETOF__Thread__m_alloc_context__alloc_ptr] add r2, r3 - ldr r1, [r0, #OFFSETOF__Thread__m_alloc_context__alloc_limit] + ldr r1, [r0, 
#OFFSETOF__Thread__m_eeAllocContext__combined_limit] cmp r2, r1 bhi LOCAL_LABEL(RhpNewFast_RarePath) @@ -132,7 +132,7 @@ LEAF_ENTRY RhNewString, _TEXT adds r6, r12 bcs LOCAL_LABEL(RhNewString_RarePath) // if we get a carry here, the string is too large to fit below 4 GB - ldr r12, [r0, #OFFSETOF__Thread__m_alloc_context__alloc_limit] + ldr r12, [r0, #OFFSETOF__Thread__m_eeAllocContext__combined_limit] cmp r6, r12 bhi LOCAL_LABEL(RhNewString_RarePath) @@ -213,7 +213,7 @@ LOCAL_LABEL(ArrayAlignSize): adds r6, r12 bcs LOCAL_LABEL(RhpNewArray_RarePath) // if we get a carry here, the array is too large to fit below 4 GB - ldr r12, [r0, #OFFSETOF__Thread__m_alloc_context__alloc_limit] + ldr r12, [r0, #OFFSETOF__Thread__m_eeAllocContext__combined_limit] cmp r6, r12 bhi LOCAL_LABEL(RhpNewArray_RarePath) @@ -349,7 +349,7 @@ LEAF_ENTRY RhpNewFastAlign8, _TEXT // Determine whether the end of the object would lie outside of the current allocation context. If so, // we abandon the attempt to allocate the object directly and fall back to the slow helper. add r2, r3 - ldr r3, [r0, #OFFSETOF__Thread__m_alloc_context__alloc_limit] + ldr r3, [r0, #OFFSETOF__Thread__m_eeAllocContext__combined_limit] cmp r2, r3 bhi LOCAL_LABEL(Alloc8Failed) @@ -412,7 +412,7 @@ LEAF_ENTRY RhpNewFastMisalign, _TEXT // Determine whether the end of the object would lie outside of the current allocation context. If so, // we abandon the attempt to allocate the object directly and fall back to the slow helper. add r2, r3 - ldr r3, [r0, #OFFSETOF__Thread__m_alloc_context__alloc_limit] + ldr r3, [r0, #OFFSETOF__Thread__m_eeAllocContext__combined_limit] cmp r2, r3 bhi LOCAL_LABEL(BoxAlloc8Failed) diff --git a/src/coreclr/nativeaot/Runtime/arm64/AllocFast.S b/src/coreclr/nativeaot/Runtime/arm64/AllocFast.S index 966b052a2b9f9e..ebe5387d8d9306 100644 --- a/src/coreclr/nativeaot/Runtime/arm64/AllocFast.S +++ b/src/coreclr/nativeaot/Runtime/arm64/AllocFast.S @@ -10,8 +10,8 @@ GC_ALLOC_FINALIZE = 1 // // Rename fields of nested structs // -OFFSETOF__Thread__m_alloc_context__alloc_ptr = OFFSETOF__Thread__m_rgbAllocContextBuffer + OFFSETOF__gc_alloc_context__alloc_ptr -OFFSETOF__Thread__m_alloc_context__alloc_limit = OFFSETOF__Thread__m_rgbAllocContextBuffer + OFFSETOF__gc_alloc_context__alloc_limit +OFFSETOF__Thread__m_alloc_context__alloc_ptr = OFFSETOF__Thread__m_eeAllocContext + OFFSETOF__ee_alloc_context__m_rgbAllocContextBuffer + OFFSETOF__gc_alloc_context__alloc_ptr +OFFSETOF__Thread__m_eeAllocContext__combined_limit = OFFSETOF__Thread__m_eeAllocContext + OFFSETOF__ee_alloc_context__combined_limit @@ -44,7 +44,7 @@ OFFSETOF__Thread__m_alloc_context__alloc_limit = OFFSETOF__Thread__m_rgbAll // Determine whether the end of the object would lie outside of the current allocation context. If so, // we abandon the attempt to allocate the object directly and fall back to the slow helper. add x2, x2, x12 - ldr x13, [x1, #OFFSETOF__Thread__m_alloc_context__alloc_limit] + ldr x13, [x1, #OFFSETOF__Thread__m_eeAllocContext__combined_limit] cmp x2, x13 bhi LOCAL_LABEL(RhpNewFast_RarePath) @@ -139,7 +139,7 @@ LOCAL_LABEL(NewOutOfMemory): // Determine whether the end of the object would lie outside of the current allocation context. If so, // we abandon the attempt to allocate the object directly and fall back to the slow helper. 
add x2, x2, x12 - ldr x12, [x3, #OFFSETOF__Thread__m_alloc_context__alloc_limit] + ldr x12, [x3, #OFFSETOF__Thread__m_eeAllocContext__combined_limit] cmp x2, x12 bhi LOCAL_LABEL(RhNewString_Rare) @@ -207,7 +207,7 @@ LOCAL_LABEL(RhNewString_Rare): // Determine whether the end of the object would lie outside of the current allocation context. If so, // we abandon the attempt to allocate the object directly and fall back to the slow helper. add x2, x2, x12 - ldr x12, [x3, #OFFSETOF__Thread__m_alloc_context__alloc_limit] + ldr x12, [x3, #OFFSETOF__Thread__m_eeAllocContext__combined_limit] cmp x2, x12 bhi LOCAL_LABEL(RhpNewArray_Rare) diff --git a/src/coreclr/nativeaot/Runtime/arm64/AllocFast.asm b/src/coreclr/nativeaot/Runtime/arm64/AllocFast.asm index e6849b87312669..d8e506335d77f2 100644 --- a/src/coreclr/nativeaot/Runtime/arm64/AllocFast.asm +++ b/src/coreclr/nativeaot/Runtime/arm64/AllocFast.asm @@ -30,7 +30,7 @@ ;; Determine whether the end of the object would lie outside of the current allocation context. If so, ;; we abandon the attempt to allocate the object directly and fall back to the slow helper. add x2, x2, x12 - ldr x13, [x1, #OFFSETOF__Thread__m_alloc_context__alloc_limit] + ldr x13, [x1, #OFFSETOF__Thread__m_eeAllocContext__combined_limit] cmp x2, x13 bhi RhpNewFast_RarePath @@ -118,7 +118,7 @@ NewOutOfMemory ;; Determine whether the end of the object would lie outside of the current allocation context. If so, ;; we abandon the attempt to allocate the object directly and fall back to the slow helper. add x2, x2, x12 - ldr x12, [x3, #OFFSETOF__Thread__m_alloc_context__alloc_limit] + ldr x12, [x3, #OFFSETOF__Thread__m_eeAllocContext__combined_limit] cmp x2, x12 bhi RhpNewArrayRare @@ -179,7 +179,7 @@ StringSizeOverflow ;; Determine whether the end of the object would lie outside of the current allocation context. If so, ;; we abandon the attempt to allocate the object directly and fall back to the slow helper. 
add x2, x2, x12 - ldr x12, [x3, #OFFSETOF__Thread__m_alloc_context__alloc_limit] + ldr x12, [x3, #OFFSETOF__Thread__m_eeAllocContext__combined_limit] cmp x2, x12 bhi RhpNewArrayRare diff --git a/src/coreclr/nativeaot/Runtime/arm64/AsmMacros.h b/src/coreclr/nativeaot/Runtime/arm64/AsmMacros.h index 94a559df719e02..2f6e83e2cf9b66 100644 --- a/src/coreclr/nativeaot/Runtime/arm64/AsmMacros.h +++ b/src/coreclr/nativeaot/Runtime/arm64/AsmMacros.h @@ -87,8 +87,8 @@ STATUS_REDHAWK_THREAD_ABORT equ 0x43 ;; ;; Rename fields of nested structs ;; -OFFSETOF__Thread__m_alloc_context__alloc_ptr equ OFFSETOF__Thread__m_rgbAllocContextBuffer + OFFSETOF__gc_alloc_context__alloc_ptr -OFFSETOF__Thread__m_alloc_context__alloc_limit equ OFFSETOF__Thread__m_rgbAllocContextBuffer + OFFSETOF__gc_alloc_context__alloc_limit +OFFSETOF__Thread__m_alloc_context__alloc_ptr equ OFFSETOF__Thread__m_eeAllocContext + OFFSETOF__ee_alloc_context__m_rgbAllocContextBuffer + OFFSETOF__gc_alloc_context__alloc_ptr +OFFSETOF__Thread__m_eeAllocContext__combined_limit equ OFFSETOF__Thread__m_eeAllocContext + OFFSETOF__ee_alloc_context__combined_limit ;; ;; IMPORTS diff --git a/src/coreclr/nativeaot/Runtime/gcenv.ee.cpp b/src/coreclr/nativeaot/Runtime/gcenv.ee.cpp index f041e499c11d4b..fcaa52db945177 100644 --- a/src/coreclr/nativeaot/Runtime/gcenv.ee.cpp +++ b/src/coreclr/nativeaot/Runtime/gcenv.ee.cpp @@ -136,7 +136,9 @@ void GCToEEInterface::GcEnumAllocContexts(enum_alloc_context_func* fn, void* par { FOREACH_THREAD(thread) { - (*fn) (thread->GetAllocContext(), param); + ee_alloc_context* alloc_context = thread->GetEEAllocContext(); + (*fn) (alloc_context->GetGCAllocContext(), param); + alloc_context->UpdateCombinedLimit(); } END_FOREACH_THREAD } diff --git a/src/coreclr/nativeaot/Runtime/i386/AllocFast.asm b/src/coreclr/nativeaot/Runtime/i386/AllocFast.asm index 8d28e94c944177..d557f5ec750774 100644 --- a/src/coreclr/nativeaot/Runtime/i386/AllocFast.asm +++ b/src/coreclr/nativeaot/Runtime/i386/AllocFast.asm @@ -29,7 +29,7 @@ FASTCALL_FUNC RhpNewFast, 4 ;; add eax, [edx + OFFSETOF__Thread__m_alloc_context__alloc_ptr] - cmp eax, [edx + OFFSETOF__Thread__m_alloc_context__alloc_limit] + cmp eax, [edx + OFFSETOF__Thread__m_eeAllocContext__combined_limit] ja AllocFailed ;; set the new alloc pointer @@ -165,7 +165,7 @@ FASTCALL_FUNC RhNewString, 8 mov ecx, eax add eax, [edx + OFFSETOF__Thread__m_alloc_context__alloc_ptr] jc StringAllocContextOverflow - cmp eax, [edx + OFFSETOF__Thread__m_alloc_context__alloc_limit] + cmp eax, [edx + OFFSETOF__Thread__m_eeAllocContext__combined_limit] ja StringAllocContextOverflow ; ECX == allocation size @@ -282,7 +282,7 @@ ArrayAlignSize: mov ecx, eax add eax, [edx + OFFSETOF__Thread__m_alloc_context__alloc_ptr] jc ArrayAllocContextOverflow - cmp eax, [edx + OFFSETOF__Thread__m_alloc_context__alloc_limit] + cmp eax, [edx + OFFSETOF__Thread__m_eeAllocContext__combined_limit] ja ArrayAllocContextOverflow ; ECX == array size diff --git a/src/coreclr/nativeaot/Runtime/i386/AsmMacros.inc b/src/coreclr/nativeaot/Runtime/i386/AsmMacros.inc index 896bf8e67dab53..9541f73940215a 100644 --- a/src/coreclr/nativeaot/Runtime/i386/AsmMacros.inc +++ b/src/coreclr/nativeaot/Runtime/i386/AsmMacros.inc @@ -140,8 +140,8 @@ STATUS_REDHAWK_THREAD_ABORT equ 43h ;; ;; Rename fields of nested structs ;; -OFFSETOF__Thread__m_alloc_context__alloc_ptr equ OFFSETOF__Thread__m_rgbAllocContextBuffer + OFFSETOF__gc_alloc_context__alloc_ptr -OFFSETOF__Thread__m_alloc_context__alloc_limit equ OFFSETOF__Thread__m_rgbAllocContextBuffer + 
OFFSETOF__gc_alloc_context__alloc_limit +OFFSETOF__Thread__m_alloc_context__alloc_ptr equ OFFSETOF__Thread__m_eeAllocContext + OFFSETOF__ee_alloc_context__m_rgbAllocContextBuffer + OFFSETOF__gc_alloc_context__alloc_ptr +OFFSETOF__Thread__m_eeAllocContext__combined_limit equ OFFSETOF__Thread__m_eeAllocContext + OFFSETOF__ee_alloc_context__combined_limit ;; ;; CONSTANTS -- SYMBOLS diff --git a/src/coreclr/nativeaot/Runtime/inc/rhbinder.h b/src/coreclr/nativeaot/Runtime/inc/rhbinder.h index db238e24acbc16..6cf67845d86d30 100644 --- a/src/coreclr/nativeaot/Runtime/inc/rhbinder.h +++ b/src/coreclr/nativeaot/Runtime/inc/rhbinder.h @@ -496,15 +496,15 @@ struct PInvokeTransitionFrame #define PInvokeTransitionFrame_MAX_SIZE (sizeof(PInvokeTransitionFrame) + (POINTER_SIZE * PInvokeTransitionFrame_SaveRegs_count)) #ifdef TARGET_AMD64 -#define OFFSETOF__Thread__m_pTransitionFrame 0x40 +#define OFFSETOF__Thread__m_pTransitionFrame 0x48 #elif defined(TARGET_ARM64) -#define OFFSETOF__Thread__m_pTransitionFrame 0x40 +#define OFFSETOF__Thread__m_pTransitionFrame 0x48 #elif defined(TARGET_LOONGARCH64) -#define OFFSETOF__Thread__m_pTransitionFrame 0x40 +#define OFFSETOF__Thread__m_pTransitionFrame 0x48 #elif defined(TARGET_X86) -#define OFFSETOF__Thread__m_pTransitionFrame 0x2c +#define OFFSETOF__Thread__m_pTransitionFrame 0x30 #elif defined(TARGET_ARM) -#define OFFSETOF__Thread__m_pTransitionFrame 0x2c +#define OFFSETOF__Thread__m_pTransitionFrame 0x30 #endif typedef DPTR(MethodTable) PTR_EEType; diff --git a/src/coreclr/nativeaot/Runtime/loongarch64/AllocFast.S b/src/coreclr/nativeaot/Runtime/loongarch64/AllocFast.S index dc344183e927ba..5f03faa6938490 100644 --- a/src/coreclr/nativeaot/Runtime/loongarch64/AllocFast.S +++ b/src/coreclr/nativeaot/Runtime/loongarch64/AllocFast.S @@ -10,8 +10,8 @@ GC_ALLOC_FINALIZE = 1 // // Rename fields of nested structs // -OFFSETOF__Thread__m_alloc_context__alloc_ptr = OFFSETOF__Thread__m_rgbAllocContextBuffer + OFFSETOF__gc_alloc_context__alloc_ptr -OFFSETOF__Thread__m_alloc_context__alloc_limit = OFFSETOF__Thread__m_rgbAllocContextBuffer + OFFSETOF__gc_alloc_context__alloc_limit +OFFSETOF__Thread__m_alloc_context__alloc_ptr = OFFSETOF__Thread__m_eeAllocContext + OFFSETOF__ee_alloc_context__m_rgbAllocContextBuffer + OFFSETOF__gc_alloc_context__alloc_ptr +OFFSETOF__Thread__m_eeAllocContext__combined_limit = OFFSETOF__Thread__m_eeAllocContext + OFFSETOF__ee_alloc_context__combined_limit @@ -44,7 +44,7 @@ OFFSETOF__Thread__m_alloc_context__alloc_limit = OFFSETOF__Thread__m_rgbAll // Determine whether the end of the object would lie outside of the current allocation context. If so, // we abandon the attempt to allocate the object directly and fall back to the slow helper. add.d $a2, $a2, $t3 - ld.d $t4, $a1, OFFSETOF__Thread__m_alloc_context__alloc_limit + ld.d $t4, $a1, OFFSETOF__Thread__m_eeAllocContext__combined_limit bltu $t4, $a2, RhpNewFast_RarePath // Update the alloc pointer to account for the allocation. @@ -137,7 +137,7 @@ NewOutOfMemory: // Determine whether the end of the object would lie outside of the current allocation context. If so, // we abandon the attempt to allocate the object directly and fall back to the slow helper. add.d $a2, $a2, $t3 - ld.d $t3, $a3, OFFSETOF__Thread__m_alloc_context__alloc_limit + ld.d $t3, $a3, OFFSETOF__Thread__m_eeAllocContext__combined_limit bltu $t3, $a2, RhNewString_Rare // Reload new object address into r12. 
@@ -199,7 +199,7 @@ RhNewString_Rare: // Determine whether the end of the object would lie outside of the current allocation context. If so, // we abandon the attempt to allocate the object directly and fall back to the slow helper. add.d $a2, $a2, $t3 - ld.d $t3, $a3, OFFSETOF__Thread__m_alloc_context__alloc_limit + ld.d $t3, $a3, OFFSETOF__Thread__m_eeAllocContext__combined_limit bltu $t3, $a2, RhpNewArray_Rare // Reload new object address into t3. diff --git a/src/coreclr/nativeaot/Runtime/portable.cpp b/src/coreclr/nativeaot/Runtime/portable.cpp index 318a10fd20a526..d42a6ecc67efe8 100644 --- a/src/coreclr/nativeaot/Runtime/portable.cpp +++ b/src/coreclr/nativeaot/Runtime/portable.cpp @@ -64,8 +64,9 @@ FCIMPL1(Object *, RhpNewFast, MethodTable* pEEType) size_t size = pEEType->GetBaseSize(); uint8_t* alloc_ptr = acontext->alloc_ptr; - ASSERT(alloc_ptr <= acontext->alloc_limit); - if ((size_t)(acontext->alloc_limit - alloc_ptr) >= size) + uint8_t* combined_limit = pCurThread->GetEEAllocContext()->GetCombinedLimit(); + ASSERT(alloc_ptr <= combined_limit); + if ((size_t)(combined_limit - alloc_ptr) >= size) { acontext->alloc_ptr = alloc_ptr + size; Object* pObject = (Object *)alloc_ptr; @@ -112,8 +113,9 @@ FCIMPL2(Array *, RhpNewArray, MethodTable * pArrayEEType, int numElements) size = ALIGN_UP(size, sizeof(uintptr_t)); uint8_t* alloc_ptr = acontext->alloc_ptr; - ASSERT(alloc_ptr <= acontext->alloc_limit); - if ((size_t)(acontext->alloc_limit - alloc_ptr) >= size) + uint8_t* combined_limit = pCurThread->GetEEAllocContext()->GetCombinedLimit(); + ASSERT(alloc_ptr <= combined_limit); + if ((size_t)(combined_limit - alloc_ptr) >= size) { acontext->alloc_ptr = alloc_ptr + size; Array* pObject = (Array*)alloc_ptr; @@ -165,8 +167,9 @@ FCIMPL1(Object*, RhpNewFastAlign8, MethodTable* pEEType) paddedSize += 12; } - ASSERT(alloc_ptr <= acontext->alloc_limit); - if ((size_t)(acontext->alloc_limit - alloc_ptr) >= paddedSize) + uint8_t* combined_limit = pCurThread->GetEEAllocContext()->GetCombinedLimit(); + ASSERT(alloc_ptr <= combined_limit); + if ((size_t)(combined_limit - alloc_ptr) >= paddedSize) { acontext->alloc_ptr = alloc_ptr + paddedSize; if (requiresPadding) @@ -199,8 +202,9 @@ FCIMPL1(Object*, RhpNewFastMisalign, MethodTable* pEEType) paddedSize += 12; } - ASSERT(alloc_ptr <= acontext->alloc_limit); - if ((size_t)(acontext->alloc_limit - alloc_ptr) >= paddedSize) + uint8_t* combined_limit = pCurThread->GetEEAllocContext()->GetCombinedLimit(); + ASSERT(alloc_ptr <= combined_limit); + if ((size_t)(combined_limit - alloc_ptr) >= paddedSize) { acontext->alloc_ptr = alloc_ptr + paddedSize; if (requiresPadding) @@ -248,8 +252,9 @@ FCIMPL2(Array*, RhpNewArrayAlign8, MethodTable* pArrayEEType, int numElements) paddedSize += 12; } - ASSERT(alloc_ptr <= acontext->alloc_limit); - if ((size_t)(acontext->alloc_limit - alloc_ptr) >= paddedSize) + uint8_t* combined_limit = pCurThread->GetEEAllocContext()->GetCombinedLimit(); + ASSERT(alloc_ptr <= combined_limit); + if ((size_t)(combined_limit - alloc_ptr) >= paddedSize) { acontext->alloc_ptr = alloc_ptr + paddedSize; if (requiresAlignObject) diff --git a/src/coreclr/nativeaot/Runtime/thread.cpp b/src/coreclr/nativeaot/Runtime/thread.cpp index b796b052182260..ccc76b5df5cfef 100644 --- a/src/coreclr/nativeaot/Runtime/thread.cpp +++ b/src/coreclr/nativeaot/Runtime/thread.cpp @@ -353,6 +353,7 @@ void Thread::Detach() gc_alloc_context* context = GetAllocContext(); s_DeadThreadsNonAllocBytes += context->alloc_limit - context->alloc_ptr; 
GCHeapUtilities::GetGCHeap()->FixAllocContext(context, NULL, NULL); + GetEEAllocContext()->UpdateCombinedLimit(); SetDetached(); } diff --git a/src/coreclr/nativeaot/Runtime/thread.h b/src/coreclr/nativeaot/Runtime/thread.h index 4c0a21e9f9ab7f..70f776de2ee9a1 100644 --- a/src/coreclr/nativeaot/Runtime/thread.h +++ b/src/coreclr/nativeaot/Runtime/thread.h @@ -83,9 +83,43 @@ struct InlinedThreadStaticRoot TypeManager* m_typeManager; }; +// This struct allows adding some state that is only visible to the EE onto the standard gc_alloc_context +struct ee_alloc_context +{ + // Any allocation that would overlap combined_limit needs to be handled by the allocation slow path. + // combined_limit is the minimum of: + // - gc_alloc_context.alloc_limit (the end of the current AC) + // - the sampling_limit + // + // In the simple case that randomized sampling is disabled, combined_limit is always equal to alloc_limit. + // + // There are two different useful interpretations for the sampling_limit. One is to treat the sampling_limit + // as an address and when we allocate an object that overlaps that address we should emit a sampling event. + // The other is that we can treat (sampling_limit - alloc_ptr) as a budget of how many bytes we can allocate + // before emitting a sampling event. If we always allocated objects contiguously in the AC and incremented + // alloc_ptr by the size of the object, these two interpretations would be equivalent. However, when objects + // don't fit in the AC we allocate them in some other address range. The budget interpretation is more + // flexible to handle those cases. + // + // The sampling limit isn't stored in any separate field explicitly, instead it is implied: + // - if combined_limit == alloc_limit there is no sampled byte in the AC. In the budget interpretation + // we can allocate (alloc_limit - alloc_ptr) unsampled bytes. We'll need a new random number after + // that to determine whether future allocated bytes should be sampled. + // This occurs either because the sampling feature is disabled, or because the randomized selection + // of sampled bytes didn't select a byte in this AC. + // - if combined_limit < alloc_limit there is a sample limit in the AC. sample_limit = combined_limit. + uint8_t* combined_limit; + uint8_t m_rgbAllocContextBuffer[SIZEOF_ALLOC_CONTEXT]; + + gc_alloc_context* GetGCAllocContext(); + uint8_t* GetCombinedLimit(); + void UpdateCombinedLimit(); +}; + + struct RuntimeThreadLocals { - uint8_t m_rgbAllocContextBuffer[SIZEOF_ALLOC_CONTEXT]; + ee_alloc_context m_eeAllocContext; uint32_t volatile m_ThreadStateFlags; // see Thread::ThreadStateFlags enum PInvokeTransitionFrame* m_pTransitionFrame; PInvokeTransitionFrame* m_pDeferredTransitionFrame; // see Thread::EnablePreemptiveMode @@ -215,6 +249,7 @@ class Thread : private RuntimeThreadLocals bool IsInitialized(); + ee_alloc_context * GetEEAllocContext(); gc_alloc_context * GetAllocContext(); uint64_t GetPalThreadIdForLogging(); diff --git a/src/coreclr/nativeaot/Runtime/thread.inl b/src/coreclr/nativeaot/Runtime/thread.inl index 2daffd06922134..f63810e999f09a 100644 --- a/src/coreclr/nativeaot/Runtime/thread.inl +++ b/src/coreclr/nativeaot/Runtime/thread.inl @@ -2,6 +2,36 @@ // The .NET Foundation licenses this file to you under the MIT license. 
#ifndef DACCESS_COMPILE + + + +inline gc_alloc_context* ee_alloc_context::GetGCAllocContext() +{ + return (gc_alloc_context*)&m_rgbAllocContextBuffer; +} + +inline uint8_t* ee_alloc_context::GetCombinedLimit() +{ + return combined_limit; +} + +// It seems like there is a desire not to include a definition of gc_alloc_context in a more global place within +// the NativeAOT runtime? Instead some individual files include their own definition as needed and others reference +// gcinterface.h to get the official definition. This .inl file gets included from multiple places some of which +// do define the type and others that do not. To avoid getting a redefinition error I added this private definition. +struct _thread_inl_gc_alloc_context +{ + uint8_t* alloc_ptr; + uint8_t* alloc_limit; +}; + +void ee_alloc_context::UpdateCombinedLimit() +{ + // The randomized allocation sampling feature is being submitted in stages. For now sampling is never enabled so + // combined_limit is always the same as alloc_limit. + combined_limit = ((_thread_inl_gc_alloc_context*)GetGCAllocContext())->alloc_limit; +} + // Set the m_pDeferredTransitionFrame field for GC allocation helpers that setup transition frame // in assembly code. Do not use anywhere else. inline void Thread::SetDeferredTransitionFrame(PInvokeTransitionFrame* pTransitionFrame) @@ -59,9 +89,14 @@ inline void Thread::PopGCFrameRegistration(GCFrameRegistration* pRegistration) m_pGCFrameRegistrations = pRegistration->m_pNext; } +inline ee_alloc_context* Thread::GetEEAllocContext() +{ + return &m_eeAllocContext; +} + inline gc_alloc_context* Thread::GetAllocContext() { - return (gc_alloc_context*)m_rgbAllocContextBuffer; + return GetEEAllocContext()->GetGCAllocContext(); } inline bool Thread::IsStateSet(ThreadStateFlags flags) diff --git a/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosamd64.inc b/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosamd64.inc index f8ec8f5037b1b2..b4c0a74d45509b 100644 --- a/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosamd64.inc +++ b/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosamd64.inc @@ -240,8 +240,8 @@ C_FUNC(\Name): // // Rename fields of nested structs // -#define OFFSETOF__Thread__m_alloc_context__alloc_ptr OFFSETOF__Thread__m_rgbAllocContextBuffer + OFFSETOF__gc_alloc_context__alloc_ptr -#define OFFSETOF__Thread__m_alloc_context__alloc_limit OFFSETOF__Thread__m_rgbAllocContextBuffer + OFFSETOF__gc_alloc_context__alloc_limit +#define OFFSETOF__Thread__m_alloc_context__alloc_ptr OFFSETOF__Thread__m_eeAllocContext + OFFSETOF__ee_alloc_context__m_rgbAllocContextBuffer + OFFSETOF__gc_alloc_context__alloc_ptr +#define OFFSETOF__Thread__m_eeAllocContext__combined_limit OFFSETOF__Thread__m_eeAllocContext + OFFSETOF__ee_alloc_context__combined_limit // GC type flags #define GC_ALLOC_FINALIZE 1 diff --git a/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosarm.inc b/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosarm.inc index 68631819f7dee4..4ccd38b19c7bef 100644 --- a/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosarm.inc +++ b/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosarm.inc @@ -28,8 +28,8 @@ #define TrapThreadsFlags_TrapThreads 2 // Rename fields of nested structs -#define OFFSETOF__Thread__m_alloc_context__alloc_ptr (OFFSETOF__Thread__m_rgbAllocContextBuffer + OFFSETOF__gc_alloc_context__alloc_ptr) -#define OFFSETOF__Thread__m_alloc_context__alloc_limit (OFFSETOF__Thread__m_rgbAllocContextBuffer + OFFSETOF__gc_alloc_context__alloc_limit) +#define OFFSETOF__Thread__m_alloc_context__alloc_ptr 
(OFFSETOF__Thread__m_eeAllocContext + OFFSETOF__ee_alloc_context__m_rgbAllocContextBuffer + OFFSETOF__gc_alloc_context__alloc_ptr) +#define OFFSETOF__Thread__m_eeAllocContext__combined_limit (OFFSETOF__Thread__m_eeAllocContext + OFFSETOF__ee_alloc_context__combined_limit) // GC minimal sized object. We use this to switch between 4 and 8 byte alignment in the GC heap (see AllocFast.asm). #define SIZEOF__MinObject 12
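For illustration only, the fast-path pattern that the helpers above implement (in assembly for each architecture and in portable.cpp) can be sketched in standalone C++. This is a minimal sketch, not the runtime's actual definitions: the struct bodies are simplified stand-ins, and TryAllocFast is a hypothetical helper name used here to summarize the bump-allocation check against combined_limit.

// Illustrative sketch: simplified stand-ins for gc_alloc_context / ee_alloc_context,
// showing the fast-path check against combined_limit and the UpdateCombinedLimit
// behavior while sampling is disabled. Not the actual runtime code.
#include <cstddef>
#include <cstdint>
#include <cstdio>

struct gc_alloc_context      // simplified: the real GC struct has additional fields
{
    uint8_t* alloc_ptr;
    uint8_t* alloc_limit;
};

struct ee_alloc_context      // simplified: the real struct embeds a GC-sized buffer
{
    // Any allocation that would cross combined_limit must take the slow path.
    uint8_t*         combined_limit;
    gc_alloc_context gc_context;

    void UpdateCombinedLimit()
    {
        // Sampling is not enabled in this stage of the feature, so the combined
        // limit simply tracks the GC's alloc_limit. A later stage would set it to
        // min(alloc_limit, sampling_limit).
        combined_limit = gc_context.alloc_limit;
    }
};

// Fast-path bump allocation: succeeds only if the object fits below combined_limit.
// Returns nullptr to signal that the caller must fall back to the slow helper,
// which refills the allocation context and (eventually) emits sampling events.
static void* TryAllocFast(ee_alloc_context& ctx, size_t size)
{
    uint8_t* ptr = ctx.gc_context.alloc_ptr;
    if ((size_t)(ctx.combined_limit - ptr) >= size)
    {
        ctx.gc_context.alloc_ptr = ptr + size;
        return ptr;
    }
    return nullptr; // slow path
}

int main()
{
    // Fake allocation context backed by a local buffer, purely for demonstration.
    alignas(16) static uint8_t buffer[256];
    ee_alloc_context ctx{};
    ctx.gc_context.alloc_ptr   = buffer;
    ctx.gc_context.alloc_limit = buffer + sizeof(buffer);
    ctx.UpdateCombinedLimit();

    void* obj = TryAllocFast(ctx, 64);
    printf("fast path %s\n", obj != nullptr ? "succeeded" : "fell back to slow path");
    return 0;
}

Because sampling is never enabled in this change, combined_limit always equals alloc_limit and the fast paths behave exactly as before; only the slow-path trigger is rerouted so that future sampling logic can live entirely in the slow path.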