Skip to content

Commit

Permalink
Android & ARM64: FEATURE_EMULATED_TLS (#8323)
Browse files Browse the repository at this point in the history
Android on ARM64 doesn't follow the recommended way to implement Thread Local Storage (TLS). Instead, it uses a custom solution whenever a __thread variable is used. This solution requires that a C function is always called internally when a TLS variable is accessed. While the compiler and linker take care of all the C/C++ code, the helper code written in assembler needs to be modified.

This implements the changes for the current-thread and thunk-data TLS variables. It would also work on systems that follow the recommended implementation, but because it has a larger overhead than the current inlined solution, it must be explicitly activated for systems with this issue by defining FEATURE_EMULATED_TLS.
  • Loading branch information
RalfKornmannEnvision authored and jkotas committed Sep 23, 2020
1 parent 619784c commit 6d9f01c
Show file tree
Hide file tree
Showing 8 changed files with 154 additions and 94 deletions.
4 changes: 0 additions & 4 deletions src/coreclr/src/nativeaot/Runtime/ThunksMapping.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -234,10 +234,6 @@ EXTERN_C REDHAWK_API void* __cdecl RhAllocateThunksMapping()

// FEATURE_RX_THUNKS
#elif FEATURE_FIXED_POOL_THUNKS

// This thread local variable is used for delegate marshalling
DECLSPEC_THREAD intptr_t tls_thunkData;

// This is used by the thunk code to find the stub data for the called thunk slot
extern "C" uintptr_t g_pThunkStubData;
uintptr_t g_pThunkStubData = NULL;
Expand Down
12 changes: 12 additions & 0 deletions src/coreclr/src/nativeaot/Runtime/arm64/AllocFast.S
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,11 @@ OFFSETOF__Thread__m_alloc_context__alloc_limit = OFFSETOF__Thread__m_rgbAll
LEAF_ENTRY RhpNewFast, _TEXT

// x1 = GetThread()
#ifdef FEATURE_EMULATED_TLS
GETTHREAD_ETLS_1
#else
INLINE_GETTHREAD x1
#endif

//
// x0 contains EEType pointer
Expand Down Expand Up @@ -135,7 +139,11 @@ NewOutOfMemory:
// x1 == element count
// x2 == string size

#ifdef FEATURE_EMULATED_TLS
GETTHREAD_ETLS_3
#else
INLINE_GETTHREAD x3
#endif

// Load potential new object address into x12.
ldr x12, [x3, #OFFSETOF__Thread__m_alloc_context__alloc_ptr]
Expand Down Expand Up @@ -196,7 +204,11 @@ StringSizeOverflow:
// x1 == element count
// x2 == array size

#ifdef FEATURE_EMULATED_TLS
GETTHREAD_ETLS_3
#else
INLINE_GETTHREAD x3
#endif

// Load potential new object address into x12.
ldr x12, [x3, #OFFSETOF__Thread__m_alloc_context__alloc_ptr]
Expand Down
36 changes: 32 additions & 4 deletions src/coreclr/src/nativeaot/Runtime/arm64/ExceptionHandling.S
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,11 @@
ALLOC_THROW_FRAME HARDWARE_EXCEPTION

// x2 = GetThread()
#ifdef FEATURE_EMULATED_TLS
GETTHREAD_ETLS_2
#else
INLINE_GETTHREAD x2
#endif

add x1, sp, #rsp_offsetof_ExInfo // x1 <- ExInfo*
str xzr, [x1, #OFFSETOF__ExInfo__m_exception] // pExInfo->m_exception = null
Expand Down Expand Up @@ -259,7 +263,11 @@
ALLOC_THROW_FRAME SOFTWARE_EXCEPTION

// x2 = GetThread()
#ifdef FEATURE_EMULATED_TLS
GETTHREAD_ETLS_2
#else
INLINE_GETTHREAD x2
#endif

// There is runtime C# code that can tail call to RhpThrowEx using a binder intrinsic. So the return
// address could have been hijacked when we were in that C# code and we must remove the hijack and
Expand Down Expand Up @@ -349,7 +357,11 @@ NotHijacked:
ALLOC_THROW_FRAME SOFTWARE_EXCEPTION

// x2 = GetThread()
#ifdef FEATURE_EMULATED_TLS
GETTHREAD_ETLS_2
#else
INLINE_GETTHREAD x2
#endif

add x1, sp, #rsp_offsetof_ExInfo // x1 <- ExInfo*
str xzr, [x1, #OFFSETOF__ExInfo__m_exception] // pExInfo->m_exception = null
Expand Down Expand Up @@ -408,7 +420,11 @@ NotHijacked:
//
// clear the DoNotTriggerGc flag, trashes x4-x6
//
INLINE_GETTHREAD x5 // x5 <- Thread*
#ifdef FEATURE_EMULATED_TLS
GETTHREAD_ETLS_5
#else
INLINE_GETTHREAD x5
#endif

ldr x4, [x5, #OFFSETOF__Thread__m_threadAbortException]
sub x4, x4, x0
Expand Down Expand Up @@ -447,7 +463,11 @@ ClearSuccess_Catch:

// @TODO: add debug-only validation code for ExInfo pop

INLINE_GETTHREAD x1 // x1 <- Thread*
#ifdef FEATURE_EMULATED_TLS
GETTHREAD_ETLS_1
#else
INLINE_GETTHREAD x1
#endif

// We must unhijack the thread at this point because the section of stack where the hijack is applied
// may go dead. If it does, then the next time we try to unhijack the thread, it will corrupt the stack.
Expand Down Expand Up @@ -517,7 +537,11 @@ NoAbort:
//
// clear the DoNotTriggerGc flag, trashes x2-x4
//
INLINE_GETTHREAD x2 // x2 <- Thread*
#ifdef FEATURE_EMULATED_TLS
GETTHREAD_ETLS_2
#else
INLINE_GETTHREAD x2
#endif

add x12, x2, #OFFSETOF__Thread__m_ThreadStateFlags

Expand Down Expand Up @@ -555,7 +579,11 @@ ClearSuccess:
//
// set the DoNotTriggerGc flag, trashes x1-x3
//
INLINE_GETTHREAD x2 // x2 <- Thread*
#ifdef FEATURE_EMULATED_TLS
GETTHREAD_ETLS_2
#else
INLINE_GETTHREAD x2
#endif

add x12, x2, #OFFSETOF__Thread__m_ThreadStateFlags
SetRetry:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,14 @@ POINTER_SIZE = 0x08
// Custom calling convention:
// xip0 pointer to the current thunk's data block (data contains 2 pointer values: context + target pointers)

#ifdef FEATURE_EMULATED_TLS
// This doesn't save and restore the floating point argument registers. If we encounter a
// target system that uses TLS emulation and modify these registers during this call we
// need to save and restore them, too
GETTHUNKDATA_ETLS_9
#else
INLINE_GET_TLS_VAR x9, tls_thunkData
#endif

// x9 = base address of TLS data
// xip0 = address of context cell in thunk's data
Expand All @@ -45,6 +52,7 @@ POINTER_SIZE = 0x08
LEAF_END RhGetCommonStubAddress, _TEXT


#ifndef FEATURE_EMULATED_TLS
//
// IntPtr RhGetCurrentThunkContext()
//
Expand All @@ -57,3 +65,4 @@ POINTER_SIZE = 0x08
ret

LEAF_END RhGetCurrentThunkContext, _TEXT
#endif //FEATURE_EMULATED_TLS
91 changes: 9 additions & 82 deletions src/coreclr/src/nativeaot/Runtime/arm64/PInvoke.S
Original file line number Diff line number Diff line change
Expand Up @@ -138,84 +138,6 @@ NoAbort:

NESTED_END RhpWaitForGC, _TEXT

//////////////////////////////////////////////////////////////////////////////////////////////////////////////
//
// RhpReversePInvoke
//
// IN: x9: address of reverse pinvoke frame
// 0: save slot for previous M->U transition frame
// 8: save slot for thread pointer to avoid re-calc in epilog sequence
//
// PRESERVES: x0 - x8 -- need to preserve these because the caller assumes they are not trashed
//
// TRASHES: x10, x11
//
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
LEAF_ENTRY RhpReversePInvoke, _TEXT

// Fetch the current Thread and stash it in the frame (slot at offset 8) so the matching
// return helper does not have to recompute it.
INLINE_GETTHREAD x10 // x10 = Thread
str x10, [x9, #8] // save Thread pointer for RhpReversePInvokeReturn

// Register roles for the rest of the function:
// x9 = reverse pinvoke frame
// x10 = thread
// x11 = scratch

// If the Attached bit of the thread state flags is clear, take the rare attach path.
ldr w11, [x10, #OFFSETOF__Thread__m_ThreadStateFlags]
tbz x11, #TSF_Attached_Bit, AttachThread

ThreadAttached:
//
// Check for the correct mode. Entry in the wrong mode is reachable via various odd things that we
// cannot completely prevent, such as:
// 1) Registering a reverse pinvoke entrypoint as a vectored exception handler
// 2) Performing a managed delegate invoke on a reverse pinvoke delegate.
//
// A null m_pTransitionFrame means there is no previous transition frame to save, so the
// transition is checked on the CheckBadTransition path instead.
ldr x11, [x10, #OFFSETOF__Thread__m_pTransitionFrame]
cbz x11, CheckBadTransition

// Save previous TransitionFrame prior to making the mode transition so that it is always valid
// whenever we might attempt to hijack this thread.
str x11, [x9]

// Clear m_pTransitionFrame, then issue a barrier so the store is ordered before the
// RhpTrapThreads read below.
str xzr, [x10, #OFFSETOF__Thread__m_pTransitionFrame]
dmb ish

// NOTE(review): presumably leaves the 32-bit value of RhpTrapThreads in w11 - confirm against
// the PREPARE_EXTERNAL_VAR_INDIRECT_W definition.
PREPARE_EXTERNAL_VAR_INDIRECT_W RhpTrapThreads, 11

// If the TrapThreads bit is set, undo the transition and go to the rare path.
tbnz x11, #TrapThreadsFlags_TrapThreads_Bit, TrapThread

// Common case: transition done, nothing trapped.
ret

CheckBadTransition:
// Allow 'bad transitions' in when the TSF_DoNotTriggerGc mode is set. This allows us to have
// [NativeCallable] methods that are called via the "restricted GC callouts" as well as from native,
// which is necessary because the methods are CCW vtable methods on interfaces passed to native.
ldr w11, [x10, #OFFSETOF__Thread__m_ThreadStateFlags]
tbz x11, #TSF_DoNotTriggerGc_Bit, BadTransition

// zero-out our 'previous transition frame' save slot
mov x11, #0
str x11, [x9]

// nothing more to do
ret

TrapThread:
// put the previous frame back (sets us back to preemptive mode)
ldr x11, [x9]
str x11, [x10, #OFFSETOF__Thread__m_pTransitionFrame]
dmb ish
// falls through into AttachThread: both rare cases share the same helper

AttachThread:
// passing address of reverse pinvoke frame in x9
// (tail call: the helper returns directly to our caller)
b RhpReversePInvokeAttachOrTrapThread

BadTransition:
// Tail call to the bad-transition handler, passing our return address as its argument.
mov x0, lr // arg <- return address
b RhpReversePInvokeBadTransition

LEAF_END RhpReversePInvoke, _TEXT

//////////////////////////////////////////////////////////////////////////////////////////////////////////////
//
// RhpReversePInvokeAttachOrTrapThread -- rare path for RhpPInvoke
Expand Down Expand Up @@ -287,10 +209,15 @@ NESTED_ENTRY RhpPInvoke, _TEXT, NoHandler
str x9, [x0, #OFFSETOF__PInvokeTransitionFrame__m_Flags]

// get TLS global variable address
// r0 = GetThread()
INLINE_GETTHREAD x10
str x10, [x0, #OFFSETOF__PInvokeTransitionFrame__m_pThread]
str x0, [x10, #OFFSETOF__Thread__m_pTransitionFrame]

#ifdef FEATURE_EMULATED_TLS
GETTHREAD_ETLS_1
#else
INLINE_GETTHREAD x1
#endif

str x1, [x0, #OFFSETOF__PInvokeTransitionFrame__m_pThread]
str x0, [x1, #OFFSETOF__Thread__m_pTransitionFrame]

PREPARE_EXTERNAL_VAR_INDIRECT_W RhpTrapThreads, 9

Expand Down
12 changes: 12 additions & 0 deletions src/coreclr/src/nativeaot/Runtime/threadstore.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,18 @@ EXTERN_C DECLSPEC_THREAD ThreadBuffer tls_CurrentThread =
0, // all other fields are initialized by zeroes
};

#ifdef FEATURE_EMULATED_TLS

// When the platform has no full TLS support, the compiler may emulate it.
// In that case we need a C function that the assembler code can call to obtain the correct
// value for the current thread.

// Returns the address of the calling thread's tls_CurrentThread buffer.
// Exposed with C linkage so that code which cannot reference the thread-local
// variable directly can obtain it through an ordinary function call.
EXTERN_C ThreadBuffer* RhpGetThread()
{
    ThreadBuffer* pCurrentThread = &tls_CurrentThread;
    return pCurrentThread;
}

#endif // FEATURE_EMULATED_TLS

#endif // !DACCESS_COMPILE

#ifdef _WIN32
Expand Down
12 changes: 12 additions & 0 deletions src/coreclr/src/nativeaot/Runtime/unix/PalRedhawkUnix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,18 @@ thread_local TlsDestructionMonitor tls_destructionMonitor;
// This thread local variable is used for delegate marshalling
DECLSPEC_THREAD intptr_t tls_thunkData;

#ifdef FEATURE_EMULATED_TLS
// Returns the address of the calling thread's tls_thunkData slot.
// Exposed with C linkage so that code which cannot reference the thread-local
// variable directly can locate the slot through an ordinary function call.
EXTERN_C intptr_t* RhpGetThunkData()
{
    intptr_t* pThunkData = &tls_thunkData;
    return pThunkData;
}

// Returns the current value stored in the calling thread's tls_thunkData slot
// (the value itself, not the slot's address).
EXTERN_C intptr_t RhGetCurrentThunkContext()
{
    const intptr_t thunkContext = tls_thunkData;
    return thunkContext;
}
#endif //FEATURE_EMULATED_TLS

// Attach thread to PAL.
// It can be called multiple times for the same thread.
// It fails fast if a different thread was already registered.
Expand Down
72 changes: 68 additions & 4 deletions src/coreclr/src/nativeaot/Runtime/unix/unixasmmacrosarm64.inc
Original file line number Diff line number Diff line change
Expand Up @@ -172,13 +172,77 @@ C_FUNC(\Name):
add \target, \target, #:tprel_lo12_nc:\var
.endm

// INLINE_GETTHREAD target
//   Expands INLINE_GET_TLS_VAR for the tls_CurrentThread variable, leaving the result in
//   \target. Call sites use that value as the current Thread pointer ("xN = GetThread()").
.macro INLINE_GETTHREAD target
INLINE_GET_TLS_VAR \target, tls_CurrentThread
.endm

.macro PREPARE_INLINE_GETTHREAD
.global tls_CurrentThread
// GETTHREAD_ETLS_1
//   Emulated-TLS way of getting the current thread: calls the C helper RhpGetThread
//   and leaves its return value in x1.
//
//   OUT:       x1 = value returned by RhpGetThread
//   PRESERVES: x0 (spilled to the temporary frame), fp, lr
//   Other caller-saved registers are not protected across the helper call.
.macro GETTHREAD_ETLS_1
    PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, #-32    // open a 32-byte frame and store FP/LR
    str     x0, [sp, #16]                        // keep x0 alive across the call

    bl      RhpGetThread
    mov     x1, x0                               // result goes to x1 before x0 is reloaded

    ldr     x0, [sp, #16]
    EPILOG_RESTORE_REG_PAIR_INDEXED fp, lr, #32  // reload FP/LR and close the frame
.endm

.macro INLINE_GETTHREAD target
INLINE_GET_TLS_VAR \target, tls_CurrentThread
// GETTHREAD_ETLS_2
//   Emulated-TLS way of getting the current thread: calls the C helper RhpGetThread
//   and leaves its return value in x2.
//
//   OUT:       x2 = value returned by RhpGetThread
//   PRESERVES: x0, x1 (spilled to the temporary frame), fp, lr
//   Other caller-saved registers are not protected across the helper call.
.macro GETTHREAD_ETLS_2
    PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, #-32    // open a 32-byte frame and store FP/LR
    stp     x0, x1, [sp, #16]                    // keep x0/x1 alive across the call

    bl      RhpGetThread
    mov     x2, x0                               // result goes to x2 before x0 is reloaded

    ldp     x0, x1, [sp, #16]
    EPILOG_RESTORE_REG_PAIR_INDEXED fp, lr, #32  // reload FP/LR and close the frame
.endm

// GETTHREAD_ETLS_3
//   Emulated-TLS way of getting the current thread: calls the C helper RhpGetThread
//   and leaves its return value in x3.
//
//   OUT:       x3 = value returned by RhpGetThread
//   PRESERVES: x0, x1, x2 (spilled to the temporary frame), fp, lr
//   Other caller-saved registers are not protected across the helper call.
.macro GETTHREAD_ETLS_3
    PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, #-48    // open a 48-byte frame and store FP/LR
    stp     x0, x1, [sp, #16]                    // keep x0-x2 alive across the call
    str     x2, [sp, #32]

    bl      RhpGetThread
    mov     x3, x0                               // result goes to x3 before x0 is reloaded

    ldr     x2, [sp, #32]
    ldp     x0, x1, [sp, #16]
    EPILOG_RESTORE_REG_PAIR_INDEXED fp, lr, #48  // reload FP/LR and close the frame
.endm

// GETTHREAD_ETLS_5
//   Emulated-TLS way of getting the current thread: calls the C helper RhpGetThread
//   and leaves its return value in x5.
//
//   OUT:       x5 = value returned by RhpGetThread
//   PRESERVES: x0-x4 (spilled to the temporary frame), fp, lr
//   Other caller-saved registers are not protected across the helper call.
.macro GETTHREAD_ETLS_5
    PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, #-64    // open a 64-byte frame and store FP/LR
    stp     x0, x1, [sp, #16]                    // keep x0-x4 alive across the call
    stp     x2, x3, [sp, #32]
    str     x4, [sp, #48]

    bl      RhpGetThread
    mov     x5, x0                               // result goes to x5 before x0 is reloaded

    ldr     x4, [sp, #48]
    ldp     x2, x3, [sp, #32]
    ldp     x0, x1, [sp, #16]
    EPILOG_RESTORE_REG_PAIR_INDEXED fp, lr, #64  // reload FP/LR and close the frame
.endm

// GETTHUNKDATA_ETLS_9
//   Emulated-TLS replacement for "INLINE_GET_TLS_VAR x9, tls_thunkData": calls the C helper
//   RhpGetThunkData and leaves its return value (the address of the current thread's
//   tls_thunkData slot) in x9.
//
//   The macro is expanded inside a thunk stub that forwards its caller's arguments, so every
//   register that can carry an argument must survive the helper call:
//     x0-x7   integer arguments
//     x8      indirect result location register
//     xip0    pointer to the current thunk's data block (custom calling convention)
//     q0-q7   floating-point / SIMD arguments. v0-v7 are caller-saved under AAPCS64, so the
//             helper (and anything the TLS emulation calls) is free to clobber them; they are
//             saved as full 128-bit values so vector arguments survive as well.
//
//   OUT:       x9 = value returned by RhpGetThunkData
//   PRESERVES: x0-x8, xip0, q0-q7, fp, lr
//   Remaining caller-saved registers are not protected across the helper call.
//
//   Frame layout (224 bytes, keeps sp 16-byte aligned):
//     0x00 fp, lr | 0x10-0x5F x0-x8, xip0 | 0x60-0xDF q0-q7
.macro GETTHUNKDATA_ETLS_9
    PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, #-224   // Push down stack pointer and store FP and LR
    stp     x0, x1, [sp, #0x10]
    stp     x2, x3, [sp, #0x20]
    stp     x4, x5, [sp, #0x30]
    stp     x6, x7, [sp, #0x40]
    stp     x8, xip0, [sp, #0x50]
    stp     q0, q1, [sp, #0x60]                  // FP/SIMD argument registers
    stp     q2, q3, [sp, #0x80]
    stp     q4, q5, [sp, #0xA0]
    stp     q6, q7, [sp, #0xC0]

    bl      RhpGetThunkData
    mov     x9, x0                               // capture the result before x0 is reloaded

    ldp     q6, q7, [sp, #0xC0]
    ldp     q4, q5, [sp, #0xA0]
    ldp     q2, q3, [sp, #0x80]
    ldp     q0, q1, [sp, #0x60]
    ldp     x8, xip0, [sp, #0x50]
    ldp     x6, x7, [sp, #0x40]
    ldp     x4, x5, [sp, #0x30]
    ldp     x2, x3, [sp, #0x20]
    ldp     x0, x1, [sp, #0x10]
    EPILOG_RESTORE_REG_PAIR_INDEXED fp, lr, #224
.endm

.macro ArmInterlockedOperationBarrier
Expand Down

0 comments on commit 6d9f01c

Please sign in to comment.