
[NativeAOT] Support variable page size on Linux Arm64 #88710

Merged · 8 commits · Jul 14, 2023
Changes from 7 commits
@@ -25,7 +25,7 @@
//
// With FEATURE_RX_THUNKS, thunks are created by allocating new virtual memory space, where the first half of
// that space is filled with thunk stubs, and gets RX permissions, and the second half is for the thunks data,
// and gets RW permissions. The thunk stubs and data blocks are not in grouped in pairs:
// and gets RW permissions. The thunk stubs and data blocks are not grouped in pairs:
// all the thunk stub blocks are grouped at the beginning of the allocated virtual memory space, and all the
// thunk data blocks are grouped in the second half of the virtual space.
//
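For orientation, the address relationship this layout implies can be sketched as follows. This is an illustrative C++ sketch only, not code from the PR; it assumes the mapping is split exactly in half and that stub and data blocks appear in the same order in each half, which is what the comment above describes.

// Illustrative sketch (not from the PR): with all stub blocks grouped in the first
// half of a mapping and all data blocks grouped in the second half, a stub block's
// data block sits exactly half a mapping above it.
#include <cstdint>

uintptr_t GetThunkDataBlockAddress(uintptr_t thunkStubsBlock,
                                   uintptr_t thunkBlockSize,      // one block (page-sized or larger)
                                   unsigned  numBlocksPerMapping) // blocks in each half
{
    uintptr_t halfMappingSize = thunkBlockSize * numBlocksPerMapping;
    return thunkStubsBlock + halfMappingSize;
}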
@@ -40,20 +40,12 @@ namespace System.Runtime
{
internal static class Constants
{
#if TARGET_ARM64 && (TARGET_OSX || TARGET_MACCATALYST || TARGET_IOS || TARGET_TVOS)
public const uint PageSize = 0x4000; // 16k
public const nuint PageSizeMask = 0x3FFF;
#else
public const uint PageSize = 0x1000; // 4k
public const nuint PageSizeMask = 0xFFF;
#endif
public const uint AllocationGranularity = 0x10000; // 64k
public const nuint AllocationGranularityMask = 0xFFFF;

public static readonly int ThunkDataSize = 2 * IntPtr.Size;
public static readonly int ThunkCodeSize = InternalCalls.RhpGetThunkSize();
public static readonly int NumThunksPerBlock = InternalCalls.RhpGetNumThunksPerBlock();
public static readonly int NumThunkBlocksPerMapping = InternalCalls.RhpGetNumThunkBlocksPerMapping();
public static readonly uint ThunkBlockSize = (uint)InternalCalls.RhpGetThunkBlockSize();
public static readonly nuint ThunkBlockSizeMask = ThunkBlockSize - 1;
}

internal class ThunksHeap
@@ -105,11 +97,11 @@ private unsafe ThunksHeap(IntPtr commonStubAddress)
IntPtr thunkDataBlock = InternalCalls.RhpGetThunkDataBlockAddress(thunkStubsBlock);

// Address of the first thunk data cell should be at the beginning of the thunks data block (page-aligned)
Debug.Assert(((nuint)(nint)thunkDataBlock % Constants.PageSize) == 0);
Debug.Assert(((nuint)(nint)thunkDataBlock % Constants.ThunkBlockSize) == 0);

// Update the last pointer value in the thunks data section with the value of the common stub address
*(IntPtr*)(thunkDataBlock + (int)(Constants.PageSize - IntPtr.Size)) = commonStubAddress;
Debug.Assert(*(IntPtr*)(thunkDataBlock + (int)(Constants.PageSize - IntPtr.Size)) == commonStubAddress);
*(IntPtr*)(thunkDataBlock + (int)(Constants.ThunkBlockSize - IntPtr.Size)) = commonStubAddress;
Debug.Assert(*(IntPtr*)(thunkDataBlock + (int)(Constants.ThunkBlockSize - IntPtr.Size)) == commonStubAddress);

// Set the head and end of the linked list
_nextAvailableThunkPtr = thunkDataBlock;
@@ -161,11 +153,11 @@ private unsafe bool ExpandHeap()
IntPtr thunkDataBlock = InternalCalls.RhpGetThunkDataBlockAddress(thunkStubsBlock);

// Address of the first thunk data cell should be at the beginning of the thunks data block (page-aligned)
Debug.Assert(((nuint)(nint)thunkDataBlock % Constants.PageSize) == 0);
Debug.Assert(((nuint)(nint)thunkDataBlock % Constants.ThunkBlockSize) == 0);

// Update the last pointer value in the thunks data section with the value of the common stub address
*(IntPtr*)(thunkDataBlock + (int)(Constants.PageSize - IntPtr.Size)) = _commonStubAddress;
Debug.Assert(*(IntPtr*)(thunkDataBlock + (int)(Constants.PageSize - IntPtr.Size)) == _commonStubAddress);
*(IntPtr*)(thunkDataBlock + (int)(Constants.ThunkBlockSize - IntPtr.Size)) = _commonStubAddress;
Debug.Assert(*(IntPtr*)(thunkDataBlock + (int)(Constants.ThunkBlockSize - IntPtr.Size)) == _commonStubAddress);

// Link the last entry in the old list to the first entry in the new list
*((IntPtr*)_lastThunkPtr) = thunkDataBlock;
@@ -220,7 +212,7 @@ public unsafe IntPtr AllocateThunk()
*((IntPtr*)(nextAvailableThunkPtr + IntPtr.Size)) = IntPtr.Zero;
#endif

int thunkIndex = (int)(((nuint)(nint)nextAvailableThunkPtr) - ((nuint)(nint)nextAvailableThunkPtr & ~Constants.PageSizeMask));
int thunkIndex = (int)(((nuint)(nint)nextAvailableThunkPtr) - ((nuint)(nint)nextAvailableThunkPtr & ~Constants.ThunkBlockSizeMask));
Debug.Assert((thunkIndex % Constants.ThunkDataSize) == 0);
thunkIndex /= Constants.ThunkDataSize;

@@ -279,7 +271,7 @@ private static IntPtr TryGetThunkDataAddress(IntPtr thunkAddress)
nuint thunkAddressValue = (nuint)(nint)ClearThumbBit(thunkAddress);

// Compute the base address of the thunk's mapping
nuint currentThunksBlockAddress = thunkAddressValue & ~Constants.PageSizeMask;
nuint currentThunksBlockAddress = thunkAddressValue & ~Constants.ThunkBlockSizeMask;

// Make sure the thunk address is valid by checking alignment
if ((thunkAddressValue - currentThunksBlockAddress) % (nuint)Constants.ThunkCodeSize != 0)
24 changes: 7 additions & 17 deletions src/coreclr/nativeaot/Runtime/CommonMacros.h
@@ -132,42 +132,32 @@ inline bool IS_ALIGNED(T* val, uintptr_t alignment);
#endif

#ifndef __GCENV_BASE_INCLUDED__

#if defined(HOST_WASM)
#define OS_PAGE_SIZE 0x4
#else
#define OS_PAGE_SIZE PalOsPageSize()
#endif

#if defined(HOST_AMD64)

#define DATA_ALIGNMENT 8
#define OS_PAGE_SIZE 0x1000

#elif defined(HOST_X86)

#define DATA_ALIGNMENT 4
#ifndef OS_PAGE_SIZE
#define OS_PAGE_SIZE 0x1000
#endif

#elif defined(HOST_ARM)

#define DATA_ALIGNMENT 4
#ifndef OS_PAGE_SIZE
#define OS_PAGE_SIZE 0x1000
#endif

#elif defined(HOST_ARM64)

#define DATA_ALIGNMENT 8
#ifndef OS_PAGE_SIZE
#ifdef HOST_APPLE
#define OS_PAGE_SIZE 0x4000
#else
#define OS_PAGE_SIZE 0x1000
#endif
#endif

#elif defined(HOST_WASM)

#define DATA_ALIGNMENT 4
#ifndef OS_PAGE_SIZE
#define OS_PAGE_SIZE 0x4
#endif

#else
#error Unsupported target architecture
1 change: 1 addition & 0 deletions src/coreclr/nativeaot/Runtime/PalRedhawk.h
@@ -731,6 +731,7 @@ struct UNIX_CONTEXT;
#endif

#ifdef TARGET_UNIX
REDHAWK_PALIMPORT uint32_t REDHAWK_PALAPI PalGetOsPageSize();
REDHAWK_PALIMPORT void REDHAWK_PALAPI PalSetHardwareExceptionHandler(PHARDWARE_EXCEPTION_HANDLER handler);
#else
REDHAWK_PALIMPORT void* REDHAWK_PALAPI PalAddVectoredExceptionHandler(uint32_t firstHandler, _In_ PVECTORED_EXCEPTION_HANDLER vectoredHandler);
5 changes: 3 additions & 2 deletions src/coreclr/nativeaot/Runtime/ThunksMapping.cpp
@@ -28,7 +28,8 @@

static_assert((THUNK_SIZE % 4) == 0, "Thunk stubs size not aligned correctly. This will cause runtime failures.");

#define THUNKS_MAP_SIZE 0x8000 // 32 K
// 32 K or OS page
#define THUNKS_MAP_SIZE (max(0x8000, OS_PAGE_SIZE))

#ifdef TARGET_ARM
//*****************************************************************************
@@ -56,7 +57,7 @@ void EncodeThumb2Mov32(uint16_t * pCode, uint32_t value, uint8_t rDestination)

COOP_PINVOKE_HELPER(int, RhpGetNumThunkBlocksPerMapping, ())
{
static_assert((THUNKS_MAP_SIZE % OS_PAGE_SIZE) == 0, "Thunks map size should be in multiples of pages");
ASSERT_MSG((THUNKS_MAP_SIZE % OS_PAGE_SIZE) == 0, "Thunks map size should be in multiples of pages");
Member:
IIRC, 64kB is a common page size on Linux Arm64. It is going to assert there.

Ask @janvorli if you need help getting Linux arm64 machine configured like that for testing.

Member Author (@VSadov, Jul 12, 2023):
with #define THUNKS_MAP_SIZE (max(0x8000, OS_PAGE_SIZE)) would it assert?

It would still be interesting to run this on an actual system with 64K pages.

Member:
You are right. I got it backwards.


return THUNKS_MAP_SIZE / OS_PAGE_SIZE;
}
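A quick sanity check of the divisibility the assert requires, for the page sizes discussed in this thread (a standalone illustrative C++ check, not code from the PR):

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <initializer_list>

int main()
{
    for (unsigned pageSize : { 0x1000u, 0x4000u, 0x10000u })  // 4K, 16K and 64K pages
    {
        unsigned mapSize = std::max(0x8000u, pageSize);       // THUNKS_MAP_SIZE after this change
        assert(mapSize % pageSize == 0);                      // the ASSERT_MSG above holds
        std::printf("page %#x -> map %#x, %u blocks per mapping\n",
                    pageSize, mapSize, mapSize / pageSize);
    }
    return 0;
}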
4 changes: 2 additions & 2 deletions src/coreclr/nativeaot/Runtime/allocheap.cpp
@@ -137,7 +137,7 @@ uint8_t * AllocHeap::_Alloc(
#endif // FEATURE_RWX_MEMORY

ASSERT((alignment & (alignment - 1)) == 0); // Power of 2 only.
ASSERT(alignment <= OS_PAGE_SIZE); // Can't handle this right now.
ASSERT((int32_t)alignment <= OS_PAGE_SIZE); // Can't handle this right now.
ASSERT((m_rwProtectType == m_roProtectType) == (pRWAccessHolder == NULL));
ASSERT(!_UseAccessManager() || pRWAccessHolder != NULL);

@@ -276,7 +276,7 @@ bool AllocHeap::_UpdateMemPtrs(uint8_t* pNextFree)
//-------------------------------------------------------------------------------------------------
bool AllocHeap::_AllocNewBlock(uintptr_t cbMem)
{
cbMem = ALIGN_UP(max(cbMem, s_minBlockSize), OS_PAGE_SIZE);;
cbMem = ALIGN_UP(cbMem, OS_PAGE_SIZE);

uint8_t * pbMem = reinterpret_cast<uint8_t*>
(PalVirtualAlloc(NULL, cbMem, MEM_COMMIT, m_roProtectType));
2 changes: 0 additions & 2 deletions src/coreclr/nativeaot/Runtime/allocheap.h
@@ -74,8 +74,6 @@ class AllocHeap
bool _UpdateMemPtrs(uint8_t* pNextFree);
bool _UseAccessManager() { return m_rwProtectType != m_roProtectType; }

static const uintptr_t s_minBlockSize = OS_PAGE_SIZE;

typedef rh::util::MemRange Block;
typedef DPTR(Block) PTR_Block;
struct BlockListElem : public Block
6 changes: 3 additions & 3 deletions src/coreclr/nativeaot/Runtime/amd64/MiscStubs.S
@@ -15,7 +15,7 @@
//
// See also https://github.com/dotnet/runtime/issues/9899 for more information.

#define PAGE_SIZE 0x1000
#define PROBE_STEP 0x1000

NESTED_ENTRY RhpStackProbe, _TEXT, NoHandler
// On entry:
@@ -32,11 +32,11 @@ NESTED_ENTRY RhpStackProbe, _TEXT, NoHandler

END_PROLOGUE

and rsp, -PAGE_SIZE // rsp points to the **lowest address** on the last probed page
and rsp, -PROBE_STEP // rsp points to the **lowest address** on the last probed page
// This is done to make the following loop end condition simpler.

LOCAL_LABEL(ProbeLoop):
sub rsp, PAGE_SIZE // rsp points to the lowest address of the **next page** to probe
sub rsp, PROBE_STEP // rsp points to the lowest address of the **next page** to probe
test dword ptr [rsp], eax // rsp points to the lowest address on the **last probed** page
cmp rsp, r11
jg LOCAL_LABEL(ProbeLoop) // if (rsp > r11), then we need to probe at least one more page.
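The rename makes it explicit that the probe granularity is a fixed 4K step rather than the OS page size. In rough C++ terms, the loop above behaves like the following sketch (an approximation for illustration only; the real helper operates directly on rsp and preserves registers):

#include <cstdint>

constexpr uintptr_t PROBE_STEP = 0x1000;

// Touch one word every PROBE_STEP bytes, from the current stack pointer down to the
// lowest address of the frame being allocated (r11 in the assembly above).
void ProbeStack(volatile char* sp, volatile char* lowestAddress)
{
    // Align down to a probe-step boundary to keep the loop's end condition simple.
    sp = (volatile char*)((uintptr_t)sp & ~(PROBE_STEP - 1));
    do
    {
        sp -= PROBE_STEP;   // step down to the next probe location
        (void)*sp;          // touch it so stack guard pages are hit in order
    } while (sp > lowestAddress);
}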
6 changes: 3 additions & 3 deletions src/coreclr/nativeaot/Runtime/amd64/MiscStubs.asm
@@ -11,7 +11,7 @@ include AsmMacros.inc
;
; NOTE: this helper will NOT modify a value of rsp and can be defined as a leaf function.

PAGE_SIZE equ 1000h
PROBE_STEP equ 1000h

LEAF_ENTRY RhpStackProbe, _TEXT
; On entry:
@@ -24,11 +24,11 @@ LEAF_ENTRY RhpStackProbe, _TEXT
; NOTE: this helper will probe at least one page below the one pointed by rsp.

mov rax, rsp ; rax points to some byte on the last probed page
and rax, -PAGE_SIZE ; rax points to the **lowest address** on the last probed page
and rax, -PROBE_STEP ; rax points to the **lowest address** on the last probed page
; This is done to make the following loop end condition simpler.

ProbeLoop:
sub rax, PAGE_SIZE ; rax points to the lowest address of the **next page** to probe
sub rax, PROBE_STEP ; rax points to the lowest address of the **next page** to probe
test dword ptr [rax], eax ; rax points to the lowest address on the **last probed** page
cmp rax, r11
jg ProbeLoop ; If (rax > r11), then we need to probe at least one more page.
8 changes: 4 additions & 4 deletions src/coreclr/nativeaot/Runtime/arm/MiscStubs.S
@@ -20,20 +20,20 @@
// r5 - is not preserved
//
// NOTE: this helper will probe at least one page below the one pointed to by sp.
#define PROBE_PAGE_SIZE 4096
#define PROBE_PAGE_SIZE_LOG2 12
#define PROBE_STEP 4096
#define PROBE_STEP_LOG2 12

LEAF_ENTRY RhpStackProbe, _TEXT
PROLOG_PUSH "{r7}"
PROLOG_STACK_SAVE r7

mov r5, sp // r5 points to some byte on the last probed page
bfc r5, #0, #PROBE_PAGE_SIZE_LOG2 // r5 points to the **lowest address** on the last probed page
bfc r5, #0, #PROBE_STEP_LOG2 // r5 points to the **lowest address** on the last probed page
mov sp, r5

ProbeLoop:
// Immediate operand for the following instruction can not be greater than 4095.
sub sp, #(PROBE_PAGE_SIZE - 4) // sp points to the **fourth** byte on the **next page** to probe
sub sp, #(PROBE_STEP - 4) // sp points to the **fourth** byte on the **next page** to probe
ldr r5, [sp, #-4]! // sp points to the lowest address on the **last probed** page
cmp sp, r4
bhi ProbeLoop // If (sp > r4), then we need to probe at least one more page.
6 changes: 3 additions & 3 deletions src/coreclr/nativeaot/Runtime/i386/MiscStubs.S
@@ -16,7 +16,7 @@
// NOTE: this helper will modify a value of esp and must establish the frame pointer.
// NOTE: On Linux we must advance the stack pointer as we probe - it is not allowed to access 65535 bytes below esp.
//
#define PAGE_SIZE 0x1000
#define PROBE_STEP 0x1000
NESTED_ENTRY RhpStackProbe, _TEXT, NoHandler
// On entry:
// eax - the lowest address of the stack frame being allocated (i.e. [InitialSp - FrameSize])
@@ -25,11 +25,11 @@ NESTED_ENTRY RhpStackProbe, _TEXT, NoHandler
PROLOG_BEG
PROLOG_END

and esp, -PAGE_SIZE // esp points to the **lowest address** on the last probed page
and esp, -PROBE_STEP // esp points to the **lowest address** on the last probed page
// This is done to make the loop end condition simpler.

LOCAL_LABEL(ProbeLoop):
sub esp, PAGE_SIZE // esp points to the lowest address of the **next page** to probe
sub esp, PROBE_STEP // esp points to the lowest address of the **next page** to probe
test [esp], eax // esp points to the lowest address on the **last probed** page
cmp esp, eax
jg LOCAL_LABEL(ProbeLoop) // if esp > eax, then we need to probe at least one more page.
6 changes: 3 additions & 3 deletions src/coreclr/nativeaot/Runtime/i386/MiscStubs.asm
@@ -15,7 +15,7 @@ include AsmMacros.inc
; The call to the helper will be emitted by JIT in the function prolog when large (larger than 0x3000 bytes) stack frame is required.
;
; NOTE: this helper will modify a value of esp and must establish the frame pointer.
PAGE_SIZE equ 1000h
PROBE_STEP equ 1000h

_RhpStackProbe PROC public
; On entry:
@@ -25,10 +25,10 @@ _RhpStackProbe PROC public
push ebp
mov ebp, esp

and esp, -PAGE_SIZE ; esp points to the **lowest address** on the last probed page
and esp, -PROBE_STEP ; esp points to the **lowest address** on the last probed page
; This is done to make the loop end condition simpler.
ProbeLoop:
sub esp, PAGE_SIZE ; esp points to the lowest address of the **next page** to probe
sub esp, PROBE_STEP ; esp points to the lowest address of the **next page** to probe
test [esp], eax ; esp points to the lowest address on the **last probed** page
cmp esp, eax
jg ProbeLoop ; if esp > eax, then we need to probe at least one more page.
14 changes: 14 additions & 0 deletions src/coreclr/nativeaot/Runtime/unix/PalRedhawkInline.h
@@ -113,3 +113,17 @@ FORCEINLINE void PalSetLastError(int32_t error)
{
errno = error;
}

FORCEINLINE int32_t PalOsPageSize()
{
#if defined(HOST_AMD64)
// all supported platforms use 4K pages on x64, including emulated environments
return 0x1000;
#elif defined(HOST_APPLE)
// OSX and related OS expose 16-kilobyte pages to the 64-bit userspace
// https://developer.apple.com/library/archive/documentation/Performance/Conceptual/ManagingMemory/Articles/AboutMemory.html
return 0x4000;
#else
return PalGetOsPageSize();
#endif
}
23 changes: 21 additions & 2 deletions src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp
@@ -384,6 +384,24 @@ void InitializeCurrentProcessCpuCount()
g_RhNumberOfProcessors = count;
}

static uint32_t g_RhPageSize;

void InitializeOsPageSize()
{
g_RhPageSize = (uint32_t)sysconf(_SC_PAGE_SIZE);
Member:

Perhaps better:

#ifdef OS_PAGE_SIZE
    g_RhPageSize = (uint32_t)OS_PAGE_SIZE;
#else
    g_RhPageSize = (uint32_t)sysconf(_SC_PAGE_SIZE);
#endif

and then delete PalOsPageSize and use PalGetOsPageSize everywhere?

nit: Os -> OS (two-letter abbreviation rule)

Member Author (@VSadov, Jul 14, 2023):
> delete PalOsPageSize and use PalGetOsPageSize everywhere?

I think we need one that inlines and one that does not (and one calls the other, if needed).
I did not replace uses of OS_PAGE_SIZE, though, as that would be a lot of changes, and keeping OS_PAGE_SIZE more closely matches the similar pattern in the GC code.


#if defined(HOST_AMD64)
ASSERT(g_RhPageSize == 0x1000);
#elif defined(HOST_APPLE)
ASSERT(g_RhPageSize == 0x4000);
#endif
}

REDHAWK_PALEXPORT uint32_t REDHAWK_PALAPI PalGetOsPageSize()
{
return g_RhPageSize;
}
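The split the author describes above, one page-size accessor that can inline to a constant and one that reads a value cached at startup, can be sketched roughly like this (names and preprocessor guards here are simplified assumptions, not the PR's exact code):

#include <cstdint>
#include <unistd.h>

static uint32_t g_cachedPageSize;              // filled once during startup

void InitializePageSize()                      // called once early in initialization
{
    g_cachedPageSize = (uint32_t)sysconf(_SC_PAGE_SIZE);
}

uint32_t GetPageSizeExported()                 // out-of-line accessor, callable from anywhere
{
    return g_cachedPageSize;
}

inline int32_t PageSizeInline()                // header-inline fast path
{
#if defined(__x86_64__)
    return 0x1000;                             // constant where the platform guarantees 4K pages
#elif defined(__APPLE__)
    return 0x4000;                             // Apple exposes 16K pages to 64-bit userspace
#else
    return (int32_t)GetPageSizeExported();     // otherwise use the cached runtime value
#endif
}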

#if defined(TARGET_LINUX) || defined(TARGET_ANDROID)
static pthread_key_t key;
#endif
@@ -412,6 +430,8 @@ REDHAWK_PALEXPORT bool REDHAWK_PALAPI PalInit()

InitializeCurrentProcessCpuCount();

InitializeOsPageSize();

#if defined(TARGET_LINUX) || defined(TARGET_ANDROID)
if (pthread_key_create(&key, RuntimeThreadShutdown) != 0)
{
@@ -872,13 +892,12 @@ REDHAWK_PALEXPORT void PalFlushInstructionCache(_In_ void* pAddress, size_t size
//
// As a workaround, we call __builtin___clear_cache on each page separately.

const size_t pageSize = getpagesize();
uint8_t* begin = (uint8_t*)pAddress;
uint8_t* end = begin + size;

while (begin < end)
{
uint8_t* endOrNextPageBegin = ALIGN_UP(begin + 1, pageSize);
uint8_t* endOrNextPageBegin = ALIGN_UP(begin + 1, OS_PAGE_SIZE);
if (endOrNextPageBegin > end)
endOrNextPageBegin = end;

5 changes: 5 additions & 0 deletions src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
@@ -158,3 +158,8 @@ FORCEINLINE void PalYieldProcessor()
#endif

#define PalDebugBreak() __debugbreak()

FORCEINLINE int32_t PalOsPageSize()
{
return 0x1000;
}