Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve the handling of SVE state as part of threadsuspend #105059

Merged
merged 4 commits into from
Jul 22, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 80 additions & 10 deletions src/coreclr/nativeaot/Runtime/windows/PalRedhawkMinWin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,32 @@ EXTERN_C VOID __cdecl RtlRestoreContextFallback(PCONTEXT ContextRecord, struct _
typedef BOOL(WINAPI* PINITIALIZECONTEXT2)(PVOID Buffer, DWORD ContextFlags, PCONTEXT* Context, PDWORD ContextLength, ULONG64 XStateCompactionMask);
PINITIALIZECONTEXT2 pfnInitializeContext2 = NULL;

#ifdef TARGET_ARM64
// Mirror the XSTATE_ARM64_SVE flags from winnt.h.
// Every definition below is wrapped in #ifndef, so it only takes effect when
// the headers already included have not provided the macro themselves.

// Feature index of the ARM64 SVE state; used as the shift amount for the mask below.
#ifndef XSTATE_ARM64_SVE
#define XSTATE_ARM64_SVE (2)
#endif // XSTATE_ARM64_SVE

// 64-bit single-bit mask selecting the SVE feature (1 << XSTATE_ARM64_SVE).
#ifndef XSTATE_MASK_ARM64_SVE
#define XSTATE_MASK_ARM64_SVE (1ui64 << (XSTATE_ARM64_SVE))
#endif // XSTATE_MASK_ARM64_SVE

// ARM64 context flag for the extended-state (XSTATE) portion of a CONTEXT.
#ifndef CONTEXT_ARM64_XSTATE
#define CONTEXT_ARM64_XSTATE (CONTEXT_ARM64 | 0x20L)
#endif // CONTEXT_ARM64_XSTATE

#ifndef CONTEXT_XSTATE
#define CONTEXT_XSTATE CONTEXT_ARM64_XSTATE
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Windows doesn't define CONTEXT_XSTATE for arm64; it only defines CONTEXT_ARM64_XSTATE. This alias is primarily here to avoid a lot of additional ifdefs elsewhere in the file.

#endif // CONTEXT_XSTATE

// Pointer to GetEnabledXStateFeatures. Starts as NULL and is looked up in
// kernel32.dll via GetProcAddress the first time a complete OS context is
// allocated; callers check it for NULL before calling through it.
typedef DWORD64(WINAPI* PGETENABLEDXSTATEFEATURES)();
PGETENABLEDXSTATEFEATURES pfnGetEnabledXStateFeatures = NULL;

// Pointer to SetXStateFeaturesMask. Starts as NULL and is looked up in
// kernel32.dll via GetProcAddress when a complete thread context is requested;
// callers check it for NULL before calling through it.
typedef BOOL(WINAPI* PSETXSTATEFEATURESMASK)(PCONTEXT Context, DWORD64 FeatureMask);
PSETXSTATEFEATURESMASK pfnSetXStateFeaturesMask = NULL;
#endif // TARGET_ARM64

#ifdef TARGET_X86
EXTERN_C VOID __cdecl RtlRestoreContextFallback(PCONTEXT ContextRecord, struct _EXCEPTION_RECORD* ExceptionRecord);
typedef VOID(__cdecl* PRTLRESTORECONTEXT)(PCONTEXT ContextRecord, struct _EXCEPTION_RECORD* ExceptionRecord);
Expand All @@ -478,7 +504,7 @@ REDHAWK_PALEXPORT CONTEXT* PalAllocateCompleteOSContext(_Out_ uint8_t** contextB
{
CONTEXT* pOSContext = NULL;

#if (defined(TARGET_X86) || defined(TARGET_AMD64))
#if defined(TARGET_X86) || defined(TARGET_AMD64) || defined(TARGET_ARM64)
DWORD context = CONTEXT_COMPLETE;

if (pfnInitializeContext2 == NULL)
Expand All @@ -490,6 +516,17 @@ REDHAWK_PALEXPORT CONTEXT* PalAllocateCompleteOSContext(_Out_ uint8_t** contextB
}
}

#if defined(TARGET_ARM64)
if (pfnGetEnabledXStateFeatures == NULL)
{
HMODULE hm = GetModuleHandleW(_T("kernel32.dll"));
if (hm != NULL)
{
pfnGetEnabledXStateFeatures = (PGETENABLEDXSTATEFEATURES)GetProcAddress(hm, "GetEnabledXStateFeatures");
}
}
#endif // TARGET_ARM64

#ifdef TARGET_X86
if (pfnRtlRestoreContext == NULL)
{
Expand All @@ -503,10 +540,27 @@ REDHAWK_PALEXPORT CONTEXT* PalAllocateCompleteOSContext(_Out_ uint8_t** contextB
}
#endif //TARGET_X86

// Determine if the processor supports AVX or AVX512 so we could
// retrieve extended registers
DWORD64 FeatureMask = GetEnabledXStateFeatures();
if ((FeatureMask & (XSTATE_MASK_AVX | XSTATE_MASK_AVX512)) != 0)
#if defined(TARGET_X86) || defined(TARGET_AMD64)
const DWORD64 xStateFeatureMask = XSTATE_MASK_AVX | XSTATE_MASK_AVX512;
const ULONG64 xStateCompactionMask = XSTATE_MASK_LEGACY | XSTATE_MASK_MPX | xStateFeatureMask;
#elif defined(TARGET_ARM64)
const DWORD64 xStateFeatureMask = XSTATE_MASK_ARM64_SVE;
const ULONG64 xStateCompactionMask = XSTATE_MASK_LEGACY | xStateFeatureMask;
#endif

// Determine if the processor supports extended features so we could retrieve those registers
DWORD64 FeatureMask = 0;

#if defined(TARGET_X86) || defined(TARGET_AMD64)
FeatureMask = GetEnabledXStateFeatures();
#elif defined(TARGET_ARM64)
if (pfnGetEnabledXStateFeatures != NULL)
{
FeatureMask = pfnGetEnabledXStateFeatures();
}
#endif

if ((FeatureMask & xStateFeatureMask) != 0)
{
context = context | CONTEXT_XSTATE;
}
Expand All @@ -517,7 +571,6 @@ REDHAWK_PALEXPORT CONTEXT* PalAllocateCompleteOSContext(_Out_ uint8_t** contextB

// Retrieve contextSize by passing NULL for Buffer
DWORD contextSize = 0;
ULONG64 xStateCompactionMask = XSTATE_MASK_LEGACY | XSTATE_MASK_AVX | XSTATE_MASK_MPX | XSTATE_MASK_AVX512;
// The initialize call should fail but return contextSize
BOOL success = pfnInitializeContext2 ?
pfnInitializeContext2(NULL, context, NULL, &contextSize, xStateCompactionMask) :
Expand Down Expand Up @@ -565,15 +618,32 @@ REDHAWK_PALEXPORT _Success_(return) bool REDHAWK_PALAPI PalGetCompleteThreadCont
{
_ASSERTE((pCtx->ContextFlags & CONTEXT_COMPLETE) == CONTEXT_COMPLETE);

#if defined(TARGET_X86) || defined(TARGET_AMD64)
// Make sure that AVX feature mask is set, if supported. This should not normally fail.
#if defined(TARGET_ARM64)
if (pfnSetXStateFeaturesMask == NULL)
{
HMODULE hm = GetModuleHandleW(_T("kernel32.dll"));
if (hm != NULL)
{
pfnSetXStateFeaturesMask = (PSETXSTATEFEATURESMASK)GetProcAddress(hm, "SetXStateFeaturesMask");
}
}
#endif // TARGET_ARM64

// This should not normally fail.
// The system silently ignores any feature specified in the FeatureMask which is not enabled on the processor.
#if defined(TARGET_X86) || defined(TARGET_AMD64)
if (!SetXStateFeaturesMask(pCtx, XSTATE_MASK_AVX | XSTATE_MASK_AVX512))
{
_ASSERTE(!"Could not apply XSTATE_MASK_AVX | XSTATE_MASK_AVX512");
return FALSE;
}
#endif //defined(TARGET_X86) || defined(TARGET_AMD64)
#elif defined(TARGET_ARM64)
if ((pfnSetXStateFeaturesMask != NULL) && !pfnSetXStateFeaturesMask(pCtx, XSTATE_MASK_ARM64_SVE))
{
_ASSERTE(!"Could not apply XSTATE_MASK_ARM64_SVE");
return FALSE;
}
#endif

return GetThreadContext(hThread, pCtx);
}
Expand Down Expand Up @@ -902,7 +972,7 @@ REDHAWK_PALEXPORT HANDLE PalLoadLibrary(const char* moduleName)
return 0;
}
moduleNameWide[len] = '\0';

HANDLE result = LoadLibraryExW(moduleNameWide, NULL, LOAD_WITH_ALTERED_SEARCH_PATH);
delete[] moduleNameWide;
return result;
Expand Down
8 changes: 4 additions & 4 deletions src/coreclr/pal/inc/pal.h
Original file line number Diff line number Diff line change
Expand Up @@ -1844,11 +1844,11 @@ typedef struct _IMAGE_ARM_RUNTIME_FUNCTION_ENTRY {
#define CONTEXT_EXCEPTION_REQUEST 0x40000000L
#define CONTEXT_EXCEPTION_REPORTING 0x80000000L

#define CONTEXT_XSTATE (CONTEXT_ARM64 | 0x40L)
#define CONTEXT_ARM64_XSTATE (CONTEXT_ARM64 | 0x20L)
#define CONTEXT_XSTATE CONTEXT_ARM64_XSTATE

#define XSTATE_SVE (0)

#define XSTATE_MASK_SVE (UI64(1) << (XSTATE_SVE))
#define XSTATE_ARM64_SVE (2)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think an additional set of changes may be needed in vm/arm64/asmconstants.h: there is currently a SIZEOF__CONTEXT definition that is meant to represent the size of the struct on Win32 vs. Unix, and I believe it is currently incorrect for Unix (or may need additional handling to ensure it's large enough to hold the additional SVE state based on the vector length).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That isn't related to the Windows handling being added here, however, so I've left it for a future PR.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure how it could be incorrect when there is an assert that it matches the sizeof (T_CONTEXT) in the asmconstants.h.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The consideration is that SVE registers don’t have a fixed width, so the current logic asserting the size statically is already wrong (it’s assuming SVE registers are exactly 128 bits, when they can be up to 2048 bits).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's true, but the constant is to represent the size of the CONTEXT structure and that's fixed for now, so it is correct. I believe that when we'll be adding support for longer SVE registers, we should finally move to the Windows way of separate context and extended context, so the SIZEOF__CONTEXT would still be constant - and it will actually match Windows one after that.

#define XSTATE_MASK_ARM64_SVE (UI64(1) << (XSTATE_ARM64_SVE))

//
// This flag is set by the unwinder if it has unwound to a call
Expand Down
11 changes: 6 additions & 5 deletions src/coreclr/pal/src/arch/arm64/asmconstants.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,14 @@

#define CONTEXT_FULL (CONTEXT_CONTROL | CONTEXT_INTEGER | CONTEXT_FLOATING_POINT)

#define CONTEXT_XSTATE_BIT (6)
#define CONTEXT_XSTATE (1 << CONTEXT_XSTATE_BIT)
// Bit position of the ARM64 extended-state (XSTATE) flag within ContextFlags,
// and the corresponding flag value. The flag is derived from its own *_BIT
// macro so this pair does not depend on the CONTEXT_XSTATE_BIT alias.
#define CONTEXT_ARM64_XSTATE_BIT (5)
#define CONTEXT_ARM64_XSTATE (1 << CONTEXT_ARM64_XSTATE_BIT)

#define XSTATE_SVE_BIT (0)

#define XSTATE_MASK_SVE (UI64(1) << (XSTATE_SVE))
#define CONTEXT_XSTATE_BIT CONTEXT_ARM64_XSTATE_BIT
#define CONTEXT_XSTATE CONTEXT_ARM64_XSTATE

#define XSTATE_ARM64_SVE_BIT (2)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this change needed?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The general CONTEXT struct as is currently used by PAL is supposed to mirror the win32 layout/defines so they can translate 1-to-1

The win32 naming convention here is then XSTATE_ARM64_SVE and it is using bit 2, not bit 1

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The win32 naming convention here is then XSTATE_ARM64_SVE and it is using bit 2, not bit 1

So the wrong bit was set for Linux in #103801 as well?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right.

I don't think it would technically cause problems because we're just emulating the CONTEXT struct and it doesn't actually have to match the Win32 struct 1-to-1 (since it's only going to be other PAL APIs consuming it).

But it's overall better/simpler to ensure they match so we don't end up with potential conflicts or other issues.

The best option here would be to just consume the native context_t struct from Unix directly (and not do any of this shimming), but that's a more involved PR.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it doesn't actually have to match the Win32 struct 1-to-1

In fact, it does not match the Win32 struct 1-to-1: #103801 (comment)

The best option here would be to just consume the native context_t struct from Unix directly (and not do any of this shimming), but that's a more involved PR.

+1. The Windows CONTEXT structure is very entrenched throughout CoreCLR VM.

#define XSTATE_MASK_ARM64_SVE (UI64(1) << (XSTATE_ARM64_SVE_BIT))

#define CONTEXT_ContextFlags 0
#define CONTEXT_Cpsr CONTEXT_ContextFlags+4
Expand Down
4 changes: 2 additions & 2 deletions src/coreclr/pal/src/arch/arm64/context2.S
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ LOCAL_LABEL(Done_CONTEXT_FLOATING_POINT):
b.ne LOCAL_LABEL(Done_CONTEXT_SVE)

ldr x1, [x0, CONTEXT_XSTATEFEATURESMASK_OFFSET]
tbz x1, #XSTATE_SVE_BIT, LOCAL_LABEL(Done_CONTEXT_SVE)
tbz x1, #XSTATE_ARM64_SVE_BIT, LOCAL_LABEL(Done_CONTEXT_SVE)

add x0, x0, CONTEXT_SVE_OFFSET
str p0, [x0, CONTEXT_P0_VL, MUL VL]
Expand Down Expand Up @@ -195,7 +195,7 @@ LOCAL_LABEL(Restore_CONTEXT_FLOATING_POINT):
tbz w17, #CONTEXT_XSTATE_BIT, LOCAL_LABEL(No_Restore_CONTEXT_SVE)

ldr w17, [x16, CONTEXT_XSTATEFEATURESMASK_OFFSET]
tbz w17, #XSTATE_SVE_BIT, LOCAL_LABEL(No_Restore_CONTEXT_SVE)
tbz w17, #XSTATE_ARM64_SVE_BIT, LOCAL_LABEL(No_Restore_CONTEXT_SVE)

add x16, x16, CONTEXT_SVE_OFFSET
ldr p0, [x16, CONTEXT_FFR_VL, MUL VL]
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/pal/src/exception/machexception.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -689,7 +689,7 @@ HijackFaultingThread(

void **targetSP = (void **)threadContext.Rsp;
#elif defined(HOST_ARM64)
threadContext.ContextFlags = CONTEXT_FLOATING_POINT;
// Select the floating-point and extended-state (XSTATE) flags before the NEON
// thread state is converted into threadContext.
threadContext.ContextFlags = CONTEXT_FLOATING_POINT | CONTEXT_XSTATE;
CONTEXT_GetThreadContextFromThreadState(ARM_NEON_STATE64, (thread_state_t)&exceptionInfo.FloatState, &threadContext);

threadContext.ContextFlags |= CONTEXT_CONTROL | CONTEXT_INTEGER;
Expand Down
4 changes: 2 additions & 2 deletions src/coreclr/pal/src/thread/context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -817,7 +817,7 @@ void CONTEXTToNativeContext(CONST CONTEXT *lpContext, native_context_t *native)
//TODO-SVE: This only handles vector lengths of 128bits.
if (CONTEXT_GetSveLengthFromOS() == 16)
{
_ASSERT((lpContext->XStateFeaturesMask & XSTATE_MASK_SVE) == XSTATE_MASK_SVE);
_ASSERT((lpContext->XStateFeaturesMask & XSTATE_MASK_ARM64_SVE) == XSTATE_MASK_ARM64_SVE);

uint16_t vq = sve_vq_from_vl(lpContext->Vl);

Expand Down Expand Up @@ -1169,7 +1169,7 @@ void CONTEXTFromNativeContext(const native_context_t *native, LPCONTEXT lpContex

uint16_t vq = sve_vq_from_vl(sve->vl);

lpContext->XStateFeaturesMask |= XSTATE_MASK_SVE;
lpContext->XStateFeaturesMask |= XSTATE_MASK_ARM64_SVE;

//Note: Size of ffr register is SVE_SIG_FFR_SIZE(vq) bytes.
lpContext->Ffr = *(WORD*) (((uint8_t*)sve) + SVE_SIG_FFR_OFFSET(vq));
Expand Down
17 changes: 17 additions & 0 deletions src/coreclr/vm/ceemain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,17 @@ BOOL g_singleVersionHosting;
typedef BOOL(WINAPI* PINITIALIZECONTEXT2)(PVOID Buffer, DWORD ContextFlags, PCONTEXT* Context, PDWORD ContextLength, ULONG64 XStateCompactionMask);
PINITIALIZECONTEXT2 g_pfnInitializeContext2 = NULL;

#ifdef TARGET_ARM64
// XState entry points that are called through function pointers on ARM64.
// Each pointer starts as NULL and is assigned in
// InitializeOptionalWindowsAPIPointers from a GetProcAddress lookup on
// kernel32.dll.

// Pointer to GetEnabledXStateFeatures.
typedef DWORD64(WINAPI* PGETENABLEDXSTATEFEATURES)();
PGETENABLEDXSTATEFEATURES g_pfnGetEnabledXStateFeatures = NULL;

// Pointer to GetXStateFeaturesMask.
typedef BOOL(WINAPI* PGETXSTATEFEATURESMASK)(PCONTEXT Context, PDWORD64 FeatureMask);
PGETXSTATEFEATURESMASK g_pfnGetXStateFeaturesMask = NULL;

// Pointer to SetXStateFeaturesMask.
typedef BOOL(WINAPI* PSETXSTATEFEATURESMASK)(PCONTEXT Context, DWORD64 FeatureMask);
PSETXSTATEFEATURESMASK g_pfnSetXStateFeaturesMask = NULL;
#endif // TARGET_ARM64

static BOOLEAN WINAPI RtlDllShutdownInProgressFallback()
{
return g_fProcessDetach;
Expand All @@ -412,6 +423,12 @@ void InitializeOptionalWindowsAPIPointers()
HMODULE hm = GetModuleHandleW(_T("kernel32.dll"));
g_pfnInitializeContext2 = (PINITIALIZECONTEXT2)GetProcAddress(hm, "InitializeContext2");

#ifdef TARGET_ARM64
g_pfnGetEnabledXStateFeatures = (PGETENABLEDXSTATEFEATURES)GetProcAddress(hm, "GetEnabledXStateFeatures");
g_pfnGetXStateFeaturesMask = (PGETXSTATEFEATURESMASK)GetProcAddress(hm, "GetXStateFeaturesMask");
g_pfnSetXStateFeaturesMask = (PSETXSTATEFEATURESMASK)GetProcAddress(hm, "SetXStateFeaturesMask");
#endif // TARGET_ARM64

hm = GetModuleHandleW(_T("ntdll.dll"));
PRTLDLLSHUTDOWNINPROGRESS pfn = (PRTLDLLSHUTDOWNINPROGRESS)GetProcAddress(hm, "RtlDllShutdownInProgress");
if (pfn != NULL)
Expand Down
Loading
Loading