Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updating Unix to save/restore Avx512 state #83784

Merged
merged 17 commits into from
Mar 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/coreclr/gc/vxsort/isa_detection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,11 @@ SupportedISA DetermineSupportedISA()
// get xcr0 register
DWORD64 xcr0 = _xgetbv(0);

// get OS XState info
// get OS XState info
DWORD64 FeatureMask = GetEnabledXStateFeaturesHelper();

// get processor extended feature flag info
__cpuid(reg, 7);
__cpuidex(reg, 7, 0);
tannergooding marked this conversation as resolved.
Show resolved Hide resolved

// check if all of AVX2, AVX512F and AVX512DQ are supported by both processor and OS
if ((reg[EBX] & (AVX2 | AVX512F | AVX512DQ)) == (AVX2 | AVX512F | AVX512DQ) &&
Expand Down
3 changes: 3 additions & 0 deletions src/coreclr/nativeaot/Runtime/PalRedhawk.h
Original file line number Diff line number Diff line change
Expand Up @@ -795,6 +795,9 @@ REDHAWK_PALIMPORT int32_t __cdecl _stricmp(const char *string1, const char *stri
#ifdef TARGET_UNIX
// MSVC directly defines intrinsics for __cpuid and __cpuidex matching the below signatures
// We define matching signatures for use on Unix platforms.
//
// IMPORTANT: Unlike MSVC, Unix does not explicitly zero ECX for __cpuid

REDHAWK_PALIMPORT void __cpuid(int cpuInfo[4], int function_id);
REDHAWK_PALIMPORT void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id);
#else
Expand Down
29 changes: 26 additions & 3 deletions src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1031,7 +1031,7 @@ static void ActivationHandler(int code, siginfo_t* siginfo, void* context)
#endif
))
{
// Make sure that errno is not modified
// Make sure that errno is not modified
int savedErrNo = errno;
g_pHijackCallback((NATIVE_CONTEXT*)context, NULL);
errno = savedErrNo;
Expand Down Expand Up @@ -1275,12 +1275,16 @@ extern "C" uint64_t PalGetCurrentThreadIdForLogging()
}

#if defined(HOST_X86) || defined(HOST_AMD64)
// MSVC directly defines intrinsics for __cpuid and __cpuidex matching the below signatures
// We define matching signatures for use on Unix platforms.
//
// IMPORTANT: Unlike MSVC, Unix does not explicitly zero ECX for __cpuid

#if !__has_builtin(__cpuid)
REDHAWK_PALEXPORT void __cpuid(int cpuInfo[4], int function_id)
{
// Based on the Clang implementation provided in cpuid.h:
// https://github.com/llvm/llvm-project/blob/master/clang/lib/Headers/cpuid.h
// https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/cpuid.h

__asm(" cpuid\n" \
: "=a"(cpuInfo[0]), "=b"(cpuInfo[1]), "=c"(cpuInfo[2]), "=d"(cpuInfo[3]) \
Expand All @@ -1293,7 +1297,7 @@ REDHAWK_PALEXPORT void __cpuid(int cpuInfo[4], int function_id)
REDHAWK_PALEXPORT void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id)
{
// Based on the Clang implementation provided in cpuid.h:
// https://github.com/llvm/llvm-project/blob/master/clang/lib/Headers/cpuid.h
// https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/cpuid.h

__asm(" cpuid\n" \
: "=a"(cpuInfo[0]), "=b"(cpuInfo[1]), "=c"(cpuInfo[2]), "=d"(cpuInfo[3]) \
Expand All @@ -1314,8 +1318,26 @@ REDHAWK_PALEXPORT uint32_t REDHAWK_PALAPI xmmYmmStateSupport()
return ((eax & 0x06) == 0x06) ? 1 : 0;
}

#ifndef XSTATE_MASK_AVX512
#define XSTATE_MASK_AVX512 (0xE0) /* 0b1110_0000 */
#endif // XSTATE_MASK_AVX512

REDHAWK_PALEXPORT uint32_t REDHAWK_PALAPI avx512StateSupport()
{
#if defined(TARGET_APPLE)
// MacOS has specialized behavior where it reports AVX512 support but doesn't
// actually enable AVX512 until the first instruction is executed and does so
// on a per thread basis. It does this by catching the faulting instruction and
// checking for the EVEX encoding. The kmov instructions, despite being part
// of the AVX512 instruction set, are VEX encoded and don't trigger the enablement.
//
// See https://github.com/apple/darwin-xnu/blob/main/osfmk/i386/fpu.c#L174

// TODO-AVX512: Enabling this for OSX requires ensuring threads explicitly trigger
// the AVX-512 enablement so that arbitrary usage doesn't cause downstream problems

return false;
#else
DWORD eax;
__asm(" xgetbv\n" \
: "=a"(eax) /*output in eax*/\
Expand All @@ -1324,6 +1346,7 @@ REDHAWK_PALEXPORT uint32_t REDHAWK_PALAPI avx512StateSupport()
);
// check OS has enabled XMM, YMM and ZMM state support
return ((eax & 0xE6) == 0x0E6) ? 1 : 0;
#endif
}

#endif // defined(HOST_X86) || defined(HOST_AMD64)
Expand Down
10 changes: 5 additions & 5 deletions src/coreclr/nativeaot/Runtime/windows/PalRedhawkMinWin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -365,17 +365,17 @@ REDHAWK_PALEXPORT CONTEXT* PalAllocateCompleteOSContext(_Out_ uint8_t** contextB
}
#endif //TARGET_X86

// Determine if the processor supports AVX so we could
// Determine if the processor supports AVX or AVX512 so we could
// retrieve extended registers
DWORD64 FeatureMask = GetEnabledXStateFeatures();
if ((FeatureMask & XSTATE_MASK_AVX) != 0)
if ((FeatureMask & (XSTATE_MASK_AVX | XSTATE_MASK_AVX512)) != 0)
{
context = context | CONTEXT_XSTATE;
}

// Retrieve contextSize by passing NULL for Buffer
DWORD contextSize = 0;
ULONG64 xStateCompactionMask = XSTATE_MASK_LEGACY | XSTATE_MASK_AVX;
ULONG64 xStateCompactionMask = XSTATE_MASK_LEGACY | XSTATE_MASK_AVX | XSTATE_MASK_MPX | XSTATE_MASK_AVX512;
// The initialize call should fail but return contextSize
BOOL success = pfnInitializeContext2 ?
pfnInitializeContext2(NULL, context, NULL, &contextSize, xStateCompactionMask) :
Expand Down Expand Up @@ -426,9 +426,9 @@ REDHAWK_PALEXPORT _Success_(return) bool REDHAWK_PALAPI PalGetCompleteThreadCont
#if defined(TARGET_X86) || defined(TARGET_AMD64)
// Make sure that the AVX and AVX512 feature masks are set, if supported. This should not normally fail.
// The system silently ignores any feature specified in the FeatureMask which is not enabled on the processor.
if (!SetXStateFeaturesMask(pCtx, XSTATE_MASK_AVX))
if (!SetXStateFeaturesMask(pCtx, XSTATE_MASK_AVX | XSTATE_MASK_AVX512))
{
_ASSERTE(!"Could not apply XSTATE_MASK_AVX");
_ASSERTE(!"Could not apply XSTATE_MASK_AVX | XSTATE_MASK_AVX512");
return FALSE;
}
#endif //defined(TARGET_X86) || defined(TARGET_AMD64)
Expand Down
136 changes: 136 additions & 0 deletions src/coreclr/pal/inc/pal.h
Original file line number Diff line number Diff line change
Expand Up @@ -1315,6 +1315,43 @@ QueueUserAPC(
IN HANDLE hThread,
IN ULONG_PTR dwData);

#if defined(HOST_X86) || defined(HOST_AMD64)
// MSVC directly defines intrinsics for __cpuid and __cpuidex matching the below signatures
// We define matching signatures for use on Unix platforms.
//
// IMPORTANT: Unlike MSVC, Unix does not explicitly zero ECX for __cpuid

#if __has_builtin(__cpuid)
janvorli marked this conversation as resolved.
Show resolved Hide resolved
extern "C" void __cpuid(int cpuInfo[4], int function_id);
#else
// Executes the CPUID instruction for the leaf given by function_id and stores
// the resulting EAX, EBX, ECX and EDX values in cpuInfo[0..3], in that order.
//
// Only EAX is supplied as an input, so ECX keeps whatever value it had before
// the call. For leaves whose result depends on a sub-leaf in ECX, use
// __cpuidex and pass the sub-leaf explicitly.
inline void __cpuid(int cpuInfo[4], int function_id)
{
// Based on the Clang implementation provided in cpuid.h:
// https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/cpuid.h

// Outputs: "=a"/"=b"/"=c"/"=d" bind EAX/EBX/ECX/EDX to cpuInfo[0..3].
// Input: "0" ties function_id to operand 0, i.e. the same register as EAX.
__asm(" cpuid\n" \
: "=a"(cpuInfo[0]), "=b"(cpuInfo[1]), "=c"(cpuInfo[2]), "=d"(cpuInfo[3]) \
: "0"(function_id)
);
}
tannergooding marked this conversation as resolved.
Show resolved Hide resolved
#endif // __cpuid

#if __has_builtin(__cpuidex)
extern "C" void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id);
#else
// Executes the CPUID instruction for the leaf given by function_id and the
// sub-leaf given by subFunction_id, storing the resulting EAX, EBX, ECX and EDX
// values in cpuInfo[0..3], in that order.
inline void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id)
{
// Based on the Clang implementation provided in cpuid.h:
// https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/cpuid.h

// Outputs: "=a"/"=b"/"=c"/"=d" bind EAX/EBX/ECX/EDX to cpuInfo[0..3].
// Inputs: "0" ties function_id to operand 0 (EAX); "2" ties subFunction_id
// to operand 2 (ECX).
__asm(" cpuid\n" \
: "=a"(cpuInfo[0]), "=b"(cpuInfo[1]), "=c"(cpuInfo[2]), "=d"(cpuInfo[3]) \
: "0"(function_id), "2"(subFunction_id)
);
}
#endif // __cpuidex
#endif // HOST_X86 || HOST_AMD64

#ifdef HOST_X86

//
Expand Down Expand Up @@ -1461,6 +1498,7 @@ typedef struct _KNONVOLATILE_CONTEXT_POINTERS {
//

#elif defined(HOST_AMD64)

// copied from winnt.h

#define CONTEXT_AMD64 0x100000
Expand All @@ -1482,11 +1520,33 @@ typedef struct _KNONVOLATILE_CONTEXT_POINTERS {
#define CONTEXT_EXCEPTION_REQUEST 0x40000000
#define CONTEXT_EXCEPTION_REPORTING 0x80000000

// XSTATE feature indices. Each value is the bit position of the feature in the
// XSTATE_MASK_* bitmasks defined below. XSTATE_AVX is an alias of XSTATE_GSSE.
#define XSTATE_GSSE (2)
#define XSTATE_AVX (XSTATE_GSSE)
#define XSTATE_AVX512_KMASK (5)
#define XSTATE_AVX512_ZMM_H (6)
#define XSTATE_AVX512_ZMM (7)

// XSTATE feature bitmasks, built by shifting UI64(1) left by the feature index.
// XSTATE_MASK_AVX is bit 2 (0x4); XSTATE_MASK_AVX512 combines bits 5, 6 and 7
// (0xE0), i.e. the KMASK, ZMM_H and ZMM features together.
#define XSTATE_MASK_GSSE (UI64(1) << (XSTATE_GSSE))
#define XSTATE_MASK_AVX (XSTATE_MASK_GSSE)
#define XSTATE_MASK_AVX512 ((UI64(1) << (XSTATE_AVX512_KMASK)) | \
(UI64(1) << (XSTATE_AVX512_ZMM_H)) | \
(UI64(1) << (XSTATE_AVX512_ZMM)))

// Value split into a Low and a High half, declared with DECLSPEC_ALIGN(16).
// Used as the building block of M256 and for the Ymm*H fields of CONTEXT.
// NOTE(review): presumably 128 bits total (two 64-bit halves) -- confirm the
// widths of ULONGLONG/LONGLONG on the target.
typedef struct DECLSPEC_ALIGN(16) _M128A {
ULONGLONG Low;
LONGLONG High;
} M128A, *PM128A;

// Value made of two M128A halves (Low, High), declared with DECLSPEC_ALIGN(16).
// Used as the building block of M512 and for the Zmm0H..Zmm15H fields of CONTEXT.
typedef struct DECLSPEC_ALIGN(16) _M256 {
M128A Low;
M128A High;
} M256, *PM256;

// Value made of two M256 halves (Low, High), declared with DECLSPEC_ALIGN(16).
// Used for the Zmm16..Zmm31 fields of CONTEXT.
typedef struct DECLSPEC_ALIGN(16) _M512 {
M256 Low;
M256 High;
} M512, *PM512;

typedef struct _XMM_SAVE_AREA32 {
WORD ControlWord;
WORD StatusWord;
Expand Down Expand Up @@ -1623,6 +1683,82 @@ typedef struct DECLSPEC_ALIGN(16) _CONTEXT {
DWORD64 LastBranchFromRip;
DWORD64 LastExceptionToRip;
DWORD64 LastExceptionFromRip;

// XSTATE
DWORD64 XStateFeaturesMask;
DWORD64 XStateReserved0;

// XSTATE_AVX
struct {
M128A Ymm0H;
M128A Ymm1H;
M128A Ymm2H;
M128A Ymm3H;
M128A Ymm4H;
M128A Ymm5H;
M128A Ymm6H;
M128A Ymm7H;
M128A Ymm8H;
M128A Ymm9H;
M128A Ymm10H;
M128A Ymm11H;
M128A Ymm12H;
M128A Ymm13H;
M128A Ymm14H;
M128A Ymm15H;
};

// XSTATE_AVX512_KMASK
struct {
DWORD64 KMask0;
DWORD64 KMask1;
DWORD64 KMask2;
DWORD64 KMask3;
DWORD64 KMask4;
DWORD64 KMask5;
DWORD64 KMask6;
DWORD64 KMask7;
};

// XSTATE_AVX512_ZMM_H
struct {
M256 Zmm0H;
M256 Zmm1H;
M256 Zmm2H;
M256 Zmm3H;
M256 Zmm4H;
M256 Zmm5H;
M256 Zmm6H;
M256 Zmm7H;
M256 Zmm8H;
M256 Zmm9H;
M256 Zmm10H;
M256 Zmm11H;
M256 Zmm12H;
M256 Zmm13H;
M256 Zmm14H;
M256 Zmm15H;
};

// XSTATE_AVX512_ZMM
struct {
M512 Zmm16;
M512 Zmm17;
M512 Zmm18;
M512 Zmm19;
M512 Zmm20;
M512 Zmm21;
M512 Zmm22;
M512 Zmm23;
M512 Zmm24;
M512 Zmm25;
M512 Zmm26;
M512 Zmm27;
M512 Zmm28;
M512 Zmm29;
M512 Zmm30;
M512 Zmm31;
};
} CONTEXT, *PCONTEXT, *LPCONTEXT;

//
Expand Down
28 changes: 23 additions & 5 deletions src/coreclr/pal/src/arch/amd64/asmconstants.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,18 @@

#ifdef HOST_64BIT

// XSTATE feature indices. Each value is the bit position of the feature in the
// XSTATE_MASK_* bitmasks defined below. XSTATE_AVX is an alias of XSTATE_GSSE.
#define XSTATE_GSSE (2)
#define XSTATE_AVX (XSTATE_GSSE)
#define XSTATE_AVX512_KMASK (5)
#define XSTATE_AVX512_ZMM_H (6)
#define XSTATE_AVX512_ZMM (7)

// XSTATE feature bitmasks built from plain integer literals.
// XSTATE_MASK_AVX is bit 2 (0x4); XSTATE_MASK_AVX512 combines bits 5, 6 and 7 (0xE0).
// NOTE(review): presumably plain `1 <<` is used (rather than a typed 64-bit
// literal) so the expressions are usable from assembly sources -- confirm.
#define XSTATE_MASK_GSSE (1 << (XSTATE_GSSE))
#define XSTATE_MASK_AVX (XSTATE_MASK_GSSE)
#define XSTATE_MASK_AVX512 ((1 << (XSTATE_AVX512_KMASK)) | \
(1 << (XSTATE_AVX512_ZMM_H)) | \
(1 << (XSTATE_AVX512_ZMM)))

// The arch bit is normally set in the flag constants below. Since this is already arch-specific code and the arch bit is not
// relevant, the arch bit is excluded from the flag constants below for simpler tests.
#define CONTEXT_AMD64 0x100000
Expand All @@ -17,7 +29,7 @@

#define CONTEXT_XSTATE 64

#define CONTEXT_ContextFlags 6*8
#define CONTEXT_ContextFlags (6*8)
#define CONTEXT_SegCs CONTEXT_ContextFlags+8
#define CONTEXT_SegDs CONTEXT_SegCs+2
#define CONTEXT_SegEs CONTEXT_SegDs+2
Expand Down Expand Up @@ -49,8 +61,8 @@
#define CONTEXT_R15 CONTEXT_R14+8
#define CONTEXT_Rip CONTEXT_R15+8
#define CONTEXT_FltSave CONTEXT_Rip+8
#define FLOATING_SAVE_AREA_SIZE 4*8+24*16+96
#define CONTEXT_Xmm0 CONTEXT_FltSave+10*16
#define FLOATING_SAVE_AREA_SIZE (4*8)+(24*16)+96
#define CONTEXT_Xmm0 CONTEXT_FltSave+(10*16)
#define CONTEXT_Xmm1 CONTEXT_Xmm0+16
#define CONTEXT_Xmm2 CONTEXT_Xmm1+16
#define CONTEXT_Xmm3 CONTEXT_Xmm2+16
Expand All @@ -67,13 +79,19 @@
#define CONTEXT_Xmm14 CONTEXT_Xmm13+16
#define CONTEXT_Xmm15 CONTEXT_Xmm14+16
#define CONTEXT_VectorRegister CONTEXT_FltSave+FLOATING_SAVE_AREA_SIZE
#define CONTEXT_VectorControl CONTEXT_VectorRegister+16*26
#define CONTEXT_VectorControl CONTEXT_VectorRegister+(16*26)
#define CONTEXT_DebugControl CONTEXT_VectorControl+8
#define CONTEXT_LastBranchToRip CONTEXT_DebugControl+8
#define CONTEXT_LastBranchFromRip CONTEXT_LastBranchToRip+8
#define CONTEXT_LastExceptionToRip CONTEXT_LastBranchFromRip+8
#define CONTEXT_LastExceptionFromRip CONTEXT_LastExceptionToRip+8
#define CONTEXT_Size CONTEXT_LastExceptionFromRip+8
// Byte offsets of the XSTATE fields that follow LastExceptionFromRip in CONTEXT.
// Each offset is the previous offset plus the size of the preceding field or block:
//   XStateFeaturesMask, XStateReserved0 : 8 bytes each
//   Ymm0H..Ymm15H                       : 16*16 bytes
//   KMask0..KMask7                      : 8*8 bytes
//   Zmm0H..Zmm15H                       : 32*16 bytes
//   Zmm16..Zmm31                        : 64*16 bytes
// CONTEXT_Size is the offset just past Zmm31, i.e. the total size described here.
// NOTE(review): these must be kept in sync with the CONTEXT declaration in pal.h.
// NOTE(review): the expansions are not wrapped in parentheses, so they are only
// safe in purely additive expressions.
#define CONTEXT_XStateFeaturesMask CONTEXT_LastExceptionFromRip+8
#define CONTEXT_XStateReserved0 CONTEXT_XStateFeaturesMask+8
#define CONTEXT_Ymm0H CONTEXT_XStateReserved0+8
#define CONTEXT_KMask0 CONTEXT_Ymm0H+(16*16)
#define CONTEXT_Zmm0H CONTEXT_KMask0+(8*8)
#define CONTEXT_Zmm16 CONTEXT_Zmm0H+(32*16)
#define CONTEXT_Size CONTEXT_Zmm16+(64*16)

#else // HOST_64BIT

Expand Down
Loading