From aa6e6b23cc29f6e40c0e8201d6de0423ef0dbc83 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Wed, 22 Mar 2023 09:23:46 -0700 Subject: [PATCH 01/16] Updating Unix to save/restore Avx512 state --- .../Runtime/windows/PalRedhawkMinWin.cpp | 10 +- src/coreclr/pal/inc/pal.h | 113 +++++++++++ src/coreclr/pal/src/arch/amd64/asmconstants.h | 22 +- src/coreclr/pal/src/arch/amd64/context2.S | 84 ++++++-- src/coreclr/pal/src/include/pal/context.h | 190 ++++++++++++++++-- src/coreclr/pal/src/thread/context.cpp | 145 ++++++++++++- src/coreclr/vm/amd64/asmconstants.h | 4 +- src/coreclr/vm/amd64/unixstubs.cpp | 26 --- src/coreclr/vm/cgensys.h | 7 - src/coreclr/vm/i386/cgenx86.cpp | 26 --- 10 files changed, 514 insertions(+), 113 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkMinWin.cpp b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkMinWin.cpp index 1d2b9766b4fdc0..b110c8f38accec 100644 --- a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkMinWin.cpp +++ b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkMinWin.cpp @@ -365,17 +365,17 @@ REDHAWK_PALEXPORT CONTEXT* PalAllocateCompleteOSContext(_Out_ uint8_t** contextB } #endif //TARGET_X86 - // Determine if the processor supports AVX so we could + // Determine if the processor supports AVX or AVX512 so we could // retrieve extended registers DWORD64 FeatureMask = GetEnabledXStateFeatures(); - if ((FeatureMask & XSTATE_MASK_AVX) != 0) + if ((FeatureMask & (XSTATE_MASK_AVX | XSTATE_MASK_AVX512)) != 0) { context = context | CONTEXT_XSTATE; } // Retrieve contextSize by passing NULL for Buffer DWORD contextSize = 0; - ULONG64 xStateCompactionMask = XSTATE_MASK_LEGACY | XSTATE_MASK_AVX; + ULONG64 xStateCompactionMask = XSTATE_MASK_LEGACY | XSTATE_MASK_AVX | XSTATE_MASK_MPX | XSTATE_MASK_AVX512; // The initialize call should fail but return contextSize BOOL success = pfnInitializeContext2 ? pfnInitializeContext2(NULL, context, NULL, &contextSize, xStateCompactionMask) : @@ -426,9 +426,9 @@ REDHAWK_PALEXPORT _Success_(return) bool REDHAWK_PALAPI PalGetCompleteThreadCont #if defined(TARGET_X86) || defined(TARGET_AMD64) // Make sure that AVX feature mask is set, if supported. This should not normally fail. // The system silently ignores any feature specified in the FeatureMask which is not enabled on the processor. - if (!SetXStateFeaturesMask(pCtx, XSTATE_MASK_AVX)) + if (!SetXStateFeaturesMask(pCtx, XSTATE_MASK_AVX | XSTATE_MASK_AVX512)) { - _ASSERTE(!"Could not apply XSTATE_MASK_AVX"); + _ASSERTE(!"Could not apply XSTATE_MASK_AVX | XSTATE_MASK_AVX512"); return FALSE; } #endif //defined(TARGET_X86) || defined(TARGET_AMD64) diff --git a/src/coreclr/pal/inc/pal.h b/src/coreclr/pal/inc/pal.h index 88dcd4e8d77f7a..d89734c12bcf1f 100644 --- a/src/coreclr/pal/inc/pal.h +++ b/src/coreclr/pal/inc/pal.h @@ -1317,6 +1317,12 @@ QueueUserAPC( #ifdef HOST_X86 +// MSVC directly defines intrinsics for __cpuid and __cpuidex matching the below signatures +// We define matching signatures for use on Unix platforms. + +extern "C" void __cpuid(int cpuInfo[4], int function_id); +extern "C" void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id); + // // *********************************************************************************** // @@ -1461,6 +1467,13 @@ typedef struct _KNONVOLATILE_CONTEXT_POINTERS { // #elif defined(HOST_AMD64) + +// MSVC directly defines intrinsics for __cpuid and __cpuidex matching the below signatures +// We define matching signatures for use on Unix platforms. 
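A note on how these shims get used: later in this patch the PAL queries CPUID leaf 0xD through them to locate each XSAVE component. A minimal sketch of that query pattern follows; the helper name and variables are illustrative only and are not part of the patch:

    #include <stdint.h>

    // Hypothetical helper: reports where XSAVE component `featureIndex` lives.
    // CPUID leaf 0xD, sub-leaf N returns the component's size in EAX and its
    // offset from the start of the non-compacted XSAVE area in EBX.
    static void GetXsaveComponentInfo(uint32_t featureIndex, uint32_t* size, uint32_t* offset)
    {
        int cpuidInfo[4];
        __cpuidex(cpuidInfo, 0x0000000D, (int)featureIndex);
        *size   = (uint32_t)cpuidInfo[0]; // EAX
        *offset = (uint32_t)cpuidInfo[1]; // EBX
    }

For example, featureIndex 2 (the AVX component) reports the 256-byte block holding the Ymm upper halves.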
+ +extern "C" void __cpuid(int cpuInfo[4], int function_id); +extern "C" void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id); + // copied from winnt.h #define CONTEXT_AMD64 0x100000 @@ -1482,11 +1495,33 @@ typedef struct _KNONVOLATILE_CONTEXT_POINTERS { #define CONTEXT_EXCEPTION_REQUEST 0x40000000 #define CONTEXT_EXCEPTION_REPORTING 0x80000000 +#define XSTATE_GSSE (2) +#define XSTATE_AVX (XSTATE_GSSE) +#define XSTATE_AVX512_KMASK (5) +#define XSTATE_AVX512_ZMM_H (6) +#define XSTATE_AVX512_ZMM (7) + +#define XSTATE_MASK_GSSE (1ui64 << (XSTATE_GSSE)) +#define XSTATE_MASK_AVX (XSTATE_MASK_GSSE) +#define XSTATE_MASK_AVX512 ((1ui64 << (XSTATE_AVX512_KMASK)) | \ + (1ui64 << (XSTATE_AVX512_ZMM_H)) | \ + (1ui64 << (XSTATE_AVX512_ZMM))) + typedef struct DECLSPEC_ALIGN(16) _M128A { ULONGLONG Low; LONGLONG High; } M128A, *PM128A; +typedef struct DECLSPEC_ALIGN(32) _M256A { + M128A Low; + M128A High; +} M256A, *PM256A; + +typedef struct DECLSPEC_ALIGN(64) _M512A { + M256A Low; + M256A High; +} M512A, *PM512A; + typedef struct _XMM_SAVE_AREA32 { WORD ControlWord; WORD StatusWord; @@ -1623,6 +1658,84 @@ typedef struct DECLSPEC_ALIGN(16) _CONTEXT { DWORD64 LastBranchFromRip; DWORD64 LastExceptionToRip; DWORD64 LastExceptionFromRip; + + // XSTATE + DWORD64 XStateFeaturesMask; + DWORD64 XStateReserved0; + + // XSTATE_AVX + struct { + M128A Ymm0H; + M128A Ymm1H; + M128A Ymm2H; + M128A Ymm3H; + M128A Ymm4H; + M128A Ymm5H; + M128A Ymm6H; + M128A Ymm7H; + M128A Ymm8H; + M128A Ymm9H; + M128A Ymm10H; + M128A Ymm11H; + M128A Ymm12H; + M128A Ymm13H; + M128A Ymm14H; + M128A Ymm15H; + }; + + // XSTATE_AVX512_KMASK + struct { + DWORD64 KMask0; + DWORD64 KMask1; + DWORD64 KMask2; + DWORD64 KMask3; + DWORD64 KMask4; + DWORD64 KMask5; + DWORD64 KMask6; + DWORD64 KMask7; + }; + + // XSTATE_AVX512_ZMM_H + struct { + M256A Zmm0H; + M256A Zmm1H; + M256A Zmm2H; + M256A Zmm3H; + M256A Zmm4H; + M256A Zmm5H; + M256A Zmm6H; + M256A Zmm7H; + M256A Zmm8H; + M256A Zmm9H; + M256A Zmm10H; + M256A Zmm11H; + M256A Zmm12H; + M256A Zmm13H; + M256A Zmm14H; + M256A Zmm15H; + }; + + DWORD64 XStateReserved1[4]; + + // XSTATE_AVX512_ZMM + struct { + M512A Zmm16; + M512A Zmm17; + M512A Zmm18; + M512A Zmm19; + M512A Zmm20; + M512A Zmm21; + M512A Zmm22; + M512A Zmm23; + M512A Zmm24; + M512A Zmm25; + M512A Zmm26; + M512A Zmm27; + M512A Zmm28; + M512A Zmm29; + M512A Zmm30; + M512A Zmm31; + }; } CONTEXT, *PCONTEXT, *LPCONTEXT; // diff --git a/src/coreclr/pal/src/arch/amd64/asmconstants.h b/src/coreclr/pal/src/arch/amd64/asmconstants.h index c23fb7043c77d5..b40d26ce6c9b2c 100644 --- a/src/coreclr/pal/src/arch/amd64/asmconstants.h +++ b/src/coreclr/pal/src/arch/amd64/asmconstants.h @@ -3,6 +3,11 @@ #ifdef HOST_64BIT +#define XFEATURE_MASK_OPMASK (1 << 5) +#define XFEATURE_MASK_ZMM_Hi256 (1 << 6) +#define XFEATURE_MASK_Hi16_ZMM (1 << 7) +#define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK | XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM) + // The arch bit is normally set in the flag constants below. Since this is already arch-specific code and the arch bit is not // relevant, the arch bit is excluded from the flag constants below for simpler tests. 
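Why dropping the arch bit matters: every flag then fits in the low byte of ContextFlags, so the assembly can check a flag with a single-byte compare. For illustration (the winnt.h value shown for comparison is not part of this patch text):

    // winnt.h:     CONTEXT_XSTATE == (CONTEXT_AMD64 | 0x40) == 0x100040
    // this header: CONTEXT_XSTATE == 64 == 0x40, which fits in one byte:
    //     test BYTE PTR [rdi + CONTEXT_ContextFlags], CONTEXT_XSTATE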
#define CONTEXT_AMD64 0x100000 @@ -17,7 +22,7 @@ #define CONTEXT_XSTATE 64 -#define CONTEXT_ContextFlags 6*8 +#define CONTEXT_ContextFlags (6*8) #define CONTEXT_SegCs CONTEXT_ContextFlags+8 #define CONTEXT_SegDs CONTEXT_SegCs+2 #define CONTEXT_SegEs CONTEXT_SegDs+2 @@ -49,8 +54,8 @@ #define CONTEXT_R15 CONTEXT_R14+8 #define CONTEXT_Rip CONTEXT_R15+8 #define CONTEXT_FltSave CONTEXT_Rip+8 -#define FLOATING_SAVE_AREA_SIZE 4*8+24*16+96 -#define CONTEXT_Xmm0 CONTEXT_FltSave+10*16 +#define FLOATING_SAVE_AREA_SIZE (4*8)+(24*16)+96 +#define CONTEXT_Xmm0 CONTEXT_FltSave+(10*16) #define CONTEXT_Xmm1 CONTEXT_Xmm0+16 #define CONTEXT_Xmm2 CONTEXT_Xmm1+16 #define CONTEXT_Xmm3 CONTEXT_Xmm2+16 @@ -67,13 +72,20 @@ #define CONTEXT_Xmm14 CONTEXT_Xmm13+16 #define CONTEXT_Xmm15 CONTEXT_Xmm14+16 #define CONTEXT_VectorRegister CONTEXT_FltSave+FLOATING_SAVE_AREA_SIZE -#define CONTEXT_VectorControl CONTEXT_VectorRegister+16*26 +#define CONTEXT_VectorControl CONTEXT_VectorRegister+(16*26) #define CONTEXT_DebugControl CONTEXT_VectorControl+8 #define CONTEXT_LastBranchToRip CONTEXT_DebugControl+8 #define CONTEXT_LastBranchFromRip CONTEXT_LastBranchToRip+8 #define CONTEXT_LastExceptionToRip CONTEXT_LastBranchFromRip+8 #define CONTEXT_LastExceptionFromRip CONTEXT_LastExceptionToRip+8 -#define CONTEXT_Size CONTEXT_LastExceptionFromRip+8 +#define CONTEXT_XStateFeaturesMask CONTEXT_LastExceptionFromRip+8 +#define CONTEXT_XStateReserved0 CONTEXT_XStateFeaturesMask+8 +#define CONTEXT_Ymm0H CONTEXT_XStateReserved0+8 +#define CONTEXT_KMask0 CONTEXT_Ymm0H+(16*16) +#define CONTEXT_Zmm0H CONTEXT_Ymm0H+(8*8) +#define CONTEXT_XStateReserved1 CONTEXT_Zmm0H+(32*16) +#define CONTEXT_Zmm16 CONTEXT_XStateReserved1+(8*4) +#define CONTEXT_Size CONTEXT_Zmm16+(64*16) #else // HOST_64BIT diff --git a/src/coreclr/pal/src/arch/amd64/context2.S b/src/coreclr/pal/src/arch/amd64/context2.S index c8688dd63c0946..a35b7365fc78f3 100644 --- a/src/coreclr/pal/src/arch/amd64/context2.S +++ b/src/coreclr/pal/src/arch/amd64/context2.S @@ -107,23 +107,73 @@ LOCAL_LABEL(Done_Restore_CONTEXT_FLOATING_POINT): test BYTE PTR [rdi + CONTEXT_ContextFlags], CONTEXT_XSTATE je LOCAL_LABEL(Done_Restore_CONTEXT_XSTATE) - // Restore the extended state (for now, this is just the upper halves of YMM registers) - vinsertf128 ymm0, ymm0, xmmword ptr [rdi + (CONTEXT_VectorRegister + 0 * 16)], 1 - vinsertf128 ymm1, ymm1, xmmword ptr [rdi + (CONTEXT_VectorRegister + 1 * 16)], 1 - vinsertf128 ymm2, ymm2, xmmword ptr [rdi + (CONTEXT_VectorRegister + 2 * 16)], 1 - vinsertf128 ymm3, ymm3, xmmword ptr [rdi + (CONTEXT_VectorRegister + 3 * 16)], 1 - vinsertf128 ymm4, ymm4, xmmword ptr [rdi + (CONTEXT_VectorRegister + 4 * 16)], 1 - vinsertf128 ymm5, ymm5, xmmword ptr [rdi + (CONTEXT_VectorRegister + 5 * 16)], 1 - vinsertf128 ymm6, ymm6, xmmword ptr [rdi + (CONTEXT_VectorRegister + 6 * 16)], 1 - vinsertf128 ymm7, ymm7, xmmword ptr [rdi + (CONTEXT_VectorRegister + 7 * 16)], 1 - vinsertf128 ymm8, ymm8, xmmword ptr [rdi + (CONTEXT_VectorRegister + 8 * 16)], 1 - vinsertf128 ymm9, ymm9, xmmword ptr [rdi + (CONTEXT_VectorRegister + 9 * 16)], 1 - vinsertf128 ymm10, ymm10, xmmword ptr [rdi + (CONTEXT_VectorRegister + 10 * 16)], 1 - vinsertf128 ymm11, ymm11, xmmword ptr [rdi + (CONTEXT_VectorRegister + 11 * 16)], 1 - vinsertf128 ymm12, ymm12, xmmword ptr [rdi + (CONTEXT_VectorRegister + 12 * 16)], 1 - vinsertf128 ymm13, ymm13, xmmword ptr [rdi + (CONTEXT_VectorRegister + 13 * 16)], 1 - vinsertf128 ymm14, ymm14, xmmword ptr [rdi + (CONTEXT_VectorRegister + 14 * 16)], 1 - vinsertf128 ymm15, 
ymm15, xmmword ptr [rdi + (CONTEXT_VectorRegister + 15 * 16)], 1 + // Restore the YMM state + vinsertf128 ymm0, ymm0, xmmword ptr [rdi + (CONTEXT_Ymm0H + 0 * 16)], 1 + vinsertf128 ymm1, ymm1, xmmword ptr [rdi + (CONTEXT_Ymm0H + 1 * 16)], 1 + vinsertf128 ymm2, ymm2, xmmword ptr [rdi + (CONTEXT_Ymm0H + 2 * 16)], 1 + vinsertf128 ymm3, ymm3, xmmword ptr [rdi + (CONTEXT_Ymm0H + 3 * 16)], 1 + vinsertf128 ymm4, ymm4, xmmword ptr [rdi + (CONTEXT_Ymm0H + 4 * 16)], 1 + vinsertf128 ymm5, ymm5, xmmword ptr [rdi + (CONTEXT_Ymm0H + 5 * 16)], 1 + vinsertf128 ymm6, ymm6, xmmword ptr [rdi + (CONTEXT_Ymm0H + 6 * 16)], 1 + vinsertf128 ymm7, ymm7, xmmword ptr [rdi + (CONTEXT_Ymm0H + 7 * 16)], 1 + vinsertf128 ymm8, ymm8, xmmword ptr [rdi + (CONTEXT_Ymm0H + 8 * 16)], 1 + vinsertf128 ymm9, ymm9, xmmword ptr [rdi + (CONTEXT_Ymm0H + 9 * 16)], 1 + vinsertf128 ymm10, ymm10, xmmword ptr [rdi + (CONTEXT_Ymm0H + 10 * 16)], 1 + vinsertf128 ymm11, ymm11, xmmword ptr [rdi + (CONTEXT_Ymm0H + 11 * 16)], 1 + vinsertf128 ymm12, ymm12, xmmword ptr [rdi + (CONTEXT_Ymm0H + 12 * 16)], 1 + vinsertf128 ymm13, ymm13, xmmword ptr [rdi + (CONTEXT_Ymm0H + 13 * 16)], 1 + vinsertf128 ymm14, ymm14, xmmword ptr [rdi + (CONTEXT_Ymm0H + 14 * 16)], 1 + vinsertf128 ymm15, ymm15, xmmword ptr [rdi + (CONTEXT_Ymm0H + 15 * 16)], 1 + + test BYTE PTR [rdi + CONTEXT_XStateFeaturesMask], XFEATURE_MASK_AVX512 + je LOCAL_LABEL(Done_Restore_CONTEXT_XSTATE) + + // Restore the Opmask state + kmovq k0, qword ptr [rdi + (CONTEXT_KMask0 + 0 * 8)] + kmovq k1, qword ptr [rdi + (CONTEXT_KMask0 + 1 * 8)] + kmovq k2, qword ptr [rdi + (CONTEXT_KMask0 + 2 * 8)] + kmovq k3, qword ptr [rdi + (CONTEXT_KMask0 + 3 * 8)] + kmovq k4, qword ptr [rdi + (CONTEXT_KMask0 + 4 * 8)] + kmovq k5, qword ptr [rdi + (CONTEXT_KMask0 + 5 * 8)] + kmovq k6, qword ptr [rdi + (CONTEXT_KMask0 + 6 * 8)] + kmovq k7, qword ptr [rdi + (CONTEXT_KMask0 + 7 * 8)] + + // Restore the ZMM_Hi256 state + vinsertf64x4 zmm0, zmm0, ymmword ptr [rdi + (CONTEXT_Zmm0H + 0 * 32)], 1 + vinsertf64x4 zmm1, zmm1, ymmword ptr [rdi + (CONTEXT_Zmm0H + 1 * 32)], 1 + vinsertf64x4 zmm2, zmm2, ymmword ptr [rdi + (CONTEXT_Zmm0H + 2 * 32)], 1 + vinsertf64x4 zmm3, zmm3, ymmword ptr [rdi + (CONTEXT_Zmm0H + 3 * 32)], 1 + vinsertf64x4 zmm4, zmm4, ymmword ptr [rdi + (CONTEXT_Zmm0H + 4 * 32)], 1 + vinsertf64x4 zmm5, zmm5, ymmword ptr [rdi + (CONTEXT_Zmm0H + 5 * 32)], 1 + vinsertf64x4 zmm6, zmm6, ymmword ptr [rdi + (CONTEXT_Zmm0H + 6 * 32)], 1 + vinsertf64x4 zmm7, zmm7, ymmword ptr [rdi + (CONTEXT_Zmm0H + 7 * 32)], 1 + vinsertf64x4 zmm8, zmm8, ymmword ptr [rdi + (CONTEXT_Zmm0H + 8 * 32)], 1 + vinsertf64x4 zmm9, zmm9, ymmword ptr [rdi + (CONTEXT_Zmm0H + 9 * 32)], 1 + vinsertf64x4 zmm10, zmm10, ymmword ptr [rdi + (CONTEXT_Zmm0H + 10 * 32)], 1 + vinsertf64x4 zmm11, zmm11, ymmword ptr [rdi + (CONTEXT_Zmm0H + 11 * 32)], 1 + vinsertf64x4 zmm12, zmm12, ymmword ptr [rdi + (CONTEXT_Zmm0H + 12 * 32)], 1 + vinsertf64x4 zmm13, zmm13, ymmword ptr [rdi + (CONTEXT_Zmm0H + 13 * 32)], 1 + vinsertf64x4 zmm14, zmm14, ymmword ptr [rdi + (CONTEXT_Zmm0H + 14 * 32)], 1 + vinsertf64x4 zmm15, zmm15, ymmword ptr [rdi + (CONTEXT_Zmm0H + 15 * 32)], 1 + + // Restore the Hi16_ZMM state + vmovups zmm16, zmmword ptr [rdi + (CONTEXT_Zmm16 + 0 * 64)] + vmovups zmm17, zmmword ptr [rdi + (CONTEXT_Zmm16 + 1 * 64)] + vmovups zmm18, zmmword ptr [rdi + (CONTEXT_Zmm16 + 2 * 64)] + vmovups zmm19, zmmword ptr [rdi + (CONTEXT_Zmm16 + 3 * 64)] + vmovups zmm20, zmmword ptr [rdi + (CONTEXT_Zmm16 + 4 * 64)] + vmovups zmm21, zmmword ptr [rdi + (CONTEXT_Zmm16 + 5 * 64)] + 
vmovups zmm22, zmmword ptr [rdi + (CONTEXT_Zmm16 + 6 * 64)] + vmovups zmm23, zmmword ptr [rdi + (CONTEXT_Zmm16 + 7 * 64)] + vmovups zmm24, zmmword ptr [rdi + (CONTEXT_Zmm16 + 8 * 64)] + vmovups zmm25, zmmword ptr [rdi + (CONTEXT_Zmm16 + 9 * 64)] + vmovups zmm26, zmmword ptr [rdi + (CONTEXT_Zmm16 + 10 * 64)] + vmovups zmm27, zmmword ptr [rdi + (CONTEXT_Zmm16 + 11 * 64)] + vmovups zmm28, zmmword ptr [rdi + (CONTEXT_Zmm16 + 12 * 64)] + vmovups zmm29, zmmword ptr [rdi + (CONTEXT_Zmm16 + 13 * 64)] + vmovups zmm30, zmmword ptr [rdi + (CONTEXT_Zmm16 + 14 * 64)] + vmovups zmm31, zmmword ptr [rdi + (CONTEXT_Zmm16 + 15 * 64)] + LOCAL_LABEL(Done_Restore_CONTEXT_XSTATE): test BYTE PTR [rdi + CONTEXT_ContextFlags], CONTEXT_CONTROL diff --git a/src/coreclr/pal/src/include/pal/context.h b/src/coreclr/pal/src/include/pal/context.h index c702ae272a76bd..0e0346c2ebfd05 100644 --- a/src/coreclr/pal/src/include/pal/context.h +++ b/src/coreclr/pal/src/include/pal/context.h @@ -354,13 +354,57 @@ using asm_sigcontext::_xstate; #define FPSTATE_RESERVED padding #endif -// The mask for YMM registers presence flag stored in the xfeatures (formerly xstate_bv). On current Linuxes, this definition is -// only in internal headers, so we define it here. The xfeatures (formerly xstate_bv) is extracted from the processor xstate bit -// vector register, so the value is OS independent. -#ifndef XSTATE_YMM -#define XSTATE_YMM 4 +// Presence for various extended state registers is detected via the xfeatures (formerly xstate_bv) field. On some +// Linux distros, this definition is only in internal headers, so we define it here. The masks are extracted from +// the processor xstate bit vector register, so the value is OS independent. + +#ifndef XFEATURE_MASK_YMM +#define XFEATURE_MASK_YMM (1 << 2) +#endif // XFEATURE_MASK_YMM + +#ifndef XFEATURE_MASK_OPMASK +#define XFEATURE_MASK_OPMASK (1 << 5) +#endif // XFEATURE_MASK_OPMASK + +#ifndef XFEATURE_MASK_ZMM_Hi256 +#define XFEATURE_MASK_ZMM_Hi256 (1 << 6) +#endif // XFEATURE_MASK_ZMM_Hi256 + +#ifndef XFEATURE_MASK_Hi16_ZMM +#define XFEATURE_MASK_Hi16_ZMM (1 << 7) +#endif // XFEATURE_MASK_Hi16_ZMM + +#ifndef XFEATURE_MASK_AVX512 +#define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK | XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM) +#endif // XFEATURE_MASK_AVX512 + +#if HAVE__FPX_SW_BYTES_WITH_XSTATE_BV +#define FPREG_FpxSwBytes_xfeatures(uc) FPREG_FpxSwBytes(uc)->xstate_bv +#else +#define FPREG_FpxSwBytes_xfeatures(uc) FPREG_FpxSwBytes(uc)->xfeatures #endif +// The internal _xstate struct is exposed as fpstate, xstate_hdr, ymmh. However, in reality this is +// fpstate, xstate_hdr, extended_state_area and "technically" we are supposed to be determining the +// offset and size of each XFEATURE_MASK_* via CPUID. The extended region always starts at offset +// 576 which is the same as the address of ymmh + +#define FPREG_Xstate_ExtendedStateArea_Offset offsetof(_xstate, ymmh) +#define FPREG_Xstate_ExtendedStateArea(uc) (reinterpret_cast(FPREG_Fpstate(uc)) + \ + FPREG_Xstate_ExtendedStateArea_Offset) + +struct Xstate_ExtendedFeature +{ + bool initialized; + uint32_t offset; + uint32_t size; +}; + +// XFEATURE_Hi16_ZMM is currently the largest we need and is index 7 +#define Xstate_ExtendedFeatures_Count (7) + +extern Xstate_ExtendedFeature Xstate_ExtendedFeatures[Xstate_ExtendedFeatures_Count]; + inline _fpx_sw_bytes *FPREG_FpxSwBytes(const ucontext_t *uc) { // Bytes 464..511 in the FXSAVE format are available for software to use for any purpose. 
In this case, they are used to @@ -378,7 +422,7 @@ inline UINT32 FPREG_ExtendedSize(const ucontext_t *uc) return FPREG_FpxSwBytes(uc)->extended_size; } -inline bool FPREG_HasYmmRegisters(const ucontext_t *uc) +inline bool FPREG_HasExtendedState(const ucontext_t *uc) { // See comments in /usr/include/x86_64-linux-gnu/asm/sigcontext.h for info on how to detect if extended state is present static_assert_no_msg(FP_XSTATE_MAGIC2_SIZE == sizeof(UINT32)); @@ -401,21 +445,100 @@ inline bool FPREG_HasYmmRegisters(const ucontext_t *uc) return false; } -#if HAVE__FPX_SW_BYTES_WITH_XSTATE_BV - return (FPREG_FpxSwBytes(uc)->xstate_bv & XSTATE_YMM) != 0; -#else - return (FPREG_FpxSwBytes(uc)->xfeatures & XSTATE_YMM) != 0; -#endif + return true; +} + +inline bool FPREG_HasYmmRegisters(const ucontext_t *uc) +{ + if (!FPREG_HasExtendedState(uc)) + { + return false; + } + + return (FPREG_FpxSwBytes_xfeatures(uc) & XFEATURE_MASK_YMM) == XFEATURE_MASK_YMM; } -inline void *FPREG_Xstate_Ymmh(const ucontext_t *uc) +inline void *FPREG_Xstate_ExtendedFeature(const ucontext_t *uc, uint32_t *sz, uint32_t featureIndex) { - static_assert_no_msg(sizeof(reinterpret_cast<_xstate *>(FPREG_Fpstate(uc))->ymmh.ymmh_space) == 16 * 16); + _ASSERTE(sz != nullptr); + _ASSERTE(featureIndex < (sizeof(Xstate_ExtendedFeatures) / sizeof(Xstate_ExtendedFeature))); + _ASSERT(FPREG_Xstate_ExtendedStateArea_Offset == 576); + + Xstate_ExtendedFeature *extendedFeature = &Xstate_ExtendedFeatures[featureIndex]; + + if (!extendedFeature->initialized) + { + int cpuidInfo[4]; + + const int CPUID_EAX = 0; + const int CPUID_EBX = 1; + const int CPUID_ECX = 2; + const int CPUID_EDX = 3; + +#ifdef _DEBUG + // We should only be calling this function if we know the extended feature exists + + __cpuid(cpuidInfo, 0x00000000); + _ASSERTE(static_cast(cpuidInfo[CPUID_EAX]) >= 0x0D); + + __cpuid(cpuidInfo, 0x0000000D); + _ASSERTE((cpuidInfo[CPUID_EAX] & featureIndex) != 0); +#endif // _DEBUG + + __cpuidex(cpuidInfo, 0x0000000D, static_cast(featureIndex)); + + _ASSERTE(static_cast(cpuidInfo[CPUID_EAX]) > 0); + _ASSERTE(static_cast(cpuidInfo[CPUID_EBX]) > FPREG_Xstate_ExtendedStateArea_Offset); + + extendedFeature->size = static_cast(cpuidInfo[CPUID_EAX]); + extendedFeature->offset = static_cast(cpuidInfo[CPUID_EBX] - FPREG_Xstate_ExtendedStateArea_Offset); + + extendedFeature->initialized = true; + } + + *sz = extendedFeature->size; + return (FPREG_Xstate_ExtendedStateArea(uc) + extendedFeature->offset); +} + +inline void *FPREG_Xstate_Ymmh(const ucontext_t *uc, uint32_t *sz) +{ + _ASSERTE(FPREG_HasYmmRegisters(uc)); + return FPREG_Xstate_ExtendedFeature(uc, sz, 2); +} + +inline bool FPREG_HasAvx512Registers(const ucontext_t *uc) +{ + if (!FPREG_HasExtendedState(uc)) + { + return false; + } + + if ((FPREG_FpxSwBytes_xfeatures(uc) & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512) + { + return false; + } + _ASSERTE(FPREG_HasYmmRegisters(uc)); + return true; +} - return reinterpret_cast<_xstate *>(FPREG_Fpstate(uc))->ymmh.ymmh_space; +inline void *FPREG_Xstate_Opmask(const ucontext_t *uc, uint32_t *sz) +{ + _ASSERTE(FPREG_HasAvx512Registers(uc)); + return FPREG_Xstate_ExtendedFeature(uc, sz, 5); +} + +inline void *FPREG_Xstate_ZmmHi256(const ucontext_t *uc, uint32_t *sz) +{ + _ASSERTE(FPREG_HasAvx512Registers(uc)); + return FPREG_Xstate_ExtendedFeature(uc, sz, 6); } +inline void *FPREG_Xstate_Hi16Zmm(const ucontext_t *uc, uint32_t *sz) +{ + _ASSERTE(FPREG_HasAvx512Registers(uc)); + return FPREG_Xstate_ExtendedFeature(uc, sz, 7); +} #endif // XSTATE_SUPPORTED 
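Taken together, these accessors support a uniform copy-out pattern, and the CONTEXTFromNativeContext changes in context.cpp later in this patch use exactly this shape. A condensed sketch, assuming the enclosing function has `native` (a ucontext_t pointer) and `lpContext` (a CONTEXT pointer with the new Ymm0H/KMask0/Zmm0H/Zmm16 fields) in scope:

    uint32_t size;
    if (FPREG_HasYmmRegisters(native))
    {
        // Each accessor returns the component's address and reports its size.
        void* src = FPREG_Xstate_Ymmh(native, &size);    // size == sizeof(M128A) * 16
        memcpy_s(&lpContext->Ymm0H, sizeof(M128A) * 16, src, sizeof(M128A) * 16);

        if (FPREG_HasAvx512Registers(native))
        {
            src = FPREG_Xstate_Opmask(native, &size);    // size == sizeof(DWORD64) * 8
            memcpy_s(&lpContext->KMask0, sizeof(DWORD64) * 8, src, sizeof(DWORD64) * 8);

            src = FPREG_Xstate_ZmmHi256(native, &size);  // size == sizeof(M256A) * 16
            memcpy_s(&lpContext->Zmm0H, sizeof(M256A) * 16, src, sizeof(M256A) * 16);

            src = FPREG_Xstate_Hi16Zmm(native, &size);   // size == sizeof(M512A) * 16
            memcpy_s(&lpContext->Zmm16, sizeof(M512A) * 16, src, sizeof(M512A) * 16);
        }
    }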
///////////////////// @@ -706,11 +829,48 @@ inline bool FPREG_HasYmmRegisters(const ucontext_t *uc) } static_assert_no_msg(offsetof(_STRUCT_X86_AVX_STATE64, __fpu_ymmh0) == offsetof(_STRUCT_X86_AVX512_STATE64, __fpu_ymmh0)); -inline void *FPREG_Xstate_Ymmh(const ucontext_t *uc) + +inline void *FPREG_Xstate_Ymmh(const ucontext_t *uc, uint32_t *sz) { + _ASSERTE(FPREG_HasYmmRegisters(uc)); + _ASSERTE(sz != nullptr); + + *sz = sizeof(_STRUCT_XMM_REG) * 16; return reinterpret_cast(&((_STRUCT_X86_AVX_STATE64&)FPSTATE(uc)).__fpu_ymmh0); } +inline bool FPREG_HasAvx512Registers(const ucontext_t *uc) +{ + _ASSERTE(uc->uc_mcsize == sizeof(_STRUCT_MCONTEXT_AVX512_64)); + return (uc->uc_mcsize == sizeof(_STRUCT_MCONTEXT_AVX512_64)); +} + +inline void *FPREG_Xstate_Opmask(const ucontext_t *uc, uint32_t *sz) +{ + _ASSERTE(FPREG_HasAvx512Registers(uc)); + _ASSERTE(sz != nullptr); + + *sz = sizeof(_STRUCT_OPMASK_REG) * 8; + return reinterpret_cast(&((_STRUCT_MCONTEXT_AVX512_64&)FPSTATE(uc)).__fpu_k0); +} + +inline void *FPREG_Xstate_ZmmHi256(const ucontext_t *uc, uint32_t *sz) +{ + _ASSERTE(FPREG_HasAvx512Registers(uc)); + _ASSERTE(sz != nullptr); + + *sz = sizeof(_STRUCT_YMM_REG) * 16; + return reinterpret_cast(&((_STRUCT_MCONTEXT_AVX512_64&)FPSTATE(uc)).__fpu_zmmh0); +} + +inline void *FPREG_Xstate_Hi16Zmm(const ucontext_t *uc, uint32_t *sz) +{ + _ASSERTE(FPREG_HasAvx512Registers(uc)); + _ASSERTE(sz != nullptr); + + *sz = sizeof(_STRUCT_ZMM_REG) * 16; + return reinterpret_cast(&((_STRUCT_MCONTEXT_AVX512_64&)FPSTATE(uc)).__fpu_zmm16); +} #else //TARGET_OSX // For FreeBSD, as found in x86/ucontext.h diff --git a/src/coreclr/pal/src/thread/context.cpp b/src/coreclr/pal/src/thread/context.cpp index a17c6c077da3b7..68436ab4289a3d 100644 --- a/src/coreclr/pal/src/thread/context.cpp +++ b/src/coreclr/pal/src/thread/context.cpp @@ -318,6 +318,34 @@ typedef int __ptrace_request; #if !HAVE_MACH_EXCEPTIONS +static Xstate_ExtendedFeature Xstate_ExtendedFeatures[Xstate_ExtendedFeatures_Count]; + +#if !__has_builtin(__cpuid) +void __cpuid(int cpuInfo[4], int function_id) +{ + // Based on the Clang implementation provided in cpuid.h: + // https://github.com/llvm/llvm-project/blob/master/clang/lib/Headers/cpuid.h + + __asm(" cpuid\n" \ + : "=a"(cpuInfo[0]), "=b"(cpuInfo[1]), "=c"(cpuInfo[2]), "=d"(cpuInfo[3]) \ + : "0"(function_id) + ); +} +#endif + +#if !__has_builtin(__cpuidex) +void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id) +{ + // Based on the Clang implementation provided in cpuid.h: + // https://github.com/llvm/llvm-project/blob/master/clang/lib/Headers/cpuid.h + + __asm(" cpuid\n" \ + : "=a"(cpuInfo[0]), "=b"(cpuInfo[1]), "=c"(cpuInfo[2]), "=d"(cpuInfo[3]) \ + : "0"(function_id), "2"(subFunction_id) + ); +} +#endif + /*++ Function: CONTEXT_GetRegisters @@ -682,8 +710,34 @@ void CONTEXTToNativeContext(CONST CONTEXT *lpContext, native_context_t *native) #if defined(HOST_AMD64) && defined(XSTATE_SUPPORTED) if ((lpContext->ContextFlags & CONTEXT_XSTATE) == CONTEXT_XSTATE) { - _ASSERTE(FPREG_HasYmmRegisters(native)); - memcpy_s(FPREG_Xstate_Ymmh(native), sizeof(M128A) * 16, lpContext->VectorRegister, sizeof(M128A) * 16); + if (FPREG_HasYmmRegisters(native)) + { + _ASSERT((lpContext->XStateFeaturesMask & XFEATURE_MASK_YMM) == XFEATURE_MASK_YMM); + + uint32_t size; + void *dest; + + dest = FPREG_Xstate_Ymmh(native, &size); + _ASSERT(size == (sizeof(M128A) * 16)); + memcpy_s(dest, sizeof(M128A) * 16, &lpContext->Ymm0H, sizeof(M128A) * 16); + + if (FPREG_HasAvx512Registers(native)) + { + 
_ASSERT((lpContext->XStateFeaturesMask & XFEATURE_MASK_AVX512) == XFEATURE_MASK_AVX512); + + dest = FPREG_Xstate_Opmask(native, &size); + _ASSERT(size == (sizeof(DWORD64) * 8)); + memcpy_s(dest, sizeof(DWORD64) * 8, &lpContext->KMask0, sizeof(DWORD64) * 8); + + dest = FPREG_Xstate_ZmmHi256(native, &size); + _ASSERT(size == (sizeof(M256A) * 16)); + memcpy_s(dest, sizeof(M256A) * 16, &lpContext->Zmm0H, sizeof(M256A) * 16); + + dest = FPREG_Xstate_Hi16Zmm(native, &size); + _ASSERT(size == (sizeof(M512A) * 16)); + memcpy_s(dest, sizeof(M512A) * 16, &lpContext->Zmm16, sizeof(M512A) * 16); + } + } } #endif //HOST_AMD64 && XSTATE_SUPPORTED } @@ -853,7 +907,31 @@ void CONTEXTFromNativeContext(const native_context_t *native, LPCONTEXT lpContex #if XSTATE_SUPPORTED if (FPREG_HasYmmRegisters(native)) { - memcpy_s(lpContext->VectorRegister, sizeof(M128A) * 16, FPREG_Xstate_Ymmh(native), sizeof(M128A) * 16); + uint32_t size; + void *src; + + src = FPREG_Xstate_Ymmh(native, &size); + _ASSERT(size == (sizeof(M128A) * 16)); + memcpy_s(&lpContext->Ymm0H, sizeof(M128A) * 16, src, sizeof(M128A) * 16); + + lpContext->XStateFeaturesMask |= XFEATURE_MASK_YMM; + + if (FPREG_HasAvx512Registers(native)) + { + src = FPREG_Xstate_Opmask(native, &size); + _ASSERT(size == (sizeof(DWORD64) * 8)); + memcpy_s(&lpContext->KMask0, sizeof(DWORD64) * 8, src, sizeof(DWORD64) * 8); + + src = FPREG_Xstate_ZmmHi256(native, &size); + _ASSERT(size == (sizeof(M256A) * 16)); + memcpy_s(&lpContext->Zmm0H, sizeof(M256A) * 16, src, sizeof(M256A) * 16); + + src = FPREG_Xstate_Hi16Zmm(native, &size); + _ASSERT(size == (sizeof(M512A) * 16)); + memcpy_s(&lpContext->Zmm16, sizeof(M512A) * 16, src, sizeof(M512A) * 16); + + lpContext->XStateFeaturesMask |= XFEATURE_MASK_AVX512; + } } else #endif // XSTATE_SUPPORTED @@ -1306,18 +1384,38 @@ CONTEXT_GetThreadContextFromThreadState( } break; - case x86_AVX_STATE64: case x86_AVX512_STATE64: + { + if (lpContext->ContextFlags & CONTEXT_XSTATE & CONTEXT_AREA_MASK) + { + x86_avx512_state64_t *pState = (x86_avx512_state64_t *)threadState; + + memcpy_s(&lpContext->KMask0, sizeof(opmask_reg) * 8, &pState->__fpu_k0, sizeof(opmask_reg) * 8); + memcpy_s(&lpContext->Zmm0H, sizeof(ymm_reg) * 16, &pState->__fpu_zmmh0, sizeof(ymm_reg) * 16); + memcpy_s(&lpContext->Zmm16, sizeof(zmm_reg) * 16, &pState->__fpu_zmm16, sizeof(zmm_reg) * 16); + + lpContext->XStateFeaturesMask |= XFEATURE_MASK_AVX512; + } + + // Intentional fall-through, the AVX512 states are supersets of the AVX state + FALLTHROUGH; + } + + case x86_AVX_STATE64: + { if (lpContext->ContextFlags & CONTEXT_XSTATE & CONTEXT_AREA_MASK) { x86_avx_state64_t *pState = (x86_avx_state64_t *)threadState; - memcpy(&lpContext->VectorRegister, &pState->__fpu_ymmh0, 16 * 16); + memcpy_s(&lpContext->Ymm0H, sizeof(xmm_reg) * 16, &pState->__fpu_ymmh0, sizeof(xmm_reg) * 16); + lpContext->XStateFeaturesMask |= XFEATURE_MASK_YMM; } // Intentional fall-through, the AVX states are supersets of the FLOAT state FALLTHROUGH; + } case x86_FLOAT_STATE64: + { if (lpContext->ContextFlags & CONTEXT_FLOATING_POINT & CONTEXT_AREA_MASK) { x86_float_state64_t *pState = (x86_float_state64_t *)threadState; @@ -1343,6 +1441,8 @@ CONTEXT_GetThreadContextFromThreadState( memcpy(&lpContext->Xmm0, &pState->__fpu_xmm0, 16 * 16); } break; + } + case x86_THREAD_STATE: { x86_thread_state_t *pState = (x86_thread_state_t *)threadState; @@ -1457,17 +1557,30 @@ CONTEXT_SetThreadContextOnPort( #ifdef HOST_AMD64 #ifdef XSTATE_SUPPORTED // We're relying on the fact that the initial portion of - // 
x86_avx_state64_t is identical to x86_float_state64_t. + // x86_avx_state64_t is identical to x86_float_state64_t + // and x86_avx512_state64_t to _x86_avx_state64_t. // Check a few fields to make sure the assumption is correct. static_assert_no_msg(sizeof(x86_avx_state64_t) > sizeof(x86_float_state64_t)); + static_assert_no_msg(sizeof(x86_avx512_state64_t) > sizeof(x86_avx_state64_t)); static_assert_no_msg(offsetof(x86_avx_state64_t, __fpu_fcw) == offsetof(x86_float_state64_t, __fpu_fcw)); static_assert_no_msg(offsetof(x86_avx_state64_t, __fpu_xmm0) == offsetof(x86_float_state64_t, __fpu_xmm0)); + static_assert_no_msg(offsetof(x86_avx512_state64_t, __fpu_fcw) == offsetof(x86_float_state64_t, __fpu_fcw)); + static_assert_no_msg(offsetof(x86_avx512_state64_t, __fpu_xmm0) == offsetof(x86_float_state64_t, __fpu_xmm0)); - x86_avx_state64_t State; + x86_avx512_state64_t State; if (lpContext->ContextFlags & CONTEXT_XSTATE & CONTEXT_AREA_MASK) { - StateFlavor = x86_AVX_STATE64; - StateCount = sizeof(State) / sizeof(natural_t); + if ((lpContext->XStateFeaturesMask & XFEATURE_MASK_AVX512) == XFEATURE_MASK_AVX512) + { + StateFlavor = x86_AVX512_STATE64; + StateCount = sizeof(State) / sizeof(natural_t); + } + else + { + _ASSERT((lpContext->XStateFeaturesMask & XFEATURE_MASK_YMM) == XFEATURE_MASK_YMM); + StateFlavor = x86_AVX_STATE64; + StateCount = sizeof(x86_avx_state64_t) / sizeof(natural_t); + } } else { @@ -1520,7 +1633,19 @@ CONTEXT_SetThreadContextOnPort( #if defined(HOST_AMD64) && defined(XSTATE_SUPPORTED) if (lpContext->ContextFlags & CONTEXT_XSTATE & CONTEXT_AREA_MASK) { - memcpy(&State.__fpu_ymmh0, lpContext->VectorRegister, 16 * 16); + if ((lpContext->XStateFeaturesMask & XFEATURE_MASK_AVX512) == XFEATURE_MASK_AVX512) + { + memcpy_s(&State.__fpu_k0, sizeof(_STRUCT_OPMASK_REG) * 8, lpContext->KMask0, + sizeof(_STRUCT_OPMASK_REG) * 8); + memcpy_s(&State.__fpu_zmmh0, sizeof(_STRUCT_YMM_REG) * 16, lpContext->Zmm0H, + sizeof(_STRUCT_YMM_REG) * 16); + memcpy_s(&State.__fpu_zmm16, sizeof(_STRUCT_ZMM_REG) * 16, lpContext->Zmm16, + sizeof(_STRUCT_ZMM_REG) * 16); + } + + _ASSERT((lpContext->XStateFeaturesMask & XFEATURE_MASK_YMM) == XFEATURE_MASK_YMM); + memcpy_s(&State.__fpu_ymmh0, sizeof(_STRUCT_XMM_REG) * 16, lpContext->Ymm0H, sizeof(_STRUCT_XMM_REG) * 16); + } #endif diff --git a/src/coreclr/vm/amd64/asmconstants.h b/src/coreclr/vm/amd64/asmconstants.h index cbd0eae0c79df4..a3d3d4447f2115 100644 --- a/src/coreclr/vm/amd64/asmconstants.h +++ b/src/coreclr/vm/amd64/asmconstants.h @@ -285,7 +285,7 @@ ASMCONSTANTS_C_ASSERT(OFFSETOF__MethodDesc__m_wFlags == offsetof(MethodDesc, m_w ASMCONSTANTS_C_ASSERT(OFFSETOF__VASigCookie__pNDirectILStub == offsetof(VASigCookie, pNDirectILStub)); -#define SIZEOF__CONTEXT (8*6 + 4*2 + 2*6 + 4 + 8*6 + 8*16 + 8 + /*XMM_SAVE_AREA32*/(2*2 + 1*2 + 2 + 4 + 2*2 + 4 + 2*2 + 4*2 + 16*8 + 16*16 + 1*96) + 26*16 + 8 + 8*5) +#define SIZEOF__CONTEXT (8*6 + 4*2 + 2*6 + 4 + 8*6 + 8*16 + 8 + /*XMM_SAVE_AREA32*/(2*2 + 1*2 + 2 + 4 + 2*2 + 4 + 2*2 + 4*2 + 16*8 + 16*16 + 1*96) + 26*16 + 8 + 8*5 + /*XSTATE*/ + 8 + 8 + /*XSTATE_AVX*/ 16*16 + /*XSTATE_AVX512_KMASK*/ 8*8 + /*XSTATE_AVX512_ZMM_H*/ 32*16 + 8*4 + /*XSTATE_AVX512_ZMM*/ 64*16) ASMCONSTANTS_C_ASSERT(SIZEOF__CONTEXT == sizeof(CONTEXT)); @@ -437,7 +437,7 @@ ASMCONSTANTS_C_ASSERT(OFFSETOF__CONTEXT__Xmm15 ASMCONSTANTS_C_ASSERT(OFFSETOF__CONTEXT__VectorRegister == offsetof(CONTEXT, VectorRegister[0])); -#define SIZEOF__FaultingExceptionFrame (0x20 + SIZEOF__CONTEXT) +#define SIZEOF__FaultingExceptionFrame (0x40 + SIZEOF__CONTEXT) 
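For reference, the XSTATE tail appended to SIZEOF__CONTEXT above breaks down as follows; this is a worked check of the arithmetic, not part of the patch:

    //   8 +  8   XStateFeaturesMask + XStateReserved0  =   16
    // 16 * 16    Ymm0H..Ymm15H  (M128A)                =  256
    //  8 *  8    KMask0..KMask7 (DWORD64)              =   64
    // 32 * 16    Zmm0H..Zmm15H  (M256A)                =  512
    //  8 *  4    XStateReserved1                       =   32
    // 64 * 16    Zmm16..Zmm31   (M512A)                = 1024
    //                                          total   = 1904
    // 1232 (previous CONTEXT size) + 1904 = 3136 == sizeof(CONTEXT)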
ASMCONSTANTS_C_ASSERT(SIZEOF__FaultingExceptionFrame == sizeof(FaultingExceptionFrame)); diff --git a/src/coreclr/vm/amd64/unixstubs.cpp b/src/coreclr/vm/amd64/unixstubs.cpp index 8fdcfd15a9b3eb..77da7da8bfebd5 100644 --- a/src/coreclr/vm/amd64/unixstubs.cpp +++ b/src/coreclr/vm/amd64/unixstubs.cpp @@ -10,32 +10,6 @@ extern "C" PORTABILITY_ASSERT("Implement for PAL"); } -#if !__has_builtin(__cpuid) - void __cpuid(int cpuInfo[4], int function_id) - { - // Based on the Clang implementation provided in cpuid.h: - // https://github.com/llvm/llvm-project/blob/master/clang/lib/Headers/cpuid.h - - __asm(" cpuid\n" \ - : "=a"(cpuInfo[0]), "=b"(cpuInfo[1]), "=c"(cpuInfo[2]), "=d"(cpuInfo[3]) \ - : "0"(function_id) - ); - } -#endif - -#if !__has_builtin(__cpuidex) - void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id) - { - // Based on the Clang implementation provided in cpuid.h: - // https://github.com/llvm/llvm-project/blob/master/clang/lib/Headers/cpuid.h - - __asm(" cpuid\n" \ - : "=a"(cpuInfo[0]), "=b"(cpuInfo[1]), "=c"(cpuInfo[2]), "=d"(cpuInfo[3]) \ - : "0"(function_id), "2"(subFunction_id) - ); - } -#endif - DWORD xmmYmmStateSupport() { DWORD eax; diff --git a/src/coreclr/vm/cgensys.h b/src/coreclr/vm/cgensys.h index 75f266be916023..b7cc4c715a51b0 100644 --- a/src/coreclr/vm/cgensys.h +++ b/src/coreclr/vm/cgensys.h @@ -93,13 +93,6 @@ inline void GetSpecificCpuInfo(CORINFO_CPU * cpuInfo) #endif // !TARGET_X86 #if (defined(TARGET_X86) || defined(TARGET_AMD64)) -#ifdef TARGET_UNIX -// MSVC directly defines intrinsics for __cpuid and __cpuidex matching the below signatures -// We define matching signatures for use on Unix platforms. - -extern "C" void __cpuid(int cpuInfo[4], int function_id); -extern "C" void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id); -#endif // TARGET_UNIX extern "C" DWORD xmmYmmStateSupport(); extern "C" DWORD avx512StateSupport(); #endif diff --git a/src/coreclr/vm/i386/cgenx86.cpp b/src/coreclr/vm/i386/cgenx86.cpp index 020593b8735361..8d17ace05de965 100644 --- a/src/coreclr/vm/i386/cgenx86.cpp +++ b/src/coreclr/vm/i386/cgenx86.cpp @@ -1133,32 +1133,6 @@ extern "C" DWORD avx512StateSupport() #else // !TARGET_UNIX -#if !__has_builtin(__cpuid) -void __cpuid(int cpuInfo[4], int function_id) -{ - // Based on the Clang implementation provided in cpuid.h: - // https://github.com/llvm/llvm-project/blob/master/clang/lib/Headers/cpuid.h - - __asm(" cpuid" - : "=a"(cpuInfo[0]), "=b"(cpuInfo[1]), "=c"(cpuInfo[2]), "=d"(cpuInfo[3]) \ - : "0"(function_id) - ); -} -#endif - -#if !__has_builtin(__cpuidex) -void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id) -{ - // Based on the Clang implementation provided in cpuid.h: - // https://github.com/llvm/llvm-project/blob/master/clang/lib/Headers/cpuid.h - - __asm(" cpuid" - : "=a"(cpuInfo[0]), "=b"(cpuInfo[1]), "=c"(cpuInfo[2]), "=d"(cpuInfo[3]) \ - : "0"(function_id), "2"(subFunction_id) - ); -} -#endif - extern "C" DWORD xmmYmmStateSupport() { DWORD eax; From a1779a88d6cb6462aa7d0152f01b3e59d7100002 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Wed, 22 Mar 2023 11:19:16 -0700 Subject: [PATCH 02/16] Ensure compilation works when XSTATE_SUPPORTED isn't defined --- src/coreclr/pal/src/thread/context.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/coreclr/pal/src/thread/context.cpp b/src/coreclr/pal/src/thread/context.cpp index 68436ab4289a3d..1fecd355cea708 100644 --- a/src/coreclr/pal/src/thread/context.cpp +++ b/src/coreclr/pal/src/thread/context.cpp @@ -318,7 +318,9 @@ 
typedef int __ptrace_request; #if !HAVE_MACH_EXCEPTIONS +#ifdef XSTATE_SUPPORTED static Xstate_ExtendedFeature Xstate_ExtendedFeatures[Xstate_ExtendedFeatures_Count]; +#endif // XSTATE_SUPPORTED #if !__has_builtin(__cpuid) void __cpuid(int cpuInfo[4], int function_id) From cf9164102e967998aebc2d42f5b6def8def4fdfe Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Wed, 22 Mar 2023 11:32:28 -0700 Subject: [PATCH 03/16] Ensure __cpuid isn't implemented on Arm32/Arm64 --- src/coreclr/pal/src/thread/context.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/coreclr/pal/src/thread/context.cpp b/src/coreclr/pal/src/thread/context.cpp index 1fecd355cea708..7e4897c52e1d13 100644 --- a/src/coreclr/pal/src/thread/context.cpp +++ b/src/coreclr/pal/src/thread/context.cpp @@ -322,6 +322,7 @@ typedef int __ptrace_request; static Xstate_ExtendedFeature Xstate_ExtendedFeatures[Xstate_ExtendedFeatures_Count]; #endif // XSTATE_SUPPORTED +#if defined(HOST_X86) || defined(HOST_AMD64) #if !__has_builtin(__cpuid) void __cpuid(int cpuInfo[4], int function_id) { @@ -347,6 +348,7 @@ void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id) ); } #endif +#endif // HOST_x86 || HOST_AMD64 /*++ Function: From c91ce32342866602f09c0f040e294e364f4bd47f Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Wed, 22 Mar 2023 12:25:53 -0700 Subject: [PATCH 04/16] Fixing a couple of build failures --- src/coreclr/pal/src/include/pal/context.h | 6 +++--- src/coreclr/vm/amd64/asmconstants.h | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/coreclr/pal/src/include/pal/context.h b/src/coreclr/pal/src/include/pal/context.h index 0e0346c2ebfd05..46a1ef61824d4e 100644 --- a/src/coreclr/pal/src/include/pal/context.h +++ b/src/coreclr/pal/src/include/pal/context.h @@ -851,7 +851,7 @@ inline void *FPREG_Xstate_Opmask(const ucontext_t *uc, uint32_t *sz) _ASSERTE(sz != nullptr); *sz = sizeof(_STRUCT_OPMASK_REG) * 8; - return reinterpret_cast(&((_STRUCT_MCONTEXT_AVX512_64&)FPSTATE(uc)).__fpu_k0); + return reinterpret_cast(&((_STRUCT_X86_AVX512_STATE64&)FPSTATE(uc)).__fpu_k0); } inline void *FPREG_Xstate_ZmmHi256(const ucontext_t *uc, uint32_t *sz) @@ -860,7 +860,7 @@ inline void *FPREG_Xstate_ZmmHi256(const ucontext_t *uc, uint32_t *sz) _ASSERTE(sz != nullptr); *sz = sizeof(_STRUCT_YMM_REG) * 16; - return reinterpret_cast(&((_STRUCT_MCONTEXT_AVX512_64&)FPSTATE(uc)).__fpu_zmmh0); + return reinterpret_cast(&((_STRUCT_X86_AVX512_STATE64&)FPSTATE(uc)).__fpu_zmmh0); } inline void *FPREG_Xstate_Hi16Zmm(const ucontext_t *uc, uint32_t *sz) @@ -869,7 +869,7 @@ inline void *FPREG_Xstate_Hi16Zmm(const ucontext_t *uc, uint32_t *sz) _ASSERTE(sz != nullptr); *sz = sizeof(_STRUCT_ZMM_REG) * 16; - return reinterpret_cast(&((_STRUCT_MCONTEXT_AVX512_64&)FPSTATE(uc)).__fpu_zmm16); + return reinterpret_cast(&((_STRUCT_X86_AVX512_STATE64&)FPSTATE(uc)).__fpu_zmm16); } #else //TARGET_OSX diff --git a/src/coreclr/vm/amd64/asmconstants.h b/src/coreclr/vm/amd64/asmconstants.h index a3d3d4447f2115..fec8af866a0cf6 100644 --- a/src/coreclr/vm/amd64/asmconstants.h +++ b/src/coreclr/vm/amd64/asmconstants.h @@ -285,7 +285,9 @@ ASMCONSTANTS_C_ASSERT(OFFSETOF__MethodDesc__m_wFlags == offsetof(MethodDesc, m_w ASMCONSTANTS_C_ASSERT(OFFSETOF__VASigCookie__pNDirectILStub == offsetof(VASigCookie, pNDirectILStub)); -#define SIZEOF__CONTEXT (8*6 + 4*2 + 2*6 + 4 + 8*6 + 8*16 + 8 + /*XMM_SAVE_AREA32*/(2*2 + 1*2 + 2 + 4 + 2*2 + 4 + 2*2 + 4*2 + 16*8 + 16*16 + 1*96) + 26*16 + 8 + 8*5 + /*XSTATE*/ + 8 + 8 + /*XSTATE_AVX*/ 16*16 + 
/*XSTATE_AVX512_KMASK*/ 8*8 + /*XSTATE_AVX512_ZMM_H*/ 32*16 + 8*4 + /*XSTATE_AVX512_ZMM*/ 64*16) +// Expression is too complicated, is currently: +// (8*6 + 4*2 + 2*6 + 4 + 8*6 + 8*16 + 8 + /*XMM_SAVE_AREA32*/(2*2 + 1*2 + 2 + 4 + 2*2 + 4 + 2*2 + 4*2 + 16*8 + 16*16 + 1*96) + 26*16 + 8 + 8*5 + /*XSTATE*/ + 8 + 8 + /*XSTATE_AVX*/ 16*16 + /*XSTATE_AVX512_KMASK*/ 8*8 + /*XSTATE_AVX512_ZMM_H*/ 32*16 + 8*4 + /*XSTATE_AVX512_ZMM*/ 64*16) +#define SIZEOF__CONTEXT (3136) ASMCONSTANTS_C_ASSERT(SIZEOF__CONTEXT == sizeof(CONTEXT)); From 80ad388b345a366d9998e446702005309db10308 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Wed, 22 Mar 2023 12:48:57 -0700 Subject: [PATCH 05/16] Fixing some OSX failures --- src/coreclr/pal/src/arch/amd64/asmconstants.h | 15 +++++++--- src/coreclr/pal/src/arch/amd64/context2.S | 2 +- src/coreclr/pal/src/thread/context.cpp | 28 +++++++++---------- 3 files changed, 26 insertions(+), 19 deletions(-) diff --git a/src/coreclr/pal/src/arch/amd64/asmconstants.h b/src/coreclr/pal/src/arch/amd64/asmconstants.h index b40d26ce6c9b2c..5a5184b1ed95f0 100644 --- a/src/coreclr/pal/src/arch/amd64/asmconstants.h +++ b/src/coreclr/pal/src/arch/amd64/asmconstants.h @@ -3,10 +3,17 @@ #ifdef HOST_64BIT -#define XFEATURE_MASK_OPMASK (1 << 5) -#define XFEATURE_MASK_ZMM_Hi256 (1 << 6) -#define XFEATURE_MASK_Hi16_ZMM (1 << 7) -#define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK | XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM) +#define XSTATE_GSSE (2) +#define XSTATE_AVX (XSTATE_GSSE) +#define XSTATE_AVX512_KMASK (5) +#define XSTATE_AVX512_ZMM_H (6) +#define XSTATE_AVX512_ZMM (7) + +#define XSTATE_MASK_GSSE (1 << (XSTATE_GSSE)) +#define XSTATE_MASK_AVX (XSTATE_MASK_GSSE) +#define XSTATE_MASK_AVX512 ((1 << (XSTATE_AVX512_KMASK)) | \ + (1 << (XSTATE_AVX512_ZMM_H)) | \ + (1 << (XSTATE_AVX512_ZMM))) // The arch bit is normally set in the flag constants below. Since this is already arch-specific code and the arch bit is not // relevant, the arch bit is excluded from the flag constants below for simpler tests. 
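Note that XSTATE_MASK_AVX512 here evaluates to (1 << 5) | (1 << 6) | (1 << 7) == 0xE0, so it still fits in the low byte of XStateFeaturesMask; that is what keeps the single-byte test in context2.S below valid. A compile-time restatement of that assumption, illustrative only:

    static_assert((XSTATE_MASK_AVX512 == 0xE0) && (XSTATE_MASK_AVX512 <= 0xFF),
                  "XSTATE_MASK_AVX512 must fit in one byte for 'test BYTE PTR'");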
diff --git a/src/coreclr/pal/src/arch/amd64/context2.S b/src/coreclr/pal/src/arch/amd64/context2.S index a35b7365fc78f3..64ef2b524465a7 100644 --- a/src/coreclr/pal/src/arch/amd64/context2.S +++ b/src/coreclr/pal/src/arch/amd64/context2.S @@ -125,7 +125,7 @@ LOCAL_LABEL(Done_Restore_CONTEXT_FLOATING_POINT): vinsertf128 ymm14, ymm14, xmmword ptr [rdi + (CONTEXT_Ymm0H + 14 * 16)], 1 vinsertf128 ymm15, ymm15, xmmword ptr [rdi + (CONTEXT_Ymm0H + 15 * 16)], 1 - test BYTE PTR [rdi + CONTEXT_XStateFeaturesMask], XFEATURE_MASK_AVX512 + test BYTE PTR [rdi + CONTEXT_XStateFeaturesMask], XSTATE_MASK_AVX512 je LOCAL_LABEL(Done_Restore_CONTEXT_XSTATE) // Restore the Opmask state diff --git a/src/coreclr/pal/src/thread/context.cpp b/src/coreclr/pal/src/thread/context.cpp index 7e4897c52e1d13..161d26a2b3486d 100644 --- a/src/coreclr/pal/src/thread/context.cpp +++ b/src/coreclr/pal/src/thread/context.cpp @@ -716,7 +716,7 @@ void CONTEXTToNativeContext(CONST CONTEXT *lpContext, native_context_t *native) { if (FPREG_HasYmmRegisters(native)) { - _ASSERT((lpContext->XStateFeaturesMask & XFEATURE_MASK_YMM) == XFEATURE_MASK_YMM); + _ASSERT((lpContext->XStateFeaturesMask & XSTATE_MASK_AVX) == XSTATE_MASK_AVX); uint32_t size; void *dest; @@ -727,7 +727,7 @@ void CONTEXTToNativeContext(CONST CONTEXT *lpContext, native_context_t *native) if (FPREG_HasAvx512Registers(native)) { - _ASSERT((lpContext->XStateFeaturesMask & XFEATURE_MASK_AVX512) == XFEATURE_MASK_AVX512); + _ASSERT((lpContext->XStateFeaturesMask & XSTATE_MASK_AVX512) == XSTATE_MASK_AVX512); dest = FPREG_Xstate_Opmask(native, &size); _ASSERT(size == (sizeof(DWORD64) * 8)); @@ -918,7 +918,7 @@ void CONTEXTFromNativeContext(const native_context_t *native, LPCONTEXT lpContex _ASSERT(size == (sizeof(M128A) * 16)); memcpy_s(&lpContext->Ymm0H, sizeof(M128A) * 16, src, sizeof(M128A) * 16); - lpContext->XStateFeaturesMask |= XFEATURE_MASK_YMM; + lpContext->XStateFeaturesMask |= XSTATE_MASK_AVX; if (FPREG_HasAvx512Registers(native)) { @@ -934,7 +934,7 @@ void CONTEXTFromNativeContext(const native_context_t *native, LPCONTEXT lpContex _ASSERT(size == (sizeof(M512A) * 16)); memcpy_s(&lpContext->Zmm16, sizeof(M512A) * 16, src, sizeof(M512A) * 16); - lpContext->XStateFeaturesMask |= XFEATURE_MASK_AVX512; + lpContext->XStateFeaturesMask |= XSTATE_MASK_AVX512; } } else @@ -1394,11 +1394,11 @@ CONTEXT_GetThreadContextFromThreadState( { x86_avx512_state64_t *pState = (x86_avx512_state64_t *)threadState; - memcpy_s(&lpContext->KMask0, sizeof(opmask_reg) * 8, &pState->__fpu_k0, sizeof(opmask_reg) * 8); - memcpy_s(&lpContext->Zmm0H, sizeof(ymm_reg) * 16, &pState->__fpu_zmmh0, sizeof(ymm_reg) * 16); - memcpy_s(&lpContext->Zmm16, sizeof(zmm_reg) * 16, &pState->__fpu_zmm16, sizeof(zmm_reg) * 16); + memcpy(&lpContext->KMask0, &pState->__fpu_k0, sizeof(_STRUCT_OPMASK_REG) * 8); + memcpy(&lpContext->Zmm0H, &pState->__fpu_zmmh0, sizeof(_STRUCT_YMM_REG) * 16); + memcpy(&lpContext->Zmm16, &pState->__fpu_zmm16, sizeof(_STRUCT_ZMM_REG) * 16); - lpContext->XStateFeaturesMask |= XFEATURE_MASK_AVX512; + lpContext->XStateFeaturesMask |= XSTATE_MASK_AVX512; } // Intentional fall-through, the AVX512 states are supersets of the AVX state @@ -1410,8 +1410,8 @@ CONTEXT_GetThreadContextFromThreadState( if (lpContext->ContextFlags & CONTEXT_XSTATE & CONTEXT_AREA_MASK) { x86_avx_state64_t *pState = (x86_avx_state64_t *)threadState; - memcpy_s(&lpContext->Ymm0H, sizeof(xmm_reg) * 16, &pState->__fpu_ymmh0, sizeof(xmm_reg) * 16); - lpContext->XStateFeaturesMask |= XFEATURE_MASK_YMM; + 
memcpy_s(&lpContext->Ymm0H, &pState->__fpu_ymmh0, sizeof(_STRUCT_XMM_REG) * 16); + lpContext->XStateFeaturesMask |= XSTATE_MASK_AVX; } // Intentional fall-through, the AVX states are supersets of the FLOAT state @@ -1574,14 +1574,14 @@ CONTEXT_SetThreadContextOnPort( x86_avx512_state64_t State; if (lpContext->ContextFlags & CONTEXT_XSTATE & CONTEXT_AREA_MASK) { - if ((lpContext->XStateFeaturesMask & XFEATURE_MASK_AVX512) == XFEATURE_MASK_AVX512) + if ((lpContext->XStateFeaturesMask & XSTATE_MASK_AVX512) == XSTATE_MASK_AVX512) { StateFlavor = x86_AVX512_STATE64; StateCount = sizeof(State) / sizeof(natural_t); } else { - _ASSERT((lpContext->XStateFeaturesMask & XFEATURE_MASK_YMM) == XFEATURE_MASK_YMM); + _ASSERT((lpContext->XStateFeaturesMask & XSTATE_MASK_AVX) == XSTATE_MASK_AVX); StateFlavor = x86_AVX_STATE64; StateCount = sizeof(x86_avx_state64_t) / sizeof(natural_t); } @@ -1637,7 +1637,7 @@ CONTEXT_SetThreadContextOnPort( #if defined(HOST_AMD64) && defined(XSTATE_SUPPORTED) if (lpContext->ContextFlags & CONTEXT_XSTATE & CONTEXT_AREA_MASK) { - if ((lpContext->XStateFeaturesMask & XFEATURE_MASK_AVX512) == XFEATURE_MASK_AVX512) + if ((lpContext->XStateFeaturesMask & XSTATE_MASK_AVX512) == XSTATE_MASK_AVX512) { memcpy_s(&State.__fpu_k0, sizeof(_STRUCT_OPMASK_REG) * 8, lpContext->KMask0, sizeof(_STRUCT_OPMASK_REG) * 8); @@ -1647,7 +1647,7 @@ CONTEXT_SetThreadContextOnPort( sizeof(_STRUCT_ZMM_REG) * 16); } - _ASSERT((lpContext->XStateFeaturesMask & XFEATURE_MASK_YMM) == XFEATURE_MASK_YMM); + _ASSERT((lpContext->XStateFeaturesMask & XSTATE_MASK_AVX) == XSTATE_MASK_AVX); memcpy_s(&State.__fpu_ymmh0, sizeof(_STRUCT_XMM_REG) * 16, lpContext->Ymm0H, sizeof(_STRUCT_XMM_REG) * 16); } From c7898c8cbc81bdbcccbe931859d7681d9229504c Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Wed, 22 Mar 2023 13:07:59 -0700 Subject: [PATCH 06/16] More OSX fixes --- src/coreclr/pal/inc/pal.h | 8 ++++---- src/coreclr/pal/src/thread/context.cpp | 14 +++++--------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/src/coreclr/pal/inc/pal.h b/src/coreclr/pal/inc/pal.h index d89734c12bcf1f..b7c59f70a7f49b 100644 --- a/src/coreclr/pal/inc/pal.h +++ b/src/coreclr/pal/inc/pal.h @@ -1501,11 +1501,11 @@ extern "C" void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id); #define XSTATE_AVX512_ZMM_H (6) #define XSTATE_AVX512_ZMM (7) -#define XSTATE_MASK_GSSE (1ui64 << (XSTATE_GSSE)) +#define XSTATE_MASK_GSSE (UI64(1) << (XSTATE_GSSE)) #define XSTATE_MASK_AVX (XSTATE_MASK_GSSE) -#define XSTATE_MASK_AVX512 ((1ui64 << (XSTATE_AVX512_KMASK)) | \ - (1ui64 << (XSTATE_AVX512_ZMM_H)) | \ - (1ui64 << (XSTATE_AVX512_ZMM))) +#define XSTATE_MASK_AVX512 ((UI64(1) << (XSTATE_AVX512_KMASK)) | \ + (UI64(1) << (XSTATE_AVX512_ZMM_H)) | \ + (UI64(1) << (XSTATE_AVX512_ZMM))) typedef struct DECLSPEC_ALIGN(16) _M128A { ULONGLONG Low; diff --git a/src/coreclr/pal/src/thread/context.cpp b/src/coreclr/pal/src/thread/context.cpp index 161d26a2b3486d..c487f5376abcc0 100644 --- a/src/coreclr/pal/src/thread/context.cpp +++ b/src/coreclr/pal/src/thread/context.cpp @@ -1410,7 +1410,7 @@ CONTEXT_GetThreadContextFromThreadState( if (lpContext->ContextFlags & CONTEXT_XSTATE & CONTEXT_AREA_MASK) { x86_avx_state64_t *pState = (x86_avx_state64_t *)threadState; - memcpy_s(&lpContext->Ymm0H, &pState->__fpu_ymmh0, sizeof(_STRUCT_XMM_REG) * 16); + memcpy(&lpContext->Ymm0H, &pState->__fpu_ymmh0, sizeof(_STRUCT_XMM_REG) * 16); lpContext->XStateFeaturesMask |= XSTATE_MASK_AVX; } @@ -1639,17 +1639,13 @@ CONTEXT_SetThreadContextOnPort( { if 
((lpContext->XStateFeaturesMask & XSTATE_MASK_AVX512) == XSTATE_MASK_AVX512) { - memcpy_s(&State.__fpu_k0, sizeof(_STRUCT_OPMASK_REG) * 8, lpContext->KMask0, - sizeof(_STRUCT_OPMASK_REG) * 8); - memcpy_s(&State.__fpu_zmmh0, sizeof(_STRUCT_YMM_REG) * 16, lpContext->Zmm0H, - sizeof(_STRUCT_YMM_REG) * 16); - memcpy_s(&State.__fpu_zmm16, sizeof(_STRUCT_ZMM_REG) * 16, lpContext->Zmm16, - sizeof(_STRUCT_ZMM_REG) * 16); + memcpy(&State.__fpu_k0, lpContext->KMask0, sizeof(_STRUCT_OPMASK_REG) * 8); + memcpy(&State.__fpu_zmmh0, lpContext->Zmm0H, sizeof(_STRUCT_YMM_REG) * 16); + memcpy(&State.__fpu_zmm16, lpContext->Zmm16, sizeof(_STRUCT_ZMM_REG) * 16); } _ASSERT((lpContext->XStateFeaturesMask & XSTATE_MASK_AVX) == XSTATE_MASK_AVX); - memcpy_s(&State.__fpu_ymmh0, sizeof(_STRUCT_XMM_REG) * 16, lpContext->Ymm0H, sizeof(_STRUCT_XMM_REG) * 16); - + memcpy(&State.__fpu_ymmh0, lpContext->Ymm0H, sizeof(_STRUCT_XMM_REG) * 16); } #endif From bc780f45bc42c30e1182ca398b7cd5d2b3ffc8ab Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Wed, 22 Mar 2023 13:33:17 -0700 Subject: [PATCH 07/16] More build fixes --- src/coreclr/pal/src/thread/context.cpp | 10 +++++----- src/coreclr/vm/amd64/asmconstants.h | 10 ++++++++++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/coreclr/pal/src/thread/context.cpp b/src/coreclr/pal/src/thread/context.cpp index c487f5376abcc0..ec9135fbd1bdde 100644 --- a/src/coreclr/pal/src/thread/context.cpp +++ b/src/coreclr/pal/src/thread/context.cpp @@ -319,7 +319,7 @@ typedef int __ptrace_request; #if !HAVE_MACH_EXCEPTIONS #ifdef XSTATE_SUPPORTED -static Xstate_ExtendedFeature Xstate_ExtendedFeatures[Xstate_ExtendedFeatures_Count]; +Xstate_ExtendedFeature Xstate_ExtendedFeatures[Xstate_ExtendedFeatures_Count]; #endif // XSTATE_SUPPORTED #if defined(HOST_X86) || defined(HOST_AMD64) @@ -1639,13 +1639,13 @@ CONTEXT_SetThreadContextOnPort( { if ((lpContext->XStateFeaturesMask & XSTATE_MASK_AVX512) == XSTATE_MASK_AVX512) { - memcpy(&State.__fpu_k0, lpContext->KMask0, sizeof(_STRUCT_OPMASK_REG) * 8); - memcpy(&State.__fpu_zmmh0, lpContext->Zmm0H, sizeof(_STRUCT_YMM_REG) * 16); - memcpy(&State.__fpu_zmm16, lpContext->Zmm16, sizeof(_STRUCT_ZMM_REG) * 16); + memcpy(&State.__fpu_k0, &lpContext->KMask0, sizeof(_STRUCT_OPMASK_REG) * 8); + memcpy(&State.__fpu_zmmh0, &lpContext->Zmm0H, sizeof(_STRUCT_YMM_REG) * 16); + memcpy(&State.__fpu_zmm16, &lpContext->Zmm16, sizeof(_STRUCT_ZMM_REG) * 16); } _ASSERT((lpContext->XStateFeaturesMask & XSTATE_MASK_AVX) == XSTATE_MASK_AVX); - memcpy(&State.__fpu_ymmh0, lpContext->Ymm0H, sizeof(_STRUCT_XMM_REG) * 16); + memcpy(&State.__fpu_ymmh0, &lpContext->Ymm0H, sizeof(_STRUCT_XMM_REG) * 16); } #endif diff --git a/src/coreclr/vm/amd64/asmconstants.h b/src/coreclr/vm/amd64/asmconstants.h index fec8af866a0cf6..977fae7791619a 100644 --- a/src/coreclr/vm/amd64/asmconstants.h +++ b/src/coreclr/vm/amd64/asmconstants.h @@ -285,9 +285,15 @@ ASMCONSTANTS_C_ASSERT(OFFSETOF__MethodDesc__m_wFlags == offsetof(MethodDesc, m_w ASMCONSTANTS_C_ASSERT(OFFSETOF__VASigCookie__pNDirectILStub == offsetof(VASigCookie, pNDirectILStub)); +#ifdef UNIX_AMD64_ABI // Expression is too complicated, is currently: // (8*6 + 4*2 + 2*6 + 4 + 8*6 + 8*16 + 8 + /*XMM_SAVE_AREA32*/(2*2 + 1*2 + 2 + 4 + 2*2 + 4 + 2*2 + 4*2 + 16*8 + 16*16 + 1*96) + 26*16 + 8 + 8*5 + /*XSTATE*/ + 8 + 8 + /*XSTATE_AVX*/ 16*16 + /*XSTATE_AVX512_KMASK*/ 8*8 + /*XSTATE_AVX512_ZMM_H*/ 32*16 + 8*4 + /*XSTATE_AVX512_ZMM*/ 64*16) #define SIZEOF__CONTEXT (3136) +#else + // Expression is too complicated, is currently: 
+// (8*6 + 4*2 + 2*6 + 4 + 8*6 + 8*16 + 8 + /*XMM_SAVE_AREA32*/(2*2 + 1*2 + 2 + 4 + 2*2 + 4 + 2*2 + 4*2 + 16*8 + 16*16 + 1*96) + 26*16 + 8 + 8*5) +#define SIZEOF__CONTEXT (1232) +#endif ASMCONSTANTS_C_ASSERT(SIZEOF__CONTEXT == sizeof(CONTEXT)); @@ -439,7 +445,11 @@ ASMCONSTANTS_C_ASSERT(OFFSETOF__CONTEXT__Xmm15 ASMCONSTANTS_C_ASSERT(OFFSETOF__CONTEXT__VectorRegister == offsetof(CONTEXT, VectorRegister[0])); +#ifdef UNIX_AMD64_ABI #define SIZEOF__FaultingExceptionFrame (0x40 + SIZEOF__CONTEXT) +#else +#define SIZEOF__FaultingExceptionFrame (0x20 + SIZEOF__CONTEXT) +#endif ASMCONSTANTS_C_ASSERT(SIZEOF__FaultingExceptionFrame == sizeof(FaultingExceptionFrame)); From 6aec50079b438a264b5a90c6d861e9a9a8d0446b Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Wed, 22 Mar 2023 14:25:29 -0700 Subject: [PATCH 08/16] AltJit and another OSX build fix --- src/coreclr/pal/inc/pal.h | 39 ++++++++++++++++++++------ src/coreclr/pal/src/thread/context.cpp | 28 ------------------ src/coreclr/vm/amd64/asmconstants.h | 6 ++-- 3 files changed, 34 insertions(+), 39 deletions(-) diff --git a/src/coreclr/pal/inc/pal.h b/src/coreclr/pal/inc/pal.h index b7c59f70a7f49b..ff818531ee64ee 100644 --- a/src/coreclr/pal/inc/pal.h +++ b/src/coreclr/pal/inc/pal.h @@ -1315,13 +1315,42 @@ QueueUserAPC( IN HANDLE hThread, IN ULONG_PTR dwData); -#ifdef HOST_X86 - +#if defined(HOST_X86) || defined(HOST_AMD64) // MSVC directly defines intrinsics for __cpuid and __cpuidex matching the below signatures // We define matching signatures for use on Unix platforms. +#if __has_builtin(__cpuid) extern "C" void __cpuid(int cpuInfo[4], int function_id); +#else +inline void __cpuid(int cpuInfo[4], int function_id) +{ + // Based on the Clang implementation provided in cpuid.h: + // https://github.com/llvm/llvm-project/blob/master/clang/lib/Headers/cpuid.h + + __asm(" cpuid\n" \ + : "=a"(cpuInfo[0]), "=b"(cpuInfo[1]), "=c"(cpuInfo[2]), "=d"(cpuInfo[3]) \ + : "0"(function_id) + ); +} +#endif // __cpuid + +#if __has_builtin(__cpuidex) extern "C" void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id); +#else +inline void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id) +{ + // Based on the Clang implementation provided in cpuid.h: + // https://github.com/llvm/llvm-project/blob/master/clang/lib/Headers/cpuid.h + + __asm(" cpuid\n" \ + : "=a"(cpuInfo[0]), "=b"(cpuInfo[1]), "=c"(cpuInfo[2]), "=d"(cpuInfo[3]) \ + : "0"(function_id), "2"(subFunction_id) + ); +} +#endif // __cpuidex +#endif // HOST_X86 || HOST_AMD64 + +#ifdef HOST_X86 // // *********************************************************************************** @@ -1468,12 +1497,6 @@ typedef struct _KNONVOLATILE_CONTEXT_POINTERS { #elif defined(HOST_AMD64) -// MSVC directly defines intrinsics for __cpuid and __cpuidex matching the below signatures -// We define matching signatures for use on Unix platforms. 
- -extern "C" void __cpuid(int cpuInfo[4], int function_id); -extern "C" void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id); - // copied from winnt.h #define CONTEXT_AMD64 0x100000 diff --git a/src/coreclr/pal/src/thread/context.cpp b/src/coreclr/pal/src/thread/context.cpp index ec9135fbd1bdde..20964f591841a6 100644 --- a/src/coreclr/pal/src/thread/context.cpp +++ b/src/coreclr/pal/src/thread/context.cpp @@ -322,34 +322,6 @@ typedef int __ptrace_request; Xstate_ExtendedFeature Xstate_ExtendedFeatures[Xstate_ExtendedFeatures_Count]; #endif // XSTATE_SUPPORTED -#if defined(HOST_X86) || defined(HOST_AMD64) -#if !__has_builtin(__cpuid) -void __cpuid(int cpuInfo[4], int function_id) -{ - // Based on the Clang implementation provided in cpuid.h: - // https://github.com/llvm/llvm-project/blob/master/clang/lib/Headers/cpuid.h - - __asm(" cpuid\n" \ - : "=a"(cpuInfo[0]), "=b"(cpuInfo[1]), "=c"(cpuInfo[2]), "=d"(cpuInfo[3]) \ - : "0"(function_id) - ); -} -#endif - -#if !__has_builtin(__cpuidex) -void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id) -{ - // Based on the Clang implementation provided in cpuid.h: - // https://github.com/llvm/llvm-project/blob/master/clang/lib/Headers/cpuid.h - - __asm(" cpuid\n" \ - : "=a"(cpuInfo[0]), "=b"(cpuInfo[1]), "=c"(cpuInfo[2]), "=d"(cpuInfo[3]) \ - : "0"(function_id), "2"(subFunction_id) - ); -} -#endif -#endif // HOST_x86 || HOST_AMD64 - /*++ Function: CONTEXT_GetRegisters diff --git a/src/coreclr/vm/amd64/asmconstants.h b/src/coreclr/vm/amd64/asmconstants.h index 977fae7791619a..48efd1dcb15343 100644 --- a/src/coreclr/vm/amd64/asmconstants.h +++ b/src/coreclr/vm/amd64/asmconstants.h @@ -285,12 +285,12 @@ ASMCONSTANTS_C_ASSERT(OFFSETOF__MethodDesc__m_wFlags == offsetof(MethodDesc, m_w ASMCONSTANTS_C_ASSERT(OFFSETOF__VASigCookie__pNDirectILStub == offsetof(VASigCookie, pNDirectILStub)); -#ifdef UNIX_AMD64_ABI +#if defined(UNIX_AMD64_ABI) && !defined(HOST_WINDOWS) // Expression is too complicated, is currently: // (8*6 + 4*2 + 2*6 + 4 + 8*6 + 8*16 + 8 + /*XMM_SAVE_AREA32*/(2*2 + 1*2 + 2 + 4 + 2*2 + 4 + 2*2 + 4*2 + 16*8 + 16*16 + 1*96) + 26*16 + 8 + 8*5 + /*XSTATE*/ + 8 + 8 + /*XSTATE_AVX*/ 16*16 + /*XSTATE_AVX512_KMASK*/ 8*8 + /*XSTATE_AVX512_ZMM_H*/ 32*16 + 8*4 + /*XSTATE_AVX512_ZMM*/ 64*16) #define SIZEOF__CONTEXT (3136) #else - // Expression is too complicated, is currently: +// Expression is too complicated, is currently: // (8*6 + 4*2 + 2*6 + 4 + 8*6 + 8*16 + 8 + /*XMM_SAVE_AREA32*/(2*2 + 1*2 + 2 + 4 + 2*2 + 4 + 2*2 + 4*2 + 16*8 + 16*16 + 1*96) + 26*16 + 8 + 8*5) #define SIZEOF__CONTEXT (1232) #endif @@ -445,7 +445,7 @@ ASMCONSTANTS_C_ASSERT(OFFSETOF__CONTEXT__Xmm15 ASMCONSTANTS_C_ASSERT(OFFSETOF__CONTEXT__VectorRegister == offsetof(CONTEXT, VectorRegister[0])); -#ifdef UNIX_AMD64_ABI +#if defined(UNIX_AMD64_ABI) && !defined(HOST_WINDOWS) #define SIZEOF__FaultingExceptionFrame (0x40 + SIZEOF__CONTEXT) #else #define SIZEOF__FaultingExceptionFrame (0x20 + SIZEOF__CONTEXT) From 3875cfac77fc379a3ec6e6c80bd067f5c309ef59 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Thu, 23 Mar 2023 07:20:04 -0700 Subject: [PATCH 09/16] Ensure an assert checks the right bit --- src/coreclr/pal/src/include/pal/context.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/pal/src/include/pal/context.h b/src/coreclr/pal/src/include/pal/context.h index 46a1ef61824d4e..125ba0fa5ff392 100644 --- a/src/coreclr/pal/src/include/pal/context.h +++ b/src/coreclr/pal/src/include/pal/context.h @@ -482,7 +482,7 @@ inline void 
*FPREG_Xstate_ExtendedFeature(const ucontext_t *uc, uint32_t *sz, ui _ASSERTE(static_cast(cpuidInfo[CPUID_EAX]) >= 0x0D); __cpuid(cpuidInfo, 0x0000000D); - _ASSERTE((cpuidInfo[CPUID_EAX] & featureIndex) != 0); + _ASSERTE((cpuidInfo[CPUID_EAX] & (1 << featureIndex)) != 0); #endif // _DEBUG __cpuidex(cpuidInfo, 0x0000000D, static_cast(featureIndex)); From d6664dda2dae16b954c6b85d56f764b2c8d68f82 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Thu, 23 Mar 2023 07:23:03 -0700 Subject: [PATCH 10/16] Fixing an assert on OSX to account for general XSTATE tracking --- src/coreclr/pal/src/include/pal/context.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/pal/src/include/pal/context.h b/src/coreclr/pal/src/include/pal/context.h index 125ba0fa5ff392..e86f8693ea1c92 100644 --- a/src/coreclr/pal/src/include/pal/context.h +++ b/src/coreclr/pal/src/include/pal/context.h @@ -841,7 +841,7 @@ inline void *FPREG_Xstate_Ymmh(const ucontext_t *uc, uint32_t *sz) inline bool FPREG_HasAvx512Registers(const ucontext_t *uc) { - _ASSERTE(uc->uc_mcsize == sizeof(_STRUCT_MCONTEXT_AVX512_64)); + _ASSERTE((uc->uc_mcsize == sizeof(_STRUCT_MCONTEXT_AVX64)) || (uc->uc_mcsize == sizeof(_STRUCT_MCONTEXT_AVX512_64))); return (uc->uc_mcsize == sizeof(_STRUCT_MCONTEXT_AVX512_64)); } From ac794a3392c8d7be47f447a77d6184b97f6a0299 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Thu, 23 Mar 2023 07:30:17 -0700 Subject: [PATCH 11/16] Addressing PR feedback --- src/coreclr/pal/src/include/pal/context.h | 54 +++++++++++------------ 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/src/coreclr/pal/src/include/pal/context.h b/src/coreclr/pal/src/include/pal/context.h index e86f8693ea1c92..2399a6fd9db7e7 100644 --- a/src/coreclr/pal/src/include/pal/context.h +++ b/src/coreclr/pal/src/include/pal/context.h @@ -359,19 +359,19 @@ using asm_sigcontext::_xstate; // the processor xstate bit vector register, so the value is OS independent. 
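
Since the XFEATURE masks below are derived from the XSTATE_* component indices, the relationship is easy to sanity-check in a few lines (a standalone sketch; the constants mirror the pal.h definitions earlier in this series):

#include <cstdint>
#include <cstdio>

// Mirror of the XSTATE_* component indices defined in pal.h by this series
constexpr int XSTATE_AVX512_KMASK = 5;   // opmask registers k0-k7
constexpr int XSTATE_AVX512_ZMM_H = 6;   // upper 256 bits of zmm0-zmm15
constexpr int XSTATE_AVX512_ZMM   = 7;   // full zmm16-zmm31

constexpr uint64_t XSTATE_MASK_AVX512 =
    (1ull << XSTATE_AVX512_KMASK) |
    (1ull << XSTATE_AVX512_ZMM_H) |
    (1ull << XSTATE_AVX512_ZMM);

// Three adjacent component bits: 0b1110_0000, matching the 0xE0 literal
// that appears later in the series
static_assert(XSTATE_MASK_AVX512 == 0xE0, "AVX-512 spans components 5, 6 and 7");

int main()
{
    printf("XSTATE_MASK_AVX512 = 0x%llx\n",
           static_cast<unsigned long long>(XSTATE_MASK_AVX512));
    return 0;
}
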
#ifndef XFEATURE_MASK_YMM -#define XFEATURE_MASK_YMM (1 << 2) +#define XFEATURE_MASK_YMM (1 << XSTATE_AVX) #endif // XFEATURE_MASK_YMM #ifndef XFEATURE_MASK_OPMASK -#define XFEATURE_MASK_OPMASK (1 << 5) +#define XFEATURE_MASK_OPMASK (1 << XSTATE_AVX512_KMASK) #endif // XFEATURE_MASK_OPMASK #ifndef XFEATURE_MASK_ZMM_Hi256 -#define XFEATURE_MASK_ZMM_Hi256 (1 << 6) +#define XFEATURE_MASK_ZMM_Hi256 (1 << XSTATE_AVX512_ZMM_H) #endif // XFEATURE_MASK_ZMM_Hi256 #ifndef XFEATURE_MASK_Hi16_ZMM -#define XFEATURE_MASK_Hi16_ZMM (1 << 7) +#define XFEATURE_MASK_Hi16_ZMM (1 << XSTATE_AVX512_ZMM) #endif // XFEATURE_MASK_Hi16_ZMM #ifndef XFEATURE_MASK_AVX512 @@ -458,9 +458,9 @@ inline bool FPREG_HasYmmRegisters(const ucontext_t *uc) return (FPREG_FpxSwBytes_xfeatures(uc) & XFEATURE_MASK_YMM) == XFEATURE_MASK_YMM; } -inline void *FPREG_Xstate_ExtendedFeature(const ucontext_t *uc, uint32_t *sz, uint32_t featureIndex) +inline void *FPREG_Xstate_ExtendedFeature(const ucontext_t *uc, uint32_t *featureSize, uint32_t featureIndex) { - _ASSERTE(sz != nullptr); + _ASSERTE(featureSize != nullptr); _ASSERTE(featureIndex < (sizeof(Xstate_ExtendedFeatures) / sizeof(Xstate_ExtendedFeature))); _ASSERT(FPREG_Xstate_ExtendedStateArea_Offset == 576); @@ -496,14 +496,14 @@ inline void *FPREG_Xstate_ExtendedFeature(const ucontext_t *uc, uint32_t *sz, ui extendedFeature->initialized = true; } - *sz = extendedFeature->size; + *featureSize = extendedFeature->size; return (FPREG_Xstate_ExtendedStateArea(uc) + extendedFeature->offset); } -inline void *FPREG_Xstate_Ymmh(const ucontext_t *uc, uint32_t *sz) +inline void *FPREG_Xstate_Ymmh(const ucontext_t *uc, uint32_t *featureSize) { _ASSERTE(FPREG_HasYmmRegisters(uc)); - return FPREG_Xstate_ExtendedFeature(uc, sz, 2); + return FPREG_Xstate_ExtendedFeature(uc, featureSize, XSTATE_AVX); } inline bool FPREG_HasAvx512Registers(const ucontext_t *uc) @@ -522,22 +522,22 @@ inline bool FPREG_HasAvx512Registers(const ucontext_t *uc) return true; } -inline void *FPREG_Xstate_Opmask(const ucontext_t *uc, uint32_t *sz) +inline void *FPREG_Xstate_Opmask(const ucontext_t *uc, uint32_t *featureSize) { _ASSERTE(FPREG_HasAvx512Registers(uc)); - return FPREG_Xstate_ExtendedFeature(uc, sz, 5); + return FPREG_Xstate_ExtendedFeature(uc, featureSize, XSTATE_AVX512_KMASK); } -inline void *FPREG_Xstate_ZmmHi256(const ucontext_t *uc, uint32_t *sz) +inline void *FPREG_Xstate_ZmmHi256(const ucontext_t *uc, uint32_t *featureSize) { _ASSERTE(FPREG_HasAvx512Registers(uc)); - return FPREG_Xstate_ExtendedFeature(uc, sz, 6); + return FPREG_Xstate_ExtendedFeature(uc, featureSize, XSTATE_AVX512_ZMM_H); } -inline void *FPREG_Xstate_Hi16Zmm(const ucontext_t *uc, uint32_t *sz) +inline void *FPREG_Xstate_Hi16Zmm(const ucontext_t *uc, uint32_t *featureSize) { _ASSERTE(FPREG_HasAvx512Registers(uc)); - return FPREG_Xstate_ExtendedFeature(uc, sz, 7); + return FPREG_Xstate_ExtendedFeature(uc, featureSize, XSTATE_AVX512_ZMM); } #endif // XSTATE_SUPPORTED @@ -830,12 +830,12 @@ inline bool FPREG_HasYmmRegisters(const ucontext_t *uc) static_assert_no_msg(offsetof(_STRUCT_X86_AVX_STATE64, __fpu_ymmh0) == offsetof(_STRUCT_X86_AVX512_STATE64, __fpu_ymmh0)); -inline void *FPREG_Xstate_Ymmh(const ucontext_t *uc, uint32_t *sz) +inline void *FPREG_Xstate_Ymmh(const ucontext_t *uc, uint32_t *featureSize) { _ASSERTE(FPREG_HasYmmRegisters(uc)); - _ASSERTE(sz != nullptr); + _ASSERTE(featureSize != nullptr); - *sz = sizeof(_STRUCT_XMM_REG) * 16; + *featureSize = sizeof(_STRUCT_XMM_REG) * 16; return 
reinterpret_cast(&((_STRUCT_X86_AVX_STATE64&)FPSTATE(uc)).__fpu_ymmh0); } @@ -845,30 +845,30 @@ inline bool FPREG_HasAvx512Registers(const ucontext_t *uc) return (uc->uc_mcsize == sizeof(_STRUCT_MCONTEXT_AVX512_64)); } -inline void *FPREG_Xstate_Opmask(const ucontext_t *uc, uint32_t *sz) +inline void *FPREG_Xstate_Opmask(const ucontext_t *uc, uint32_t *featureSize) { _ASSERTE(FPREG_HasAvx512Registers(uc)); - _ASSERTE(sz != nullptr); + _ASSERTE(featureSize != nullptr); - *sz = sizeof(_STRUCT_OPMASK_REG) * 8; + *featureSize = sizeof(_STRUCT_OPMASK_REG) * 8; return reinterpret_cast(&((_STRUCT_X86_AVX512_STATE64&)FPSTATE(uc)).__fpu_k0); } -inline void *FPREG_Xstate_ZmmHi256(const ucontext_t *uc, uint32_t *sz) +inline void *FPREG_Xstate_ZmmHi256(const ucontext_t *uc, uint32_t *featureSize) { _ASSERTE(FPREG_HasAvx512Registers(uc)); - _ASSERTE(sz != nullptr); + _ASSERTE(featureSize != nullptr); - *sz = sizeof(_STRUCT_YMM_REG) * 16; + *featureSize = sizeof(_STRUCT_YMM_REG) * 16; return reinterpret_cast(&((_STRUCT_X86_AVX512_STATE64&)FPSTATE(uc)).__fpu_zmmh0); } -inline void *FPREG_Xstate_Hi16Zmm(const ucontext_t *uc, uint32_t *sz) +inline void *FPREG_Xstate_Hi16Zmm(const ucontext_t *uc, uint32_t *featureSize) { _ASSERTE(FPREG_HasAvx512Registers(uc)); - _ASSERTE(sz != nullptr); + _ASSERTE(featureSize != nullptr); - *sz = sizeof(_STRUCT_ZMM_REG) * 16; + *featureSize = sizeof(_STRUCT_ZMM_REG) * 16; return reinterpret_cast(&((_STRUCT_X86_AVX512_STATE64&)FPSTATE(uc)).__fpu_zmm16); } #else //TARGET_OSX From 9c841a072326e0965a7ef9416c8318bed80b1ef5 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Thu, 23 Mar 2023 09:33:43 -0700 Subject: [PATCH 12/16] Various fixups and bug fixes to appease CI --- src/coreclr/gc/vxsort/isa_detection.cpp | 4 +- src/coreclr/nativeaot/Runtime/PalRedhawk.h | 3 + .../nativeaot/Runtime/unix/PalRedhawkUnix.cpp | 41 ++++++- src/coreclr/pal/inc/pal.h | 84 +++++++------- src/coreclr/pal/src/arch/amd64/asmconstants.h | 5 +- src/coreclr/pal/src/arch/amd64/context2.S | 26 +++-- src/coreclr/pal/src/include/pal/context.h | 14 ++- src/coreclr/pal/src/thread/context.cpp | 104 ++++++++++++++---- src/coreclr/vm/amd64/asmconstants.h | 8 +- src/coreclr/vm/amd64/unixstubs.cpp | 31 ++++++ 10 files changed, 228 insertions(+), 92 deletions(-) diff --git a/src/coreclr/gc/vxsort/isa_detection.cpp b/src/coreclr/gc/vxsort/isa_detection.cpp index 2a60ea01207a9f..1dcb7913a86961 100644 --- a/src/coreclr/gc/vxsort/isa_detection.cpp +++ b/src/coreclr/gc/vxsort/isa_detection.cpp @@ -77,11 +77,11 @@ SupportedISA DetermineSupportedISA() // get xcr0 register DWORD64 xcr0 = _xgetbv(0); - // get OS XState info + // get OS XState info DWORD64 FeatureMask = GetEnabledXStateFeaturesHelper(); // get processor extended feature flag info - __cpuid(reg, 7); + __cpuidex(reg, 7, 0); // check if all of AVX2, AVX512F and AVX512DQ are supported by both processor and OS if ((reg[EBX] & (AVX2 | AVX512F | AVX512DQ)) == (AVX2 | AVX512F | AVX512DQ) && diff --git a/src/coreclr/nativeaot/Runtime/PalRedhawk.h b/src/coreclr/nativeaot/Runtime/PalRedhawk.h index 44ba7ea15b0ec8..986ad2ac2c89f9 100644 --- a/src/coreclr/nativeaot/Runtime/PalRedhawk.h +++ b/src/coreclr/nativeaot/Runtime/PalRedhawk.h @@ -795,6 +795,9 @@ REDHAWK_PALIMPORT int32_t __cdecl _stricmp(const char *string1, const char *stri #ifdef TARGET_UNIX // MSVC directly defines intrinsics for __cpuid and __cpuidex matching the below signatures // We define matching signatures for use on Unix platforms. 
+// +// IMPORTANT: Unlike MSVC, Unix does not explicitly zero ECX for __cpuid + REDHAWK_PALIMPORT void __cpuid(int cpuInfo[4], int function_id); REDHAWK_PALIMPORT void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id); #else diff --git a/src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp b/src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp index cfe2502987c675..08cdad4c6d6ff6 100644 --- a/src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp +++ b/src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp @@ -1031,7 +1031,7 @@ static void ActivationHandler(int code, siginfo_t* siginfo, void* context) #endif )) { - // Make sure that errno is not modified + // Make sure that errno is not modified int savedErrNo = errno; g_pHijackCallback((NATIVE_CONTEXT*)context, NULL); errno = savedErrNo; @@ -1275,12 +1275,16 @@ extern "C" uint64_t PalGetCurrentThreadIdForLogging() } #if defined(HOST_X86) || defined(HOST_AMD64) +// MSVC directly defines intrinsics for __cpuid and __cpuidex matching the below signatures +// We define matching signatures for use on Unix platforms. +// +// IMPORTANT: Unlike MSVC, Unix does not explicitly zero ECX for __cpuid #if !__has_builtin(__cpuid) REDHAWK_PALEXPORT void __cpuid(int cpuInfo[4], int function_id) { // Based on the Clang implementation provided in cpuid.h: - // https://github.com/llvm/llvm-project/blob/master/clang/lib/Headers/cpuid.h + // https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/cpuid.h __asm(" cpuid\n" \ : "=a"(cpuInfo[0]), "=b"(cpuInfo[1]), "=c"(cpuInfo[2]), "=d"(cpuInfo[3]) \ @@ -1293,7 +1297,7 @@ REDHAWK_PALEXPORT void __cpuid(int cpuInfo[4], int function_id) REDHAWK_PALEXPORT void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id) { // Based on the Clang implementation provided in cpuid.h: - // https://github.com/llvm/llvm-project/blob/master/clang/lib/Headers/cpuid.h + // https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/cpuid.h __asm(" cpuid\n" \ : "=a"(cpuInfo[0]), "=b"(cpuInfo[1]), "=c"(cpuInfo[2]), "=d"(cpuInfo[3]) \ @@ -1314,8 +1318,38 @@ REDHAWK_PALEXPORT uint32_t REDHAWK_PALAPI xmmYmmStateSupport() return ((eax & 0x06) == 0x06) ? 1 : 0; } +#ifndef XSTATE_MASK_AVX512 +#define XSTATE_MASK_AVX512 (0xE0) /* 0b1110_0000 */ +#endif // XSTATE_MASK_AVX512 + REDHAWK_PALEXPORT uint32_t REDHAWK_PALAPI avx512StateSupport() { +#if defined(TARGET_APPLE) + // MacOS has specialized behavior where it reports AVX512 support but doesnt + // actually enable AVX512 until the first instruction is executed and does so + // on a per thread basis. It does this by catching the faulting instruction and + // checking for the EVEX encoding. The kmov instructions, despite being part + // of the AVX512 instruction set are VEX encoded and dont trigger the enablement + // + // See https://github.com/apple/darwin-xnu/blob/main/osfmk/i386/fpu.c#L174 + + int cpuidInfo[4]; + + const int CPUID_EAX = 0; + const int CPUID_EBX = 1; + const int CPUID_ECX = 2; + const int CPUID_EDX = 3; + + __cpuid(cpuidInfo, 0x00000000); + + if (static_cast(cpuidInfo[CPUID_EAX]) < 0x0D) + { + return false; + } + + __cpuidex(cpuidInfo, 0x0000000D, 0x00000000); + return (cpuidInfo[CPUID_EAX] & XSTATE_MASK_AVX512) == XSTATE_MASK_AVX512; +#else DWORD eax; __asm(" xgetbv\n" \ : "=a"(eax) /*output in eax*/\ @@ -1324,6 +1358,7 @@ REDHAWK_PALEXPORT uint32_t REDHAWK_PALAPI avx512StateSupport() ); // check OS has enabled XMM, YMM and ZMM state support return ((eax & 0xE6) == 0x0E6) ? 
1 : 0; +#endif } #endif // defined(HOST_X86) || defined(HOST_AMD64) diff --git a/src/coreclr/pal/inc/pal.h b/src/coreclr/pal/inc/pal.h index ff818531ee64ee..37053c772215d8 100644 --- a/src/coreclr/pal/inc/pal.h +++ b/src/coreclr/pal/inc/pal.h @@ -1318,6 +1318,8 @@ QueueUserAPC( #if defined(HOST_X86) || defined(HOST_AMD64) // MSVC directly defines intrinsics for __cpuid and __cpuidex matching the below signatures // We define matching signatures for use on Unix platforms. +// +// IMPORTANT: Unlike MSVC, Unix does not explicitly zero ECX for __cpuid #if __has_builtin(__cpuid) extern "C" void __cpuid(int cpuInfo[4], int function_id); @@ -1325,7 +1327,7 @@ extern "C" void __cpuid(int cpuInfo[4], int function_id); inline void __cpuid(int cpuInfo[4], int function_id) { // Based on the Clang implementation provided in cpuid.h: - // https://github.com/llvm/llvm-project/blob/master/clang/lib/Headers/cpuid.h + // https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/cpuid.h __asm(" cpuid\n" \ : "=a"(cpuInfo[0]), "=b"(cpuInfo[1]), "=c"(cpuInfo[2]), "=d"(cpuInfo[3]) \ @@ -1340,7 +1342,7 @@ extern "C" void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id); inline void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id) { // Based on the Clang implementation provided in cpuid.h: - // https://github.com/llvm/llvm-project/blob/master/clang/lib/Headers/cpuid.h + // https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/cpuid.h __asm(" cpuid\n" \ : "=a"(cpuInfo[0]), "=b"(cpuInfo[1]), "=c"(cpuInfo[2]), "=d"(cpuInfo[3]) \ @@ -1535,15 +1537,15 @@ typedef struct DECLSPEC_ALIGN(16) _M128A { LONGLONG High; } M128A, *PM128A; -typedef struct DECLSPEC_ALIGN(32) _M256A { +typedef struct DECLSPEC_ALIGN(16) _M256 { M128A Low; M128A High; -} M256A, *PM256A; +} M256, *PM256; -typedef struct DECLSPEC_ALIGN(64) _M512A { - M256A Low; - M256A High; -} M512A, *PM512A; +typedef struct DECLSPEC_ALIGN(16) _M512 { + M256 Low; + M256 High; +} M512, *PM512; typedef struct _XMM_SAVE_AREA32 { WORD ControlWord; @@ -1720,44 +1722,42 @@ typedef struct DECLSPEC_ALIGN(16) _CONTEXT { // XSTATE_AVX512_ZMM_H struct { - M256A Zmm0H; - M256A Zmm1H; - M256A Zmm2H; - M256A Zmm3H; - M256A Zmm4H; - M256A Zmm5H; - M256A Zmm6H; - M256A Zmm7H; - M256A Zmm8H; - M256A Zmm9H; - M256A Zmm10H; - M256A Zmm11H; - M256A Zmm12H; - M256A Zmm13H; - M256A Zmm14H; - M256A Zmm15H; + M256 Zmm0H; + M256 Zmm1H; + M256 Zmm2H; + M256 Zmm3H; + M256 Zmm4H; + M256 Zmm5H; + M256 Zmm6H; + M256 Zmm7H; + M256 Zmm8H; + M256 Zmm9H; + M256 Zmm10H; + M256 Zmm11H; + M256 Zmm12H; + M256 Zmm13H; + M256 Zmm14H; + M256 Zmm15H; }; - DWORD64 XStateReserved1[4]; - // XSTATE_AVX512_ZMM struct { - M512A Zmm16; - M512A Zmm17; - M512A Zmm18; - M512A Zmm19; - M512A Zmm20; - M512A Zmm21; - M512A Zmm22; - M512A Zmm23; - M512A Zmm24; - M512A Zmm25; - M512A Zmm26; - M512A Zmm27; - M512A Zmm28; - M512A Zmm29; - M512A Zmm30; - M512A Zmm31; + M512 Zmm16; + M512 Zmm17; + M512 Zmm18; + M512 Zmm19; + M512 Zmm20; + M512 Zmm21; + M512 Zmm22; + M512 Zmm23; + M512 Zmm24; + M512 Zmm25; + M512 Zmm26; + M512 Zmm27; + M512 Zmm28; + M512 Zmm29; + M512 Zmm30; + M512 Zmm31; }; } CONTEXT, *PCONTEXT, *LPCONTEXT; diff --git a/src/coreclr/pal/src/arch/amd64/asmconstants.h b/src/coreclr/pal/src/arch/amd64/asmconstants.h index 5a5184b1ed95f0..d5a72cf6eda23a 100644 --- a/src/coreclr/pal/src/arch/amd64/asmconstants.h +++ b/src/coreclr/pal/src/arch/amd64/asmconstants.h @@ -89,9 +89,8 @@ #define CONTEXT_XStateReserved0 CONTEXT_XStateFeaturesMask+8 #define CONTEXT_Ymm0H 
CONTEXT_XStateReserved0+8 #define CONTEXT_KMask0 CONTEXT_Ymm0H+(16*16) -#define CONTEXT_Zmm0H CONTEXT_Ymm0H+(8*8) -#define CONTEXT_XStateReserved1 CONTEXT_Zmm0H+(32*16) -#define CONTEXT_Zmm16 CONTEXT_XStateReserved1+(8*4) +#define CONTEXT_Zmm0H CONTEXT_KMask0+(8*8) +#define CONTEXT_Zmm16 CONTEXT_Zmm0H+(32*16) #define CONTEXT_Size CONTEXT_Zmm16+(64*16) #else // HOST_64BIT diff --git a/src/coreclr/pal/src/arch/amd64/context2.S b/src/coreclr/pal/src/arch/amd64/context2.S index 64ef2b524465a7..e4ab8ac1b19c3c 100644 --- a/src/coreclr/pal/src/arch/amd64/context2.S +++ b/src/coreclr/pal/src/arch/amd64/context2.S @@ -128,15 +128,13 @@ LOCAL_LABEL(Done_Restore_CONTEXT_FLOATING_POINT): test BYTE PTR [rdi + CONTEXT_XStateFeaturesMask], XSTATE_MASK_AVX512 je LOCAL_LABEL(Done_Restore_CONTEXT_XSTATE) - // Restore the Opmask state - kmovq k0, qword ptr [rdi + (CONTEXT_KMask0 + 0 * 8)] - kmovq k1, qword ptr [rdi + (CONTEXT_KMask0 + 1 * 8)] - kmovq k2, qword ptr [rdi + (CONTEXT_KMask0 + 2 * 8)] - kmovq k3, qword ptr [rdi + (CONTEXT_KMask0 + 3 * 8)] - kmovq k4, qword ptr [rdi + (CONTEXT_KMask0 + 4 * 8)] - kmovq k5, qword ptr [rdi + (CONTEXT_KMask0 + 5 * 8)] - kmovq k6, qword ptr [rdi + (CONTEXT_KMask0 + 6 * 8)] - kmovq k7, qword ptr [rdi + (CONTEXT_KMask0 + 7 * 8)] + // MacOS has specialized behavior where it reports AVX512 support but doesnt + // actually enable AVX512 until the first instruction is executed and does so + // on a per thread basis. It does this by catching the faulting instruction and + // checking for the EVEX encoding. The kmov instructions, despite being part + // of the AVX512 instruction set are VEX encoded and dont trigger the enablement + // + // See https://github.com/apple/darwin-xnu/blob/main/osfmk/i386/fpu.c#L174 // Restore the ZMM_Hi256 state vinsertf64x4 zmm0, zmm0, ymmword ptr [rdi + (CONTEXT_Zmm0H + 0 * 32)], 1 @@ -174,6 +172,16 @@ LOCAL_LABEL(Done_Restore_CONTEXT_FLOATING_POINT): vmovups zmm30, zmmword ptr [rdi + (CONTEXT_Zmm16 + 14 * 64)] vmovups zmm31, zmmword ptr [rdi + (CONTEXT_Zmm16 + 15 * 64)] + // Restore the Opmask state + kmovq k0, qword ptr [rdi + (CONTEXT_KMask0 + 0 * 8)] + kmovq k1, qword ptr [rdi + (CONTEXT_KMask0 + 1 * 8)] + kmovq k2, qword ptr [rdi + (CONTEXT_KMask0 + 2 * 8)] + kmovq k3, qword ptr [rdi + (CONTEXT_KMask0 + 3 * 8)] + kmovq k4, qword ptr [rdi + (CONTEXT_KMask0 + 4 * 8)] + kmovq k5, qword ptr [rdi + (CONTEXT_KMask0 + 5 * 8)] + kmovq k6, qword ptr [rdi + (CONTEXT_KMask0 + 6 * 8)] + kmovq k7, qword ptr [rdi + (CONTEXT_KMask0 + 7 * 8)] + LOCAL_LABEL(Done_Restore_CONTEXT_XSTATE): test BYTE PTR [rdi + CONTEXT_ContextFlags], CONTEXT_CONTROL diff --git a/src/coreclr/pal/src/include/pal/context.h b/src/coreclr/pal/src/include/pal/context.h index 2399a6fd9db7e7..0295481632d1cb 100644 --- a/src/coreclr/pal/src/include/pal/context.h +++ b/src/coreclr/pal/src/include/pal/context.h @@ -54,6 +54,10 @@ using asm_sigcontext::_xstate; #include #endif // !HAVE_MACH_EXCEPTIONS else +#if defined(XSTATE_SUPPORTED) || (defined(HOST_64BIT) && defined(TARGET_OSX)) +bool Xstate_IsAvx512Supported(); +#endif // XSTATE_SUPPORTED || (HOST_64BIT && TARGET_OSX) + #ifdef HOST_S390X #define MCREG_PSWMask(mc) ((mc).psw.mask) @@ -400,9 +404,7 @@ struct Xstate_ExtendedFeature uint32_t size; }; -// XFEATURE_Hi16_ZMM is currently the largest we need and is index 7 -#define Xstate_ExtendedFeatures_Count (7) - +#define Xstate_ExtendedFeatures_Count (XSTATE_AVX512_ZMM + 1) extern Xstate_ExtendedFeature Xstate_ExtendedFeatures[Xstate_ExtendedFeatures_Count]; inline _fpx_sw_bytes 
*FPREG_FpxSwBytes(const ucontext_t *uc) @@ -481,14 +483,14 @@ inline void *FPREG_Xstate_ExtendedFeature(const ucontext_t *uc, uint32_t *featur __cpuid(cpuidInfo, 0x00000000); _ASSERTE(static_cast(cpuidInfo[CPUID_EAX]) >= 0x0D); - __cpuid(cpuidInfo, 0x0000000D); + __cpuidex(cpuidInfo, 0x0000000D, 0x00000000); _ASSERTE((cpuidInfo[CPUID_EAX] & (1 << featureIndex)) != 0); #endif // _DEBUG __cpuidex(cpuidInfo, 0x0000000D, static_cast(featureIndex)); _ASSERTE(static_cast(cpuidInfo[CPUID_EAX]) > 0); - _ASSERTE(static_cast(cpuidInfo[CPUID_EBX]) > FPREG_Xstate_ExtendedStateArea_Offset); + _ASSERTE(static_cast(cpuidInfo[CPUID_EBX]) >= FPREG_Xstate_ExtendedStateArea_Offset); extendedFeature->size = static_cast(cpuidInfo[CPUID_EAX]); extendedFeature->offset = static_cast(cpuidInfo[CPUID_EBX] - FPREG_Xstate_ExtendedStateArea_Offset); @@ -519,7 +521,7 @@ inline bool FPREG_HasAvx512Registers(const ucontext_t *uc) } _ASSERTE(FPREG_HasYmmRegisters(uc)); - return true; + return Xstate_IsAvx512Supported(); } inline void *FPREG_Xstate_Opmask(const ucontext_t *uc, uint32_t *featureSize) diff --git a/src/coreclr/pal/src/thread/context.cpp b/src/coreclr/pal/src/thread/context.cpp index 20964f591841a6..c802b9470e6f1e 100644 --- a/src/coreclr/pal/src/thread/context.cpp +++ b/src/coreclr/pal/src/thread/context.cpp @@ -316,6 +316,60 @@ typedef int __ptrace_request; ASSIGN_CONTROL_REGS \ ASSIGN_INTEGER_REGS \ +#if defined(XSTATE_SUPPORTED) || (defined(HOST_64BIT) && defined(TARGET_OSX)) +bool Xstate_IsAvx512Supported() +{ + static int Xstate_Avx512Supported = -1; + + if (Xstate_Avx512Supported == -1) + { + int cpuidInfo[4]; + + const int CPUID_EAX = 0; + const int CPUID_EBX = 1; + const int CPUID_ECX = 2; + const int CPUID_EDX = 3; + +#ifdef _DEBUG + // We should only be calling this function if we know the extended feature exists + __cpuid(cpuidInfo, 0x00000000); + _ASSERTE(static_cast(cpuidInfo[CPUID_EAX]) >= 0x0D); +#endif // _DEBUG + + __cpuidex(cpuidInfo, 0x0000000D, 0x00000000); + + if ((cpuidInfo[CPUID_EAX] & XSTATE_MASK_AVX512) == XSTATE_MASK_AVX512) + { + // Knight's Landing and Knight's Mill shipped without all 5 of the "baseline" + // AVX-512 ISAs that are required by x86-64-v4. Specifically they do not include + // BW, DQ, or VL. RyuJIT currently requires all 5 ISAs to be present so we will + // only enable Avx512 context save/restore when all exist. This requires us to + // query which ISAs are actually supported to ensure they're all present. 
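
The added lines that follow perform exactly this leaf-7 probe; as a free-standing summary (the helper name HasBaselineAvx512 is illustrative only, and the bit positions follow the comments in the patch):

#include <cstdint>

extern "C" void __cpuidex(int cpuInfo[4], int function_id, int subFunction_id);

bool HasBaselineAvx512()
{
    int info[4];

    // Leaf 7, sub-leaf 0: structured extended feature flags, reported in EBX
    __cpuidex(info, 0x00000007, 0x00000000);

    const uint32_t required = (1u << 16)   // AVX512F
                            | (1u << 17)   // AVX512DQ
                            | (1u << 28)   // AVX512CD
                            | (1u << 30)   // AVX512BW
                            | (1u << 31);  // AVX512VL

    // All five must be present before the runtime saves/restores AVX-512 state
    return (static_cast<uint32_t>(info[1]) & required) == required;
}
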
+
+            __cpuidex(cpuidInfo, 0x00000007, 0x00000000);
+
+            const int requiredAvx512Flags = (1 << 16) |    // AVX512F
+                                            (1 << 17) |    // AVX512DQ
+                                            (1 << 28) |    // AVX512CD
+                                            (1 << 30) |    // AVX512BW
+                                            (1 << 31);     // AVX512VL
+
+            if ((cpuidInfo[CPUID_EBX] & requiredAvx512Flags) == requiredAvx512Flags)
+            {
+                Xstate_Avx512Supported = 1;
+            }
+        }
+
+        if (Xstate_Avx512Supported == -1)
+        {
+            Xstate_Avx512Supported = 0;
+        }
+    }
+
+    return Xstate_Avx512Supported == 1;
+}
+#endif // XSTATE_SUPPORTED || (HOST_64BIT && TARGET_OSX)
+
 #if !HAVE_MACH_EXCEPTIONS

 #ifdef XSTATE_SUPPORTED
@@ -706,12 +760,12 @@ void CONTEXTToNativeContext(CONST CONTEXT *lpContext, native_context_t *native)
             memcpy_s(dest, sizeof(DWORD64) * 8, &lpContext->KMask0, sizeof(DWORD64) * 8);

             dest = FPREG_Xstate_ZmmHi256(native, &size);
-            _ASSERT(size == (sizeof(M256A) * 16));
-            memcpy_s(dest, sizeof(M256A) * 16, &lpContext->Zmm0H, sizeof(M256A) * 16);
+            _ASSERT(size == (sizeof(M256) * 16));
+            memcpy_s(dest, sizeof(M256) * 16, &lpContext->Zmm0H, sizeof(M256) * 16);

             dest = FPREG_Xstate_Hi16Zmm(native, &size);
-            _ASSERT(size == (sizeof(M512A) * 16));
-            memcpy_s(dest, sizeof(M512A) * 16, &lpContext->Zmm16, sizeof(M512A) * 16);
+            _ASSERT(size == (sizeof(M512) * 16));
+            memcpy_s(dest, sizeof(M512) * 16, &lpContext->Zmm16, sizeof(M512) * 16);
         }
     }
 }
@@ -899,12 +953,12 @@ void CONTEXTFromNativeContext(const native_context_t *native, LPCONTEXT lpContex
             memcpy_s(&lpContext->KMask0, sizeof(DWORD64) * 8, src, sizeof(DWORD64) * 8);

             src = FPREG_Xstate_ZmmHi256(native, &size);
-            _ASSERT(size == (sizeof(M256A) * 16));
-            memcpy_s(&lpContext->Zmm0H, sizeof(M256A) * 16, src, sizeof(M256A) * 16);
+            _ASSERT(size == (sizeof(M256) * 16));
+            memcpy_s(&lpContext->Zmm0H, sizeof(M256) * 16, src, sizeof(M256) * 16);

             src = FPREG_Xstate_Hi16Zmm(native, &size);
-            _ASSERT(size == (sizeof(M512A) * 16));
-            memcpy_s(&lpContext->Zmm16, sizeof(M512A) * 16, src, sizeof(M512A) * 16);
+            _ASSERT(size == (sizeof(M512) * 16));
+            memcpy_s(&lpContext->Zmm16, sizeof(M512) * 16, src, sizeof(M512) * 16);

             lpContext->XStateFeaturesMask |= XSTATE_MASK_AVX512;
         }
@@ -1266,23 +1320,29 @@ CONTEXT_GetThreadContextFromPort(

         x86_avx512_state64_t State;

-        StateFlavor = x86_AVX_STATE64;
-        StateCount = sizeof(x86_avx_state64_t) / sizeof(natural_t);
+        StateFlavor = x86_AVX512_STATE64;
+        StateCount = sizeof(x86_avx512_state64_t) / sizeof(natural_t);
         MachRet = thread_get_state(Port, StateFlavor, (thread_state_t)&State, &StateCount);
+
         if (MachRet != KERN_SUCCESS)
         {
-            // The AVX state is not available, try to get the AVX512 state.
-            StateFlavor = x86_AVX512_STATE64;
-            StateCount = sizeof(x86_avx512_state64_t) / sizeof(natural_t);
+            // The AVX512 state is not available, try to get the AVX state.
+            lpContext->XStateFeaturesMask &= ~XSTATE_MASK_AVX512;
+
+            StateFlavor = x86_AVX_STATE64;
+            StateCount = sizeof(x86_avx_state64_t) / sizeof(natural_t);
             MachRet = thread_get_state(Port, StateFlavor, (thread_state_t)&State, &StateCount);
+
             if (MachRet != KERN_SUCCESS)
             {
-                // Neither the AVX nor the AVX512 state is not available, try to get at least the FLOAT state.
+                // Neither the AVX512 nor the AVX state is available, try to get at least the FLOAT state.
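
The retry chain this hunk builds (AVX512, then AVX, then FLOAT) can be read in isolation as follows; this is a hedged sketch assuming an Apple SDK that exposes the AVX-512 thread-state flavors, and the hunk itself resumes below:

#include <mach/mach.h>
#include <mach/thread_status.h>

// Ask for the richest floating-point state first, then degrade. On Darwin the
// AVX512 and AVX layouts share their leading fields with the FLOAT layout, so
// one buffer can back all three requests.
kern_return_t GetRichestFloatState(thread_act_t thread, x86_avx512_state64_t* state)
{
    mach_msg_type_number_t count = x86_AVX512_STATE64_COUNT;
    kern_return_t kr = thread_get_state(thread, x86_AVX512_STATE64,
                                        (thread_state_t)state, &count);

    if (kr != KERN_SUCCESS)
    {
        count = x86_AVX_STATE64_COUNT;
        kr = thread_get_state(thread, x86_AVX_STATE64,
                              (thread_state_t)state, &count);
    }

    if (kr != KERN_SUCCESS)
    {
        count = x86_FLOAT_STATE64_COUNT;
        kr = thread_get_state(thread, x86_FLOAT_STATE64,
                              (thread_state_t)state, &count);
    }

    return kr;
}
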
+ lpContext->XStateFeaturesMask &= ~XSTATE_MASK_AVX; lpContext->ContextFlags &= ~(CONTEXT_XSTATE & CONTEXT_AREA_MASK); StateFlavor = x86_FLOAT_STATE64; StateCount = sizeof(x86_float_state64_t) / sizeof(natural_t); MachRet = thread_get_state(Port, StateFlavor, (thread_state_t)&State, &StateCount); + if (MachRet != KERN_SUCCESS) { // We were unable to get any floating point state. This case was observed on OSX with AVX512 capable processors. @@ -1364,13 +1424,16 @@ CONTEXT_GetThreadContextFromThreadState( { if (lpContext->ContextFlags & CONTEXT_XSTATE & CONTEXT_AREA_MASK) { - x86_avx512_state64_t *pState = (x86_avx512_state64_t *)threadState; + if (Xstate_IsAvx512Supported()) + { + x86_avx512_state64_t *pState = (x86_avx512_state64_t *)threadState; - memcpy(&lpContext->KMask0, &pState->__fpu_k0, sizeof(_STRUCT_OPMASK_REG) * 8); - memcpy(&lpContext->Zmm0H, &pState->__fpu_zmmh0, sizeof(_STRUCT_YMM_REG) * 16); - memcpy(&lpContext->Zmm16, &pState->__fpu_zmm16, sizeof(_STRUCT_ZMM_REG) * 16); + memcpy(&lpContext->KMask0, &pState->__fpu_k0, sizeof(_STRUCT_OPMASK_REG) * 8); + memcpy(&lpContext->Zmm0H, &pState->__fpu_zmmh0, sizeof(_STRUCT_YMM_REG) * 16); + memcpy(&lpContext->Zmm16, &pState->__fpu_zmm16, sizeof(_STRUCT_ZMM_REG) * 16); - lpContext->XStateFeaturesMask |= XSTATE_MASK_AVX512; + lpContext->XStateFeaturesMask |= XSTATE_MASK_AVX512; + } } // Intentional fall-through, the AVX512 states are supersets of the AVX state @@ -1529,7 +1592,6 @@ CONTEXT_SetThreadContextOnPort( if (lpContext->ContextFlags & CONTEXT_ALL_FLOATING & CONTEXT_AREA_MASK) { - #ifdef HOST_AMD64 #ifdef XSTATE_SUPPORTED // We're relying on the fact that the initial portion of @@ -1549,7 +1611,7 @@ CONTEXT_SetThreadContextOnPort( if ((lpContext->XStateFeaturesMask & XSTATE_MASK_AVX512) == XSTATE_MASK_AVX512) { StateFlavor = x86_AVX512_STATE64; - StateCount = sizeof(State) / sizeof(natural_t); + StateCount = sizeof(x86_avx512_state64_t) / sizeof(natural_t); } else { diff --git a/src/coreclr/vm/amd64/asmconstants.h b/src/coreclr/vm/amd64/asmconstants.h index 48efd1dcb15343..22082547234275 100644 --- a/src/coreclr/vm/amd64/asmconstants.h +++ b/src/coreclr/vm/amd64/asmconstants.h @@ -287,8 +287,8 @@ ASMCONSTANTS_C_ASSERT(OFFSETOF__VASigCookie__pNDirectILStub #if defined(UNIX_AMD64_ABI) && !defined(HOST_WINDOWS) // Expression is too complicated, is currently: -// (8*6 + 4*2 + 2*6 + 4 + 8*6 + 8*16 + 8 + /*XMM_SAVE_AREA32*/(2*2 + 1*2 + 2 + 4 + 2*2 + 4 + 2*2 + 4*2 + 16*8 + 16*16 + 1*96) + 26*16 + 8 + 8*5 + /*XSTATE*/ + 8 + 8 + /*XSTATE_AVX*/ 16*16 + /*XSTATE_AVX512_KMASK*/ 8*8 + /*XSTATE_AVX512_ZMM_H*/ 32*16 + 8*4 + /*XSTATE_AVX512_ZMM*/ 64*16) -#define SIZEOF__CONTEXT (3136) +// (8*6 + 4*2 + 2*6 + 4 + 8*6 + 8*16 + 8 + /*XMM_SAVE_AREA32*/(2*2 + 1*2 + 2 + 4 + 2*2 + 4 + 2*2 + 4*2 + 16*8 + 16*16 + 1*96) + 26*16 + 8 + 8*5 + /*XSTATE*/ + 8 + 8 + /*XSTATE_AVX*/ 16*16 + /*XSTATE_AVX512_KMASK*/ 8*8 + /*XSTATE_AVX512_ZMM_H*/ 32*16 + /*XSTATE_AVX512_ZMM*/ 64*16) +#define SIZEOF__CONTEXT (3104) #else // Expression is too complicated, is currently: // (8*6 + 4*2 + 2*6 + 4 + 8*6 + 8*16 + 8 + /*XMM_SAVE_AREA32*/(2*2 + 1*2 + 2 + 4 + 2*2 + 4 + 2*2 + 4*2 + 16*8 + 16*16 + 1*96) + 26*16 + 8 + 8*5) @@ -445,11 +445,7 @@ ASMCONSTANTS_C_ASSERT(OFFSETOF__CONTEXT__Xmm15 ASMCONSTANTS_C_ASSERT(OFFSETOF__CONTEXT__VectorRegister == offsetof(CONTEXT, VectorRegister[0])); -#if defined(UNIX_AMD64_ABI) && !defined(HOST_WINDOWS) -#define SIZEOF__FaultingExceptionFrame (0x40 + SIZEOF__CONTEXT) -#else #define SIZEOF__FaultingExceptionFrame (0x20 + SIZEOF__CONTEXT) -#endif 
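
The new SIZEOF__CONTEXT of 3104 in the hunk above can be verified by summing the terms spelled out in its comment (a standalone check, not part of the patch):

// The base 1232 is the pre-XSTATE portion of CONTEXT; the remaining terms are
// the XSTATE header and the three AVX-512 component areas added by this series.
constexpr int kBaseContext  = 1232;      // pre-XSTATE CONTEXT layout
constexpr int kXStateHeader = 8 + 8;     // XStateFeaturesMask + XStateReserved0
constexpr int kAvx          = 16 * 16;   // Ymm0H..Ymm15H
constexpr int kAvx512KMask  = 8 * 8;     // KMask0..KMask7
constexpr int kAvx512ZmmH   = 32 * 16;   // Zmm0H..Zmm15H
constexpr int kAvx512Zmm    = 64 * 16;   // Zmm16..Zmm31

static_assert(kBaseContext + kXStateHeader + kAvx + kAvx512KMask
                  + kAvx512ZmmH + kAvx512Zmm == 3104,
              "matches the new SIZEOF__CONTEXT");

// The previous revision also carried XStateReserved1[4] (8*4 = 32 bytes),
// which accounts for the old 3136 value.
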
ASMCONSTANTS_C_ASSERT(SIZEOF__FaultingExceptionFrame == sizeof(FaultingExceptionFrame)); diff --git a/src/coreclr/vm/amd64/unixstubs.cpp b/src/coreclr/vm/amd64/unixstubs.cpp index 77da7da8bfebd5..45dcd7de26c995 100644 --- a/src/coreclr/vm/amd64/unixstubs.cpp +++ b/src/coreclr/vm/amd64/unixstubs.cpp @@ -22,8 +22,38 @@ extern "C" return ((eax & 0x06) == 0x06) ? 1 : 0; } +#ifndef XSTATE_MASK_AVX512 +#define XSTATE_MASK_AVX512 (0xE0) /* 0b1110_0000 */ +#endif // XSTATE_MASK_AVX512 + DWORD avx512StateSupport() { +#if defined(TARGET_OSX) + // MacOS has specialized behavior where it reports AVX512 support but doesnt + // actually enable AVX512 until the first instruction is executed and does so + // on a per thread basis. It does this by catching the faulting instruction and + // checking for the EVEX encoding. The kmov instructions, despite being part + // of the AVX512 instruction set are VEX encoded and dont trigger the enablement + // + // See https://github.com/apple/darwin-xnu/blob/main/osfmk/i386/fpu.c#L174 + + int cpuidInfo[4]; + + const int CPUID_EAX = 0; + const int CPUID_EBX = 1; + const int CPUID_ECX = 2; + const int CPUID_EDX = 3; + + __cpuid(cpuidInfo, 0x00000000); + + if (static_cast(cpuidInfo[CPUID_EAX]) < 0x0D) + { + return false; + } + + __cpuidex(cpuidInfo, 0x0000000D, 0x00000000); + return (cpuidInfo[CPUID_EAX] & XSTATE_MASK_AVX512) == XSTATE_MASK_AVX512; +#else DWORD eax; __asm(" xgetbv\n" \ : "=a"(eax) /*output in eax*/\ @@ -32,6 +62,7 @@ extern "C" ); // check OS has enabled XMM, YMM and ZMM state support return ((eax & 0x0E6) == 0x0E6) ? 1 : 0; +#endif } void STDMETHODCALLTYPE JIT_ProfilerEnterLeaveTailcallStub(UINT_PTR ProfilerHandle) From ac62da6c18064c43e2af24663a57aa3d199b7997 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Fri, 24 Mar 2023 16:29:26 -0700 Subject: [PATCH 13/16] Use HOST_AMD64 not HOST_64BIT --- src/coreclr/pal/src/include/pal/context.h | 4 ++-- src/coreclr/pal/src/thread/context.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/coreclr/pal/src/include/pal/context.h b/src/coreclr/pal/src/include/pal/context.h index 0295481632d1cb..e39f2d3f133da3 100644 --- a/src/coreclr/pal/src/include/pal/context.h +++ b/src/coreclr/pal/src/include/pal/context.h @@ -54,9 +54,9 @@ using asm_sigcontext::_xstate; #include #endif // !HAVE_MACH_EXCEPTIONS else -#if defined(XSTATE_SUPPORTED) || (defined(HOST_64BIT) && defined(TARGET_OSX)) +#if defined(XSTATE_SUPPORTED) || (defined(HOST_AMD64) && defined(TARGET_OSX)) bool Xstate_IsAvx512Supported(); -#endif // XSTATE_SUPPORTED || (HOST_64BIT && TARGET_OSX) +#endif // XSTATE_SUPPORTED || (HOST_AMD64 && TARGET_OSX) #ifdef HOST_S390X diff --git a/src/coreclr/pal/src/thread/context.cpp b/src/coreclr/pal/src/thread/context.cpp index c802b9470e6f1e..8f4274f1a4efd4 100644 --- a/src/coreclr/pal/src/thread/context.cpp +++ b/src/coreclr/pal/src/thread/context.cpp @@ -316,7 +316,7 @@ typedef int __ptrace_request; ASSIGN_CONTROL_REGS \ ASSIGN_INTEGER_REGS \ -#if defined(XSTATE_SUPPORTED) || (defined(HOST_64BIT) && defined(TARGET_OSX)) +#if defined(XSTATE_SUPPORTED) || (defined(HOST_AMD64) && defined(TARGET_OSX)) bool Xstate_IsAvx512Supported() { static int Xstate_Avx512Supported = -1; @@ -368,7 +368,7 @@ bool Xstate_IsAvx512Supported() return Xstate_Avx512Supported == 1; } -#endif // XSTATE_SUPPORTED || (HOST_64BIT && TARGET_OSX) +#endif // XSTATE_SUPPORTED || (HOST_AMD64 && TARGET_OSX) #if !HAVE_MACH_EXCEPTIONS From 9a84121af2a54ea908a2d9ccd8a080ed0e41dbd4 Mon Sep 17 00:00:00 2001 From: Tanner Gooding 
Date: Sat, 25 Mar 2023 05:51:05 -0700 Subject: [PATCH 14/16] Explicitly bypass AVX512 support on OSX until thread enablement can be resolved --- .../nativeaot/Runtime/unix/PalRedhawkUnix.cpp | 33 +++++++++++-------- src/coreclr/pal/src/thread/context.cpp | 22 +++++++++++-- src/coreclr/vm/amd64/unixstubs.cpp | 33 +++++++++++-------- 3 files changed, 58 insertions(+), 30 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp b/src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp index 08cdad4c6d6ff6..73f754e26e84f6 100644 --- a/src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp +++ b/src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp @@ -1333,22 +1333,27 @@ REDHAWK_PALEXPORT uint32_t REDHAWK_PALAPI avx512StateSupport() // // See https://github.com/apple/darwin-xnu/blob/main/osfmk/i386/fpu.c#L174 - int cpuidInfo[4]; + // TODO-AVX512: Enabling this for OSX requires ensuring threads explicitly trigger + // the AVX-512 enablement so that arbitrary usage doesn't cause downstream problems - const int CPUID_EAX = 0; - const int CPUID_EBX = 1; - const int CPUID_ECX = 2; - const int CPUID_EDX = 3; - - __cpuid(cpuidInfo, 0x00000000); - - if (static_cast(cpuidInfo[CPUID_EAX]) < 0x0D) - { - return false; - } + // int cpuidInfo[4]; + // + // const int CPUID_EAX = 0; + // const int CPUID_EBX = 1; + // const int CPUID_ECX = 2; + // const int CPUID_EDX = 3; + // + // __cpuid(cpuidInfo, 0x00000000); + // + // if (static_cast(cpuidInfo[CPUID_EAX]) < 0x0D) + // { + // return false; + // } + // + // __cpuidex(cpuidInfo, 0x0000000D, 0x00000000); + // return (cpuidInfo[CPUID_EAX] & XSTATE_MASK_AVX512) == XSTATE_MASK_AVX512; - __cpuidex(cpuidInfo, 0x0000000D, 0x00000000); - return (cpuidInfo[CPUID_EAX] & XSTATE_MASK_AVX512) == XSTATE_MASK_AVX512; + return false; #else DWORD eax; __asm(" xgetbv\n" \ diff --git a/src/coreclr/pal/src/thread/context.cpp b/src/coreclr/pal/src/thread/context.cpp index 8f4274f1a4efd4..f7dfec3fac774b 100644 --- a/src/coreclr/pal/src/thread/context.cpp +++ b/src/coreclr/pal/src/thread/context.cpp @@ -316,7 +316,7 @@ typedef int __ptrace_request; ASSIGN_CONTROL_REGS \ ASSIGN_INTEGER_REGS \ -#if defined(XSTATE_SUPPORTED) || (defined(HOST_AMD64) && defined(TARGET_OSX)) +#if defined(XSTATE_SUPPORTED) bool Xstate_IsAvx512Supported() { static int Xstate_Avx512Supported = -1; @@ -368,7 +368,25 @@ bool Xstate_IsAvx512Supported() return Xstate_Avx512Supported == 1; } -#endif // XSTATE_SUPPORTED || (HOST_AMD64 && TARGET_OSX) +#endif // XSTATE_SUPPORTED + +#if defined(HOST_AMD64) && defined(TARGET_OSX) +bool Xstate_IsAvx512Supported() +{ + // MacOS has specialized behavior where it reports AVX512 support but doesnt + // actually enable AVX512 until the first instruction is executed and does so + // on a per thread basis. It does this by catching the faulting instruction and + // checking for the EVEX encoding. 
The kmov instructions, despite being part + // of the AVX512 instruction set are VEX encoded and dont trigger the enablement + // + // See https://github.com/apple/darwin-xnu/blob/main/osfmk/i386/fpu.c#L174 + + // TODO-AVX512: Enabling this for OSX requires ensuring threads explicitly trigger + // the AVX-512 enablement so that arbitrary usage doesn't cause downstream problems + + return false; +} +#endif // HOST_AMD64 && TARGET_OSX #if !HAVE_MACH_EXCEPTIONS diff --git a/src/coreclr/vm/amd64/unixstubs.cpp b/src/coreclr/vm/amd64/unixstubs.cpp index 45dcd7de26c995..a2a65944d9b025 100644 --- a/src/coreclr/vm/amd64/unixstubs.cpp +++ b/src/coreclr/vm/amd64/unixstubs.cpp @@ -37,22 +37,27 @@ extern "C" // // See https://github.com/apple/darwin-xnu/blob/main/osfmk/i386/fpu.c#L174 - int cpuidInfo[4]; + // TODO-AVX512: Enabling this for OSX requires ensuring threads explicitly trigger + // the AVX-512 enablement so that arbitrary usage doesn't cause downstream problems - const int CPUID_EAX = 0; - const int CPUID_EBX = 1; - const int CPUID_ECX = 2; - const int CPUID_EDX = 3; - - __cpuid(cpuidInfo, 0x00000000); - - if (static_cast(cpuidInfo[CPUID_EAX]) < 0x0D) - { - return false; - } + // int cpuidInfo[4]; + // + // const int CPUID_EAX = 0; + // const int CPUID_EBX = 1; + // const int CPUID_ECX = 2; + // const int CPUID_EDX = 3; + // + // __cpuid(cpuidInfo, 0x00000000); + // + // if (static_cast(cpuidInfo[CPUID_EAX]) < 0x0D) + // { + // return false; + // } + // + // __cpuidex(cpuidInfo, 0x0000000D, 0x00000000); + // return (cpuidInfo[CPUID_EAX] & XSTATE_MASK_AVX512) == XSTATE_MASK_AVX512; - __cpuidex(cpuidInfo, 0x0000000D, 0x00000000); - return (cpuidInfo[CPUID_EAX] & XSTATE_MASK_AVX512) == XSTATE_MASK_AVX512; + return false; #else DWORD eax; __asm(" xgetbv\n" \ From 8dab7e50c0f4f52360a3a1ea7cf2a915d1c3c704 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Sat, 25 Mar 2023 08:44:08 -0700 Subject: [PATCH 15/16] Remove commented out code for OSX in favor of just leaving a comment explaining --- .../nativeaot/Runtime/unix/PalRedhawkUnix.cpp | 17 --------- src/coreclr/pal/src/thread/context.cpp | 37 +++++++++---------- src/coreclr/vm/amd64/unixstubs.cpp | 17 --------- 3 files changed, 17 insertions(+), 54 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp b/src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp index 73f754e26e84f6..f4105485715e56 100644 --- a/src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp +++ b/src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp @@ -1336,23 +1336,6 @@ REDHAWK_PALEXPORT uint32_t REDHAWK_PALAPI avx512StateSupport() // TODO-AVX512: Enabling this for OSX requires ensuring threads explicitly trigger // the AVX-512 enablement so that arbitrary usage doesn't cause downstream problems - // int cpuidInfo[4]; - // - // const int CPUID_EAX = 0; - // const int CPUID_EBX = 1; - // const int CPUID_ECX = 2; - // const int CPUID_EDX = 3; - // - // __cpuid(cpuidInfo, 0x00000000); - // - // if (static_cast(cpuidInfo[CPUID_EAX]) < 0x0D) - // { - // return false; - // } - // - // __cpuidex(cpuidInfo, 0x0000000D, 0x00000000); - // return (cpuidInfo[CPUID_EAX] & XSTATE_MASK_AVX512) == XSTATE_MASK_AVX512; - return false; #else DWORD eax; diff --git a/src/coreclr/pal/src/thread/context.cpp b/src/coreclr/pal/src/thread/context.cpp index f7dfec3fac774b..c1c22ea43ba4bf 100644 --- a/src/coreclr/pal/src/thread/context.cpp +++ b/src/coreclr/pal/src/thread/context.cpp @@ -316,9 +316,23 @@ typedef int __ptrace_request; ASSIGN_CONTROL_REGS \ ASSIGN_INTEGER_REGS \ 
-#if defined(XSTATE_SUPPORTED) +#if defined(XSTATE_SUPPORTED) || defined(HOST_AMD64) && defined(TARGET_OSX) bool Xstate_IsAvx512Supported() { +#if defined(TARGET_OSX) + // MacOS has specialized behavior where it reports AVX512 support but doesnt + // actually enable AVX512 until the first instruction is executed and does so + // on a per thread basis. It does this by catching the faulting instruction and + // checking for the EVEX encoding. The kmov instructions, despite being part + // of the AVX512 instruction set are VEX encoded and dont trigger the enablement + // + // See https://github.com/apple/darwin-xnu/blob/main/osfmk/i386/fpu.c#L174 + + // TODO-AVX512: Enabling this for OSX requires ensuring threads explicitly trigger + // the AVX-512 enablement so that arbitrary usage doesn't cause downstream problems + + return false; +#else static int Xstate_Avx512Supported = -1; if (Xstate_Avx512Supported == -1) @@ -367,26 +381,9 @@ bool Xstate_IsAvx512Supported() } return Xstate_Avx512Supported == 1; +#endif } -#endif // XSTATE_SUPPORTED - -#if defined(HOST_AMD64) && defined(TARGET_OSX) -bool Xstate_IsAvx512Supported() -{ - // MacOS has specialized behavior where it reports AVX512 support but doesnt - // actually enable AVX512 until the first instruction is executed and does so - // on a per thread basis. It does this by catching the faulting instruction and - // checking for the EVEX encoding. The kmov instructions, despite being part - // of the AVX512 instruction set are VEX encoded and dont trigger the enablement - // - // See https://github.com/apple/darwin-xnu/blob/main/osfmk/i386/fpu.c#L174 - - // TODO-AVX512: Enabling this for OSX requires ensuring threads explicitly trigger - // the AVX-512 enablement so that arbitrary usage doesn't cause downstream problems - - return false; -} -#endif // HOST_AMD64 && TARGET_OSX +#endif // XSTATE_SUPPORTED || defined(HOST_AMD64) && defined(TARGET_OSX) #if !HAVE_MACH_EXCEPTIONS diff --git a/src/coreclr/vm/amd64/unixstubs.cpp b/src/coreclr/vm/amd64/unixstubs.cpp index a2a65944d9b025..d5bb054c9be5b7 100644 --- a/src/coreclr/vm/amd64/unixstubs.cpp +++ b/src/coreclr/vm/amd64/unixstubs.cpp @@ -40,23 +40,6 @@ extern "C" // TODO-AVX512: Enabling this for OSX requires ensuring threads explicitly trigger // the AVX-512 enablement so that arbitrary usage doesn't cause downstream problems - // int cpuidInfo[4]; - // - // const int CPUID_EAX = 0; - // const int CPUID_EBX = 1; - // const int CPUID_ECX = 2; - // const int CPUID_EDX = 3; - // - // __cpuid(cpuidInfo, 0x00000000); - // - // if (static_cast(cpuidInfo[CPUID_EAX]) < 0x0D) - // { - // return false; - // } - // - // __cpuidex(cpuidInfo, 0x0000000D, 0x00000000); - // return (cpuidInfo[CPUID_EAX] & XSTATE_MASK_AVX512) == XSTATE_MASK_AVX512; - return false; #else DWORD eax; From a3bbf0e03a3e5817233c54f9d8c66d540935966d Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Sat, 25 Mar 2023 09:59:33 -0700 Subject: [PATCH 16/16] Use HAVE_MACH_EXCEPTIONS not TARGET_OSX --- src/coreclr/pal/src/include/pal/context.h | 4 ++-- src/coreclr/pal/src/thread/context.cpp | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/coreclr/pal/src/include/pal/context.h b/src/coreclr/pal/src/include/pal/context.h index e39f2d3f133da3..2c860c2b7fac07 100644 --- a/src/coreclr/pal/src/include/pal/context.h +++ b/src/coreclr/pal/src/include/pal/context.h @@ -54,9 +54,9 @@ using asm_sigcontext::_xstate; #include #endif // !HAVE_MACH_EXCEPTIONS else -#if defined(XSTATE_SUPPORTED) || (defined(HOST_AMD64) && 
defined(TARGET_OSX)) +#if defined(XSTATE_SUPPORTED) || (defined(HOST_AMD64) && defined(HAVE_MACH_EXCEPTIONS)) bool Xstate_IsAvx512Supported(); -#endif // XSTATE_SUPPORTED || (HOST_AMD64 && TARGET_OSX) +#endif // XSTATE_SUPPORTED || (HOST_AMD64 && HAVE_MACH_EXCEPTIONS) #ifdef HOST_S390X diff --git a/src/coreclr/pal/src/thread/context.cpp b/src/coreclr/pal/src/thread/context.cpp index c1c22ea43ba4bf..0b0229548984c1 100644 --- a/src/coreclr/pal/src/thread/context.cpp +++ b/src/coreclr/pal/src/thread/context.cpp @@ -316,10 +316,10 @@ typedef int __ptrace_request; ASSIGN_CONTROL_REGS \ ASSIGN_INTEGER_REGS \ -#if defined(XSTATE_SUPPORTED) || defined(HOST_AMD64) && defined(TARGET_OSX) +#if defined(XSTATE_SUPPORTED) || defined(HOST_AMD64) && defined(HAVE_MACH_EXCEPTIONS) bool Xstate_IsAvx512Supported() { -#if defined(TARGET_OSX) +#if defined(HAVE_MACH_EXCEPTIONS) // MacOS has specialized behavior where it reports AVX512 support but doesnt // actually enable AVX512 until the first instruction is executed and does so // on a per thread basis. It does this by catching the faulting instruction and @@ -383,7 +383,7 @@ bool Xstate_IsAvx512Supported() return Xstate_Avx512Supported == 1; #endif } -#endif // XSTATE_SUPPORTED || defined(HOST_AMD64) && defined(TARGET_OSX) +#endif // XSTATE_SUPPORTED || defined(HOST_AMD64) && defined(HAVE_MACH_EXCEPTIONS) #if !HAVE_MACH_EXCEPTIONS
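
With the series in its final shape, a consumer of the PAL CONTEXT must test the feature mask before trusting the AVX-512 area, mirroring what CONTEXTFromNativeContext sets. A closing sketch (assumes the pal.h definitions from this series are in scope; DumpOpmask0 is an illustrative name):

#include <cstdio>

void DumpOpmask0(const CONTEXT* ctx)
{
    if (((ctx->ContextFlags & CONTEXT_XSTATE) == CONTEXT_XSTATE) &&
        ((ctx->XStateFeaturesMask & XSTATE_MASK_AVX512) == XSTATE_MASK_AVX512))
    {
        // The capture paths only fill KMask0..KMask7 and the Zmm fields when
        // the AVX-512 components were actually present in the source state
        printf("k0 = 0x%016llx\n", (unsigned long long)ctx->KMask0);
    }
    else
    {
        printf("AVX-512 state was not captured for this context\n");
    }
}
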