Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NO REVIEW] Update the CPUID and XSAVE logics for APX #103019

Closed
wants to merge 13 commits into from
3 changes: 3 additions & 0 deletions docs/design/features/xarch-apx.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# APX Integration in .NET

This document collects documentation and notes on APX integration in .NET. It will evolve as the work progresses.
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ class AsmOffsets

#if TARGET_AMD64
#if TARGET_UNIX
public const int SIZEOF__PAL_LIMITED_CONTEXT = 0xc20;
public const int SIZEOF__PAL_LIMITED_CONTEXT = 0xca0;
#else // TARGET_UNIX
public const int SIZEOF__PAL_LIMITED_CONTEXT = 0x4d0;
#endif // TARGET_UNIX
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/gc/vxsort/isa_detection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,4 +118,4 @@ void InitSupportedInstructionSet (int32_t configSetting)
if (!((int)s_supportedISA & (int)SupportedISA::AVX2))
s_supportedISA = SupportedISA::None;
s_initialized = true;
}
}
158 changes: 88 additions & 70 deletions src/coreclr/inc/corinfoinstructionset.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,41 +78,43 @@ enum CORINFO_InstructionSet
InstructionSet_AVX10v1=33,
InstructionSet_AVX10v1_V256=34,
InstructionSet_AVX10v1_V512=35,
InstructionSet_VectorT128=36,
InstructionSet_VectorT256=37,
InstructionSet_VectorT512=38,
InstructionSet_X86Base_X64=39,
InstructionSet_SSE_X64=40,
InstructionSet_SSE2_X64=41,
InstructionSet_SSE3_X64=42,
InstructionSet_SSSE3_X64=43,
InstructionSet_SSE41_X64=44,
InstructionSet_SSE42_X64=45,
InstructionSet_AVX_X64=46,
InstructionSet_AVX2_X64=47,
InstructionSet_AES_X64=48,
InstructionSet_BMI1_X64=49,
InstructionSet_BMI2_X64=50,
InstructionSet_FMA_X64=51,
InstructionSet_LZCNT_X64=52,
InstructionSet_PCLMULQDQ_X64=53,
InstructionSet_POPCNT_X64=54,
InstructionSet_AVXVNNI_X64=55,
InstructionSet_MOVBE_X64=56,
InstructionSet_X86Serialize_X64=57,
InstructionSet_AVX512F_X64=58,
InstructionSet_AVX512F_VL_X64=59,
InstructionSet_AVX512BW_X64=60,
InstructionSet_AVX512BW_VL_X64=61,
InstructionSet_AVX512CD_X64=62,
InstructionSet_AVX512CD_VL_X64=63,
InstructionSet_AVX512DQ_X64=64,
InstructionSet_AVX512DQ_VL_X64=65,
InstructionSet_AVX512VBMI_X64=66,
InstructionSet_AVX512VBMI_VL_X64=67,
InstructionSet_AVX10v1_X64=68,
InstructionSet_AVX10v1_V256_X64=69,
InstructionSet_AVX10v1_V512_X64=70,
InstructionSet_APX=36,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does APX support 32bit at all?

Intel®Advanced Performance Extensions (Intel® APX) expands the Intel® 64 instruction set

or is everything just generated with both x86 and x64?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for bringing this up,

No, APX features are available only in 64-bit mode.
Reference: https://www.intel.com/content/www/us/en/content-details/819797/intel-advanced-performance-extensions-intel-apx-architecture-specification.html

3.1.4 Intel® APX features are only available in IA-32e 64-bit Protected Mode, and are an XSAVE-enabled feature
which requires XCR0 enabling before using the new Intel® APX ISA, new Intel® APX prefixes (REX2) and
prefix extensions (EVEX extensions). See section 3.1.4.2 for details on XCR0-enabling for Intel® APX

Given this, I understand that APX could differ from existing ISAs, and we are open to suggestions on this part.

InstructionSet_VectorT128=37,
InstructionSet_VectorT256=38,
InstructionSet_VectorT512=39,
InstructionSet_X86Base_X64=40,
InstructionSet_SSE_X64=41,
InstructionSet_SSE2_X64=42,
InstructionSet_SSE3_X64=43,
InstructionSet_SSSE3_X64=44,
InstructionSet_SSE41_X64=45,
InstructionSet_SSE42_X64=46,
InstructionSet_AVX_X64=47,
InstructionSet_AVX2_X64=48,
InstructionSet_AES_X64=49,
InstructionSet_BMI1_X64=50,
InstructionSet_BMI2_X64=51,
InstructionSet_FMA_X64=52,
InstructionSet_LZCNT_X64=53,
InstructionSet_PCLMULQDQ_X64=54,
InstructionSet_POPCNT_X64=55,
InstructionSet_AVXVNNI_X64=56,
InstructionSet_MOVBE_X64=57,
InstructionSet_X86Serialize_X64=58,
InstructionSet_AVX512F_X64=59,
InstructionSet_AVX512F_VL_X64=60,
InstructionSet_AVX512BW_X64=61,
InstructionSet_AVX512BW_VL_X64=62,
InstructionSet_AVX512CD_X64=63,
InstructionSet_AVX512CD_VL_X64=64,
InstructionSet_AVX512DQ_X64=65,
InstructionSet_AVX512DQ_VL_X64=66,
InstructionSet_AVX512VBMI_X64=67,
InstructionSet_AVX512VBMI_VL_X64=68,
InstructionSet_AVX10v1_X64=69,
InstructionSet_AVX10v1_V256_X64=70,
InstructionSet_AVX10v1_V512_X64=71,
InstructionSet_APX_X64=72,
#endif // TARGET_AMD64
#ifdef TARGET_X86
InstructionSet_X86Base=1,
Expand Down Expand Up @@ -150,41 +152,43 @@ enum CORINFO_InstructionSet
InstructionSet_AVX10v1=33,
InstructionSet_AVX10v1_V256=34,
InstructionSet_AVX10v1_V512=35,
InstructionSet_VectorT128=36,
InstructionSet_VectorT256=37,
InstructionSet_VectorT512=38,
InstructionSet_X86Base_X64=39,
InstructionSet_SSE_X64=40,
InstructionSet_SSE2_X64=41,
InstructionSet_SSE3_X64=42,
InstructionSet_SSSE3_X64=43,
InstructionSet_SSE41_X64=44,
InstructionSet_SSE42_X64=45,
InstructionSet_AVX_X64=46,
InstructionSet_AVX2_X64=47,
InstructionSet_AES_X64=48,
InstructionSet_BMI1_X64=49,
InstructionSet_BMI2_X64=50,
InstructionSet_FMA_X64=51,
InstructionSet_LZCNT_X64=52,
InstructionSet_PCLMULQDQ_X64=53,
InstructionSet_POPCNT_X64=54,
InstructionSet_AVXVNNI_X64=55,
InstructionSet_MOVBE_X64=56,
InstructionSet_X86Serialize_X64=57,
InstructionSet_AVX512F_X64=58,
InstructionSet_AVX512F_VL_X64=59,
InstructionSet_AVX512BW_X64=60,
InstructionSet_AVX512BW_VL_X64=61,
InstructionSet_AVX512CD_X64=62,
InstructionSet_AVX512CD_VL_X64=63,
InstructionSet_AVX512DQ_X64=64,
InstructionSet_AVX512DQ_VL_X64=65,
InstructionSet_AVX512VBMI_X64=66,
InstructionSet_AVX512VBMI_VL_X64=67,
InstructionSet_AVX10v1_X64=68,
InstructionSet_AVX10v1_V256_X64=69,
InstructionSet_AVX10v1_V512_X64=70,
InstructionSet_APX=36,
InstructionSet_VectorT128=37,
InstructionSet_VectorT256=38,
InstructionSet_VectorT512=39,
InstructionSet_X86Base_X64=40,
InstructionSet_SSE_X64=41,
InstructionSet_SSE2_X64=42,
InstructionSet_SSE3_X64=43,
InstructionSet_SSSE3_X64=44,
InstructionSet_SSE41_X64=45,
InstructionSet_SSE42_X64=46,
InstructionSet_AVX_X64=47,
InstructionSet_AVX2_X64=48,
InstructionSet_AES_X64=49,
InstructionSet_BMI1_X64=50,
InstructionSet_BMI2_X64=51,
InstructionSet_FMA_X64=52,
InstructionSet_LZCNT_X64=53,
InstructionSet_PCLMULQDQ_X64=54,
InstructionSet_POPCNT_X64=55,
InstructionSet_AVXVNNI_X64=56,
InstructionSet_MOVBE_X64=57,
InstructionSet_X86Serialize_X64=58,
InstructionSet_AVX512F_X64=59,
InstructionSet_AVX512F_VL_X64=60,
InstructionSet_AVX512BW_X64=61,
InstructionSet_AVX512BW_VL_X64=62,
InstructionSet_AVX512CD_X64=63,
InstructionSet_AVX512CD_VL_X64=64,
InstructionSet_AVX512DQ_X64=65,
InstructionSet_AVX512DQ_VL_X64=66,
InstructionSet_AVX512VBMI_X64=67,
InstructionSet_AVX512VBMI_VL_X64=68,
InstructionSet_AVX10v1_X64=69,
InstructionSet_AVX10v1_V256_X64=70,
InstructionSet_AVX10v1_V512_X64=71,
InstructionSet_APX_X64=72,
#endif // TARGET_X86

};
Expand Down Expand Up @@ -364,6 +368,8 @@ struct CORINFO_InstructionSetFlags
AddInstructionSet(InstructionSet_AVX10v1_V256_X64);
if (HasInstructionSet(InstructionSet_AVX10v1_V512))
AddInstructionSet(InstructionSet_AVX10v1_V512_X64);
if (HasInstructionSet(InstructionSet_APX))
AddInstructionSet(InstructionSet_APX_X64);
#endif // TARGET_AMD64
#ifdef TARGET_X86
#endif // TARGET_X86
Expand Down Expand Up @@ -572,6 +578,10 @@ inline CORINFO_InstructionSetFlags EnsureInstructionSetFlagsAreValid(CORINFO_Ins
resultflags.RemoveInstructionSet(InstructionSet_AVX10v1_V512);
if (resultflags.HasInstructionSet(InstructionSet_AVX10v1_V512_X64) && !resultflags.HasInstructionSet(InstructionSet_AVX10v1_V512))
resultflags.RemoveInstructionSet(InstructionSet_AVX10v1_V512_X64);
if (resultflags.HasInstructionSet(InstructionSet_APX) && !resultflags.HasInstructionSet(InstructionSet_APX_X64))
resultflags.RemoveInstructionSet(InstructionSet_APX);
if (resultflags.HasInstructionSet(InstructionSet_APX_X64) && !resultflags.HasInstructionSet(InstructionSet_APX))
resultflags.RemoveInstructionSet(InstructionSet_APX_X64);
if (resultflags.HasInstructionSet(InstructionSet_SSE) && !resultflags.HasInstructionSet(InstructionSet_X86Base))
resultflags.RemoveInstructionSet(InstructionSet_SSE);
if (resultflags.HasInstructionSet(InstructionSet_SSE2) && !resultflags.HasInstructionSet(InstructionSet_SSE))
Expand Down Expand Up @@ -1000,6 +1010,10 @@ inline const char *InstructionSetToString(CORINFO_InstructionSet instructionSet)
return "AVX10v1_V512";
case InstructionSet_AVX10v1_V512_X64 :
return "AVX10v1_V512_X64";
case InstructionSet_APX :
return "APX";
case InstructionSet_APX_X64 :
return "APX_X64";
case InstructionSet_VectorT128 :
return "VectorT128";
case InstructionSet_VectorT256 :
Expand Down Expand Up @@ -1078,6 +1092,8 @@ inline const char *InstructionSetToString(CORINFO_InstructionSet instructionSet)
return "AVX10v1_V256";
case InstructionSet_AVX10v1_V512 :
return "AVX10v1_V512";
case InstructionSet_APX :
return "APX";
case InstructionSet_VectorT128 :
return "VectorT128";
case InstructionSet_VectorT256 :
Expand Down Expand Up @@ -1151,6 +1167,7 @@ inline CORINFO_InstructionSet InstructionSetFromR2RInstructionSet(ReadyToRunInst
case READYTORUN_INSTRUCTION_Avx10v1: return InstructionSet_AVX10v1;
case READYTORUN_INSTRUCTION_Avx10v1_V256: return InstructionSet_AVX10v1_V256;
case READYTORUN_INSTRUCTION_Avx10v1_V512: return InstructionSet_AVX10v1_V512;
case READYTORUN_INSTRUCTION_Apx: return InstructionSet_APX;
case READYTORUN_INSTRUCTION_VectorT128: return InstructionSet_VectorT128;
case READYTORUN_INSTRUCTION_VectorT256: return InstructionSet_VectorT256;
case READYTORUN_INSTRUCTION_VectorT512: return InstructionSet_VectorT512;
Expand Down Expand Up @@ -1188,6 +1205,7 @@ inline CORINFO_InstructionSet InstructionSetFromR2RInstructionSet(ReadyToRunInst
case READYTORUN_INSTRUCTION_Avx10v1: return InstructionSet_AVX10v1;
case READYTORUN_INSTRUCTION_Avx10v1_V256: return InstructionSet_AVX10v1_V256;
case READYTORUN_INSTRUCTION_Avx10v1_V512: return InstructionSet_AVX10v1_V512;
case READYTORUN_INSTRUCTION_Apx: return InstructionSet_APX;
case READYTORUN_INSTRUCTION_VectorT128: return InstructionSet_VectorT128;
case READYTORUN_INSTRUCTION_VectorT256: return InstructionSet_VectorT256;
case READYTORUN_INSTRUCTION_VectorT512: return InstructionSet_VectorT512;
Expand Down
10 changes: 5 additions & 5 deletions src/coreclr/inc/jiteeversionguid.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,11 @@ typedef const GUID *LPCGUID;
#define GUID_DEFINED
#endif // !GUID_DEFINED

constexpr GUID JITEEVersionIdentifier = { /* 227e46fa-1be3-4770-b613-4a239e7c28aa */
0x227e46fa,
0x1be3,
0x4770,
{0xb6, 0x13, 0x4a, 0x23, 0x9e, 0x7c, 0x28, 0xaa}
constexpr GUID JITEEVersionIdentifier = { /* deed5db4-371c-4b2d-904d-9cd39cb48764 */
0xdeed5db4,
0x371c,
0x4b2d,
{0x90, 0x4d, 0x9c, 0xd3, 0x9c, 0xb4, 0x87, 0x64}
};

//////////////////////////////////////////////////////////////////////////////////////////////////////////
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/inc/readytoruninstructionset.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ enum ReadyToRunInstructionSet
READYTORUN_INSTRUCTION_Avx10v1=44,
READYTORUN_INSTRUCTION_Avx10v1_V256=45,
READYTORUN_INSTRUCTION_Avx10v1_V512=46,
READYTORUN_INSTRUCTION_Apx=47,

};

Expand Down
8 changes: 4 additions & 4 deletions src/coreclr/nativeaot/Runtime/windows/PalRedhawkMinWin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -482,14 +482,14 @@ REDHAWK_PALEXPORT CONTEXT* PalAllocateCompleteOSContext(_Out_ uint8_t** contextB
// Determine if the processor supports AVX or AVX512 so we could
// retrieve extended registers
DWORD64 FeatureMask = GetEnabledXStateFeatures();
if ((FeatureMask & (XSTATE_MASK_AVX | XSTATE_MASK_AVX512)) != 0)
if ((FeatureMask & (XSTATE_MASK_AVX | XSTATE_MASK_AVX512 | /*XSATE_MASK_APX*/(0x80000))) != 0)
{
context = context | CONTEXT_XSTATE;
}

// Retrieve contextSize by passing NULL for Buffer
DWORD contextSize = 0;
ULONG64 xStateCompactionMask = XSTATE_MASK_LEGACY | XSTATE_MASK_AVX | XSTATE_MASK_MPX | XSTATE_MASK_AVX512;
ULONG64 xStateCompactionMask = XSTATE_MASK_LEGACY | XSTATE_MASK_AVX | XSTATE_MASK_MPX | XSTATE_MASK_AVX512 | /*XSATE_MASK_APX*/(0x80000);
// The initialize call should fail but return contextSize
BOOL success = pfnInitializeContext2 ?
pfnInitializeContext2(NULL, context, NULL, &contextSize, xStateCompactionMask) :
Expand Down Expand Up @@ -540,9 +540,9 @@ REDHAWK_PALEXPORT _Success_(return) bool REDHAWK_PALAPI PalGetCompleteThreadCont
#if defined(TARGET_X86) || defined(TARGET_AMD64)
// Make sure that AVX feature mask is set, if supported. This should not normally fail.
// The system silently ignores any feature specified in the FeatureMask which is not enabled on the processor.
if (!SetXStateFeaturesMask(pCtx, XSTATE_MASK_AVX | XSTATE_MASK_AVX512))
if (!SetXStateFeaturesMask(pCtx, XSTATE_MASK_AVX | XSTATE_MASK_AVX512 | /*XSATE_MASK_APX*/(0x80000)))
{
_ASSERTE(!"Could not apply XSTATE_MASK_AVX | XSTATE_MASK_AVX512");
_ASSERTE(!"Could not apply XSTATE_MASK_AVX | XSTATE_MASK_AVX512 | /*XSATE_MASK_APX*/(0x80000)");
return FALSE;
}
#endif //defined(TARGET_X86) || defined(TARGET_AMD64)
Expand Down
25 changes: 25 additions & 0 deletions src/coreclr/pal/inc/pal.h
Original file line number Diff line number Diff line change
Expand Up @@ -1405,6 +1405,10 @@ typedef struct _KNONVOLATILE_CONTEXT_POINTERS {
(UI64(1) << (XSTATE_AVX512_ZMM_H)) | \
(UI64(1) << (XSTATE_AVX512_ZMM)))

// TODO-xarch-apx: the OS headers do not yet define an XSTATE mask value for APX.
// Until they do, we use the bare value to get through the build and to test the implementation in CI.
// These changes will be removed once OS support for APX is available.

typedef struct DECLSPEC_ALIGN(16) _M128A {
ULONGLONG Low;
LONGLONG High;
Expand Down Expand Up @@ -1640,6 +1644,27 @@ typedef struct DECLSPEC_ALIGN(16) _CONTEXT {
M512 Zmm30;
M512 Zmm31;
};

// Save area for the sixteen extended general-purpose registers Egpr16-Egpr31,
// one 64-bit slot per register, appended after the Zmm16-Zmm31 area.
// NOTE(review): presumably these hold the APX registers r16-r31 - confirm
// that the field order matches the offsets used by the context save/restore code.
struct
{
DWORD64 Egpr16;
DWORD64 Egpr17;
DWORD64 Egpr18;
DWORD64 Egpr19;
DWORD64 Egpr20;
DWORD64 Egpr21;
DWORD64 Egpr22;
DWORD64 Egpr23;
DWORD64 Egpr24;
DWORD64 Egpr25;
DWORD64 Egpr26;
DWORD64 Egpr27;
DWORD64 Egpr28;
DWORD64 Egpr29;
DWORD64 Egpr30;
DWORD64 Egpr31;
};

} CONTEXT, *PCONTEXT, *LPCONTEXT;

//
Expand Down
7 changes: 6 additions & 1 deletion src/coreclr/pal/src/arch/amd64/asmconstants.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
(1 << (XSTATE_AVX512_ZMM_H)) | \
(1 << (XSTATE_AVX512_ZMM)))

// TODO-xarch-apx: the OS headers do not yet define an XSTATE mask value for APX.
// Until they do, we use the bare value to get through the build and to test the implementation in CI.
// These changes will be removed once OS support for APX is available.

// The arch bit is normally set in the flag constants below. Since this is already arch-specific code and the arch bit is not
// relevant, the arch bit is excluded from the flag constants below for simpler tests.
#define CONTEXT_AMD64 0x100000
Expand Down Expand Up @@ -91,7 +95,8 @@
#define CONTEXT_KMask0 CONTEXT_Ymm0H+(16*16)
#define CONTEXT_Zmm0H CONTEXT_KMask0+(8*8)
#define CONTEXT_Zmm16 CONTEXT_Zmm0H+(32*16)
#define CONTEXT_Size CONTEXT_Zmm16+(64*16)
// Offset of the EGPR save area. It starts immediately after the Zmm16-Zmm31
// area, which spans 16 registers * 64 bytes from CONTEXT_Zmm16; a smaller
// offset would make the EGPR slots overlap the Zmm16+ data.
#define CONTEXT_Egpr      CONTEXT_Zmm16+(64*16)
// Total context size: the EGPR area holds 16 registers * 8 bytes.
#define CONTEXT_Size      CONTEXT_Egpr+(8*16)

#else // HOST_64BIT

Expand Down
23 changes: 23 additions & 0 deletions src/coreclr/pal/src/arch/amd64/context2.S
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,29 @@ LOCAL_LABEL(Done_Restore_CONTEXT_FLOATING_POINT):
kmovq k6, qword ptr [rdi + (CONTEXT_KMask0 + 6 * 8)]
kmovq k7, qword ptr [rdi + (CONTEXT_KMask0 + 7 * 8)]

// TODO-xarch-apx: the OS headers do not yet define an XSTATE mask value for APX.
// Until they do, we use the bare value to get through the build and to test the implementation in CI.
// These changes will be removed once OS support for APX is available.
test BYTE PTR [rdi + CONTEXT_XStateFeaturesMask], 524288
je LOCAL_LABEL(Done_Restore_CONTEXT_XSTATE)

// Restore the EGPR state, EGPR use previous MPX field, need to add an offset.
mov r16, qword ptr [rdi + CONTEXT_Egpr + 0 * 8]
mov r17, qword ptr [rdi + CONTEXT_Egpr + 1 * 8]
mov r18, qword ptr [rdi + CONTEXT_Egpr + 2 * 8]
mov r19, qword ptr [rdi + CONTEXT_Egpr + 3 * 8]
mov r20, qword ptr [rdi + CONTEXT_Egpr + 4 * 8]
mov r21, qword ptr [rdi + CONTEXT_Egpr + 5 * 8]
mov r22, qword ptr [rdi + CONTEXT_Egpr + 6 * 8]
mov r23, qword ptr [rdi + CONTEXT_Egpr + 7 * 8]
mov r24, qword ptr [rdi + CONTEXT_Egpr + 8 * 8]
mov r25, qword ptr [rdi + CONTEXT_Egpr + 9 * 8]
mov r26, qword ptr [rdi + CONTEXT_Egpr + 10 * 8]
mov r27, qword ptr [rdi + CONTEXT_Egpr + 11 * 8]
mov r28, qword ptr [rdi + CONTEXT_Egpr + 12 * 8]
mov r29, qword ptr [rdi + CONTEXT_Egpr + 13 * 8]
mov r30, qword ptr [rdi + CONTEXT_Egpr + 14 * 8]
mov r31, qword ptr [rdi + CONTEXT_Egpr + 15 * 8]
LOCAL_LABEL(Done_Restore_CONTEXT_XSTATE):

test BYTE PTR [rdi + CONTEXT_ContextFlags], CONTEXT_CONTROL
Expand Down
Loading
Loading