-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ARM64-SVE: Add SVE registers to pal context #103801
Changes from 1 commit
72a681f
5e19a3e
b38dacd
5fc68cc
41580bc
6c8a283
7d08124
5f918a9
7cff0c1
3f287c5
e9e6a4e
a2c17dd
7c3256b
ba17c2b
73404ff
ea6979a
dd12f03
29acc33
a21aee0
2e549cd
d43f5d5
ed15cc3
063f41b
8be931b
6fb9141
d2c2e10
f0a1dba
178e266
29933a8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -708,7 +708,9 @@ void CONTEXTToNativeContext(CONST CONTEXT *lpContext, native_context_t *native) | |
} | ||
} | ||
#else // TARGET_OSX | ||
fpsimd_context* fp = GetNativeSigSimdContext(native); | ||
fpsimd_context* fp = nullptr; | ||
sve_context* sve = nullptr; | ||
GetNativeSigSimdContext(native, &fp, &sve); | ||
if (fp) | ||
{ | ||
fp->fpsr = lpContext->Fpsr; | ||
|
@@ -718,6 +720,25 @@ void CONTEXTToNativeContext(CONST CONTEXT *lpContext, native_context_t *native) | |
*(NEON128*) &fp->vregs[i] = lpContext->V[i]; | ||
} | ||
} | ||
if (sve) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Similar to x64, we should copy the state only if the contextFlags has the CONTEXT_XSTATE flag set. The passed-in contextFlags lists the parts of the state that are valid. It seems it would make sense to move this to the end of the function, next to where we extract xstate for amd64, and put it under the same if ((contextFlags & CONTEXT_XSTATE) == CONTEXT_XSTATE). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
There is one remaining test failure, which I've just debugged and found to be caused by this. Will fix it up. |
||
{ | ||
//TODO-SVE: This only handles vector lengths of 128bits. | ||
|
||
uint16_t vq = sve_vq_from_vl(lpContext->Vl); | ||
|
||
sve->vl = lpContext->Vl; | ||
|
||
//Note: Size of ffr register is SVE_SIG_FFR_SIZE(vq) bytes. | ||
*(WORD*) (((uint8_t*)sve) + SVE_SIG_FFR_OFFSET(vq)) = lpContext->Ffr; | ||
|
||
for (int i = 0; i < 32; i++) | ||
{ | ||
//Note: Size of a Z register is SVE_SIG_ZREGS_SIZE(vq) bytes. | ||
*(SVE128*) (((uint8_t*)sve) + SVE_SIG_ZREG_OFFSET(vq, i)) = lpContext->Z[i]; | ||
//Note: Size of a P register is SVE_SIG_PREGS_SIZE(vq) bytes. | ||
*(WORD*) (((uint8_t*)sve) + SVE_SIG_PREG_OFFSET(vq, i)) = lpContext->P[i]; | ||
} | ||
} | ||
#endif // TARGET_OSX | ||
#elif defined(HOST_ARM) | ||
VfpSigFrame* fp = GetNativeSigSimdContext(native); | ||
|
@@ -805,6 +826,99 @@ void CONTEXTToNativeContext(CONST CONTEXT *lpContext, native_context_t *native) | |
#endif //HOST_AMD64 && XSTATE_SUPPORTED | ||
} | ||
|
||
#if defined(HOST_64BIT) && defined(HOST_ARM64) && !defined(TARGET_FREEBSD) && !defined(TARGET_OSX) | ||
/*++ | ||
Function : | ||
_GetNativeSigSimdContext | ||
|
||
Finds the FP and SVE context from the reserved data section of a native context. | ||
|
||
Parameters : | ||
uint8_t *data : native context reserved data. | ||
uint32_t size : size of the reserved data. | ||
fpsimd_context **fp_ptr : returns a pointer to the FP context. | ||
sve_context **sve_ptr : returns a pointer to the SVE context. | ||
|
||
Return value : | ||
None. | ||
|
||
--*/ | ||
void _GetNativeSigSimdContext(uint8_t *data, uint32_t size, fpsimd_context **fp_ptr, sve_context **sve_ptr) | ||
{ | ||
size_t position = 0; | ||
fpsimd_context *fp = nullptr; | ||
sve_context *sve = nullptr; | ||
extra_context *extra = nullptr; | ||
bool done = false; | ||
|
||
while (!done) | ||
{ | ||
_aarch64_ctx *ctx = reinterpret_cast<_aarch64_ctx *>(&data[position]); | ||
|
||
_ASSERTE(position + ctx->size <= size); | ||
|
||
|
||
switch (ctx->magic) | ||
{ | ||
case FPSIMD_MAGIC: | ||
_ASSERTE(fp == nullptr); | ||
_ASSERTE(ctx->size >= sizeof(fpsimd_context)); | ||
fp = reinterpret_cast<fpsimd_context *>(&data[position]); | ||
break; | ||
|
||
case SVE_MAGIC: | ||
_ASSERTE(sve == nullptr); | ||
_ASSERTE(ctx->size >= sizeof(sve_context)); | ||
sve = reinterpret_cast<sve_context *>(&data[position]); | ||
break; | ||
|
||
case EXTRA_MAGIC: | ||
{ | ||
// Points to an additional section of reserved data. | ||
_ASSERTE(extra == nullptr); | ||
_ASSERTE(ctx->size >= sizeof(extra_context)); | ||
fpsimd_context *fpOrig = fp; | ||
sve_context *sveOrig = sve; | ||
|
||
extra = reinterpret_cast<extra_context *>(&data[position]); | ||
_GetNativeSigSimdContext((uint8_t*)extra->datap, extra->size, &fp, &sve); | ||
|
||
// There should only be one block of each type. | ||
_ASSERTE(fpOrig == nullptr || fp == fpOrig); | ||
_ASSERTE(sveOrig == nullptr || sve == sveOrig); | ||
break; | ||
} | ||
|
||
case 0: | ||
_ASSERTE(ctx->size == 0); | ||
done = true; | ||
break; | ||
|
||
default: | ||
// Any other section. | ||
_ASSERTE(ctx->size != 0); | ||
break; | ||
} | ||
|
||
position += ctx->size; | ||
} | ||
|
||
if (fp) | ||
{ | ||
*fp_ptr = fp; | ||
} | ||
if (sve) | ||
{ | ||
// If this ever fires then we have an SVE context but no FP context. Given that V and Z | ||
// registers overlap, then when propagating this data to other structures, the SVE | ||
// context should be used to fill the FP data. | ||
_ASSERTE(fp != nullptr); | ||
|
||
*sve_ptr = sve; | ||
} | ||
} | ||
#endif // HOST_64BIT && HOST_ARM64 && !TARGET_FREEBSD && !TARGET_OSX | ||
|
||
/*++ | ||
Function : | ||
CONTEXTFromNativeContext | ||
|
@@ -917,7 +1031,9 @@ void CONTEXTFromNativeContext(const native_context_t *native, LPCONTEXT lpContex | |
} | ||
} | ||
#else // TARGET_OSX | ||
const fpsimd_context* fp = GetConstNativeSigSimdContext(native); | ||
const fpsimd_context* fp = nullptr; | ||
const sve_context* sve = nullptr; | ||
GetConstNativeSigSimdContext(native, &fp, &sve); | ||
if (fp) | ||
{ | ||
lpContext->Fpsr = fp->fpsr; | ||
|
@@ -927,6 +1043,25 @@ void CONTEXTFromNativeContext(const native_context_t *native, LPCONTEXT lpContex | |
lpContext->V[i] = *(NEON128*) &fp->vregs[i]; | ||
} | ||
} | ||
if (sve) | ||
{ | ||
//TODO-SVE: This only handles vector lengths of 128bits. | ||
|
||
uint16_t vq = sve_vq_from_vl(sve->vl); | ||
|
||
lpContext->Vl = sve->vl; | ||
|
||
//Note: Size of ffr register is SVE_SIG_FFR_SIZE(vq) bytes. | ||
lpContext->Ffr = *(WORD*) (((uint8_t*)sve) + SVE_SIG_FFR_OFFSET(vq)); | ||
|
||
for (int i = 0; i < 32; i++) | ||
{ | ||
//Note: Size of a Z register is SVE_SIG_ZREGS_SIZE(vq) bytes. | ||
lpContext->Z[i] = *(SVE128*) (((uint8_t*)sve) + SVE_SIG_ZREG_OFFSET(vq, i)); | ||
//Note: Size of a P register is SVE_SIG_PREGS_SIZE(vq) bytes. | ||
lpContext->P[i] = *(WORD*) (((uint8_t*)sve) + SVE_SIG_PREG_OFFSET(vq, i)); | ||
} | ||
} | ||
#endif // TARGET_OSX | ||
#elif defined(HOST_ARM) | ||
const VfpSigFrame* fp = GetConstNativeSigSimdContext(native); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
AIUI, this should match the same structure in Windows. I don't have any documentation, so I've made a guess at what the fields should be for SVE, and I expect that it's wrong.
For convenience I've only used a vector length of 128 bits. I'd be surprised if Windows supports a full 2048-bit vector length without doing anything special.
(Offsets below are marked with a `?`; I'll fix them once the structure is correct.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Windows in general doesn't store extended context parts in the CONTEXT data structure itself. There is a flag, CONTEXT_XSTATE, that indicates the presence of extra data attached to the CONTEXT. There are APIs, InitializeContext and InitializeContext2, that allow setting up a context for the extended state. They can also be used to get the size of the memory needed for the extended context. InitializeContext2 is a newer one that allows selecting only a subset of the extended state using the XStateCompactionMask argument.
We have done this differently for AVX512 for the sake of simplicity - we have included the extra registers in the CONTEXT structure itself. I think it would be better to move that to the way Windows handles it, so that we don't waste time initializing and copying extra fields at places where we don't care about the extended state or when the current CPU doesn't support them. That would also allow sizing the storage for the Z/P registers dynamically based on the current CPU.
Having said that, for this PR we can follow suit and do the same thing we did for Intel AVX512, and migrate both to the better model later. Based on what @kunalspathak told me, starting with 128 bits of space for the registers should be sufficient for now.
I would add them to the very end of the CONTEXT after the debug registers so that the layout of the part that's common with Windows is the same.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep, for the OS, extended context such as SVE, AVX, etc. is stored in a variable-sized buffer separate from the CONTEXT. The CONTEXT_EX structure immediately follows the CONTEXT structure, and contains pointers to the variable-sized XSTATE buffer. On x64, the XSTATE buffer is in the exact format that is supported by the hardware via the XSAVE and XRSTOR instructions. On ARM64, there are no XSAVE/XRSTOR instructions, but the XSTATE buffer is laid out in a similar fashion to x64 (including Header->Mask, Header->CompactionMask, etc.), to allow for maximum code sharing with x64.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The OS kernel does support all SVE vector lengths, up to 2048-bit SVE, though there is the caveat that HyperV only supports 128-bit SVE. So, when running on hardware that supports SVE larger than 128 bits, if HyperV is enabled you'll only see 128-bit SVE, but if HyperV is off then you'll be able to take advantage of the full SVE width supported by the CPU. And to my understanding, there is likely to be hardware in the future that supports SVE lengths larger than 128 bits, though I don't know any specifics on timelines.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the comments. Updated with the following:
I'm currently unsure where else SVE state might need saving/restoring