Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARM64-SVE: Add SVE registers to pal context #103801

Merged
merged 29 commits into from
Jun 29, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
72a681f
ARM64-SVE: Add SVE registers to pal context
a74nh Jun 19, 2024
5e19a3e
fix debug sizes
a74nh Jun 21, 2024
b38dacd
Add SVE defines if missing from Linux host
a74nh Jun 21, 2024
5fc68cc
More missing defines
a74nh Jun 21, 2024
41580bc
More missing defines
a74nh Jun 21, 2024
6c8a283
Add cast
a74nh Jun 21, 2024
7d08124
Move SVE registers after debug registers
a74nh Jun 24, 2024
5f918a9
Fix dbgtargetcontext
a74nh Jun 24, 2024
7cff0c1
Remove SVE from debug context
a74nh Jun 24, 2024
3f287c5
Move ffr
a74nh Jun 24, 2024
e9e6a4e
Add SVE registers to asmconstants
a74nh Jun 24, 2024
a2c17dd
Remove Z registers from context
a74nh Jun 24, 2024
7c3256b
backup/restore SVE in Context2.S
a74nh Jun 25, 2024
ba17c2b
Remove unused SVE128 struct
a74nh Jun 25, 2024
73404ff
Add XStateFeaturesMask
a74nh Jun 25, 2024
ea6979a
restore instrsarm64sve.h changes
a74nh Jun 25, 2024
dd12f03
Restore SIZEOF__CONTEXT for windows
a74nh Jun 25, 2024
29acc33
Fix AsmOffsets.cs for windows
a74nh Jun 25, 2024
a21aee0
Fix AsmOffsets.cs for windows
a74nh Jun 25, 2024
2e549cd
Restore missing ldr
a74nh Jun 26, 2024
d43f5d5
Check size of SVE data returned from the kernel
a74nh Jun 27, 2024
ed15cc3
16 P registers
a74nh Jun 27, 2024
063f41b
Copy context based on XSTATE_MASK_SVE
a74nh Jun 28, 2024
8be931b
Move context handling inside XSTATE checks
a74nh Jun 28, 2024
6fb9141
Set CONTEXT_XSTATE
a74nh Jun 28, 2024
d2c2e10
Remove __pad and fix sizes
a74nh Jun 28, 2024
f0a1dba
Fix context sizes
a74nh Jun 28, 2024
178e266
Fix context sizes
a74nh Jun 28, 2024
29933a8
Only read/write OS context SVE registers on 128bit
a74nh Jun 28, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@ class AsmOffsets
public const int OFFSETOF__REGDISPLAY__ControlPC = 0xbe0;
#endif // TARGET_UNIX
#elif TARGET_ARM64
public const int SIZEOF__REGDISPLAY = 0x940;
public const int OFFSETOF__REGDISPLAY__SP = 0x898;
public const int OFFSETOF__REGDISPLAY__ControlPC = 0x8a0;
public const int SIZEOF__REGDISPLAY = 0xde0;
public const int OFFSETOF__REGDISPLAY__SP = 0xd38;
public const int OFFSETOF__REGDISPLAY__ControlPC = 0xd40;
#elif TARGET_ARM
public const int SIZEOF__REGDISPLAY = 0x410;
public const int OFFSETOF__REGDISPLAY__SP = 0x3ec;
Expand Down Expand Up @@ -113,7 +113,7 @@ class AsmOffsets
public const int SIZEOF__PAL_LIMITED_CONTEXT = 0x4d0;
#endif // TARGET_UNIx
#elif TARGET_ARM64
public const int SIZEOF__PAL_LIMITED_CONTEXT = 0x390;
public const int SIZEOF__PAL_LIMITED_CONTEXT = 0x5e0;
#elif TARGET_ARM
public const int SIZEOF__PAL_LIMITED_CONTEXT = 0x1a0;
#elif TARGET_X86
Expand Down
25 changes: 20 additions & 5 deletions src/coreclr/debug/inc/dbgtargetcontext.h
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,12 @@ typedef struct {
LONGLONG High;
} DT_NEON128;

typedef struct {
ULONGLONG Low;
LONGLONG High;
} DT_SVE128;


typedef DECLSPEC_ALIGN(16) struct {
//
// Control flags.
Expand Down Expand Up @@ -458,15 +464,24 @@ typedef DECLSPEC_ALIGN(16) struct {
/* +0x310 */ DWORD Fpcr;
/* +0x314 */ DWORD Fpsr;

//
// Sve Registers
//
//TODO-SVE: How does this structure handle variable sized Z/P/FFR registers?
/* +0x318 */ DWORD Vl;
/* +0x32C */ DT_SVE128 Z[32];
/* +0x? */ WORD P[32];
/* +0x? */ WORD Ffr;

//
// Debug registers
//

/* +0x318 */ DWORD Bcr[DT_ARM64_MAX_BREAKPOINTS];
/* +0x338 */ DWORD64 Bvr[DT_ARM64_MAX_BREAKPOINTS];
/* +0x378 */ DWORD Wcr[DT_ARM64_MAX_WATCHPOINTS];
/* +0x380 */ DWORD64 Wvr[DT_ARM64_MAX_WATCHPOINTS];
/* +0x390 */
/* +0x? */ DWORD Bcr[DT_ARM64_MAX_BREAKPOINTS];
/* +0x? */ DWORD64 Bvr[DT_ARM64_MAX_BREAKPOINTS];
/* +0x? */ DWORD Wcr[DT_ARM64_MAX_WATCHPOINTS];
/* +0x? */ DWORD64 Wvr[DT_ARM64_MAX_WATCHPOINTS];
/* +0x5e0 */

} DT_CONTEXT;

Expand Down
24 changes: 19 additions & 5 deletions src/coreclr/pal/inc/pal.h
Original file line number Diff line number Diff line change
Expand Up @@ -1876,6 +1876,11 @@ typedef struct _NEON128 {
LONGLONG High;
} NEON128, *PNEON128;

typedef struct _SVE128 {
ULONGLONG Low;
LONGLONG High;
} SVE128, *PSVE128;

typedef struct DECLSPEC_ALIGN(16) _CONTEXT {

//
Expand Down Expand Up @@ -1936,15 +1941,24 @@ typedef struct DECLSPEC_ALIGN(16) _CONTEXT {
/* +0x310 */ DWORD Fpcr;
/* +0x314 */ DWORD Fpsr;

//
// Sve Registers
//
//TODO-SVE: How does this structure handle variable sized Z/P/FFR registers?
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AIUI, this should match the same structure in Windows. I don't have the any documentation, so I've made a guess at what the fields should be for SVE, and I expect that it's wrong
For convenience I've only used a vector length 128bits. I'd be surprised if windows supports a full 2048bit vector length without doing anything special.
(Offsets below marked with a ? I'll fix once the structure is correct)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Windows in general don't store extended context parts in the CONTEXT data structure itself. There is a flag CONTEXT_XSTATE that indicates presence of extra data attached to the CONTEXT. There are APIs InitializeContext and InitializeContext2 that allows setting up a context for the extended state. It can be also used to get the size of memory needed for the extended context. The InitializeContext2 is a new one that allows to select only a subset of the extended state using the XStateCompactionMask argument.

We have done this differently for AVX512 for the sake of simplicity - we have included the extra registers in the CONTEXT structure itself. I think it would be better to move that to the way Windows handle that so that we don't waste time initializing and copying extra fields at places where we don't care about the extended state or when the current CPU doesn't support them. That would also allow to size the storage for the Z/P registers dynamically based on the current CPU.
Having said that though, for this PR, we can follow the suite and do the same thing we did for intel avx512 and migrate both to the better model later. Based on what @kunalspathak told me, starting with 128 bits of space for the registers should be sufficient for now.

I would add them to the very end of the CONTEXT after the debug registers so that the layout of the part that's common with Windows is the same.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, for the OS, extended context like SVE and AVX etc are stored in a variable-sized buffer separate from the CONTEXT. The CONTEXT_EX structure immediately follows the CONTEXT structure, and contains pointers to the variable-sized XSTATE buffer. On x64, the XSTATE buffer is in the exact format that is supported by the hardware via the XSAVE and XRSTOR instructions. On ARM64, there are no XSAVE/XRSTOR instructions, but the XSTATE buffer is laid out in a similar fashion to x64 (including Header->Mask, Header->CompationMask etc), to allow for max code sharing with x64.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The OS kernel does support all SVE vector lengths, up to 2048-bit SVE, though there is the caveat that HyperV only supports 128-bit SVE. So, when running on hardware that supports SVE larger than 128-bits, if HyperV is enabled you'll only see 128-bit SVE, but if HyperV is off then you'll be able to take advantage of the full SVE width supported by the CPU. And to my understanding, there is likely hardware in the future that supports larger SVE lengths than 128-bit, though I don't know any specific on timelines.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the comments. Updated with the following:

  • Added context2.S changes
  • Removed store/restore of Z registers (as we only support 128bits for now, which fully overlap the V registers)
  • Added a XStateFeaturesMask to the Arm64 context so that we can tell whether to use SVE or not.

I'm currently unsure where else SVE state might need saving/restoring

/* +0x318 */ DWORD Vl;
/* +0x32C */ SVE128 Z[32];
/* +0x32C */ WORD P[32];
/* +0x32C */ WORD Ffr;

//
// Debug registers
//

/* +0x318 */ DWORD Bcr[ARM64_MAX_BREAKPOINTS];
/* +0x338 */ DWORD64 Bvr[ARM64_MAX_BREAKPOINTS];
/* +0x378 */ DWORD Wcr[ARM64_MAX_WATCHPOINTS];
/* +0x380 */ DWORD64 Wvr[ARM64_MAX_WATCHPOINTS];
/* +0x390 */
/* +0x? */ DWORD Bcr[ARM64_MAX_BREAKPOINTS];
/* +0x? */ DWORD64 Bvr[ARM64_MAX_BREAKPOINTS];
/* +0x? */ DWORD Wcr[ARM64_MAX_WATCHPOINTS];
/* +0x? */ DWORD64 Wvr[ARM64_MAX_WATCHPOINTS];
/* +0x? */

} CONTEXT, *PCONTEXT, *LPCONTEXT;

Expand Down
33 changes: 5 additions & 28 deletions src/coreclr/pal/src/include/pal/context.h
Original file line number Diff line number Diff line change
Expand Up @@ -662,41 +662,18 @@ const struct fpregs* GetConstNativeSigSimdContext(const native_context_t *mc)
#define MCREG_Pc(mc) ((mc).pc)
#define MCREG_Cpsr(mc) ((mc).pstate)

void _GetNativeSigSimdContext(uint8_t *data, uint32_t size, fpsimd_context **fp_ptr, sve_context **sve_ptr);

inline
fpsimd_context* GetNativeSigSimdContext(native_context_t *mc)
void GetNativeSigSimdContext(native_context_t *mc, fpsimd_context **fp_ptr, sve_context **sve_ptr)
{
size_t size = 0;

do
{
fpsimd_context* fp = reinterpret_cast<fpsimd_context *>(&mc->uc_mcontext.__reserved[size]);

if(fp->head.magic == FPSIMD_MAGIC)
{
_ASSERTE(fp->head.size >= sizeof(fpsimd_context));
_ASSERTE(size + fp->head.size <= sizeof(mc->uc_mcontext.__reserved));

return fp;
}

if (fp->head.size == 0)
{
break;
}

size += fp->head.size;
} while (size + sizeof(fpsimd_context) <= sizeof(mc->uc_mcontext.__reserved));

_ASSERTE(false);

return nullptr;
_GetNativeSigSimdContext(&mc->uc_mcontext.__reserved[0], sizeof(mc->uc_mcontext.__reserved), fp_ptr, sve_ptr);
}

inline
const fpsimd_context* GetConstNativeSigSimdContext(const native_context_t *mc)
void GetConstNativeSigSimdContext(const native_context_t *mc, fpsimd_context const **fp_ptr, sve_context const **sve_ptr)
{
return GetNativeSigSimdContext(const_cast<native_context_t*>(mc));
GetNativeSigSimdContext(const_cast<native_context_t*>(mc), const_cast<fpsimd_context **>(fp_ptr), const_cast<sve_context **>(sve_ptr));
}

#else // TARGET_OSX
Expand Down
139 changes: 137 additions & 2 deletions src/coreclr/pal/src/thread/context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -708,7 +708,9 @@ void CONTEXTToNativeContext(CONST CONTEXT *lpContext, native_context_t *native)
}
}
#else // TARGET_OSX
fpsimd_context* fp = GetNativeSigSimdContext(native);
fpsimd_context* fp = nullptr;
sve_context* sve = nullptr;
GetNativeSigSimdContext(native, &fp, &sve);
if (fp)
{
fp->fpsr = lpContext->Fpsr;
Expand All @@ -718,6 +720,25 @@ void CONTEXTToNativeContext(CONST CONTEXT *lpContext, native_context_t *native)
*(NEON128*) &fp->vregs[i] = lpContext->V[i];
}
}
if (sve)
Copy link
Member

@janvorli janvorli Jun 27, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar to x64, we should copy the state only if the contextFlags has the CONTEXT_XSTATE flag set. The passed in contextFlags list parts of the state that are valid that the caller is interested in.

It seems it would make sense to move this to the end of the function next to where we extract xstate for amd64 and put it under the same if ((contextFlags & CONTEXT_XSTATE) == CONTEXT_XSTATE).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar to x64, we should copy the state only if the contextFlags has the CONTEXT_XSTATE flag set.

There is one remaining test failure I've just debugged to being due to this. Will fix it up.

{
//TODO-SVE: This only handles vector lengths of 128bits.

uint16_t vq = sve_vq_from_vl(lpContext->Vl);

sve->vl = lpContext->Vl;

//Note: Size of ffr register is SVE_SIG_FFR_SIZE(vq) bytes.
*(WORD*) (((uint8_t*)sve) + SVE_SIG_FFR_OFFSET(vq)) = lpContext->Ffr;

for (int i = 0; i < 32; i++)
{
//Note: Size of a Z register is SVE_SIG_ZREGS_SIZE(vq) bytes.
*(SVE128*) (((uint8_t*)sve) + SVE_SIG_ZREG_OFFSET(vq, i)) = lpContext->Z[i];
//Note: Size of a P register is SVE_SIG_PREGS_SIZE(vq) bytes.
*(WORD*) (((uint8_t*)sve) + SVE_SIG_PREG_OFFSET(vq, i)) = lpContext->P[i];
}
}
#endif // TARGET_OSX
#elif defined(HOST_ARM)
VfpSigFrame* fp = GetNativeSigSimdContext(native);
Expand Down Expand Up @@ -805,6 +826,99 @@ void CONTEXTToNativeContext(CONST CONTEXT *lpContext, native_context_t *native)
#endif //HOST_AMD64 && XSTATE_SUPPORTED
}

#if defined(HOST_64BIT) && defined(HOST_ARM64) && !defined(TARGET_FREEBSD) && !defined(TARGET_OSX)
/*++
Function :
_GetNativeSigSimdContext

Finds the FP and SVE context from the reserved data section of a native context.

Parameters :
uint8_t *data : native context reserved data.
uint32_t size : size of the reserved data.
fpsimd_context **fp_ptr : returns a pointer to the FP context.
sve_context **sve_ptr : returns a pointer to the SVE context.

Return value :
None.

--*/
void _GetNativeSigSimdContext(uint8_t *data, uint32_t size, fpsimd_context **fp_ptr, sve_context **sve_ptr)
{
size_t position = 0;
fpsimd_context *fp = nullptr;
sve_context *sve = nullptr;
extra_context *extra = nullptr;
bool done = false;

while (!done)
{
_aarch64_ctx *ctx = reinterpret_cast<_aarch64_ctx *>(&data[position]);

_ASSERTE(position + ctx->size <= size);


switch (ctx->magic)
{
case FPSIMD_MAGIC:
_ASSERTE(fp == nullptr);
_ASSERTE(ctx->size >= sizeof(fpsimd_context));
fp = reinterpret_cast<fpsimd_context *>(&data[position]);
break;

case SVE_MAGIC:
_ASSERTE(sve == nullptr);
_ASSERTE(ctx->size >= sizeof(sve_context));
sve = reinterpret_cast<sve_context *>(&data[position]);
break;

case EXTRA_MAGIC:
{
// Points to an additional section of reserved data.
_ASSERTE(extra == nullptr);
_ASSERTE(ctx->size >= sizeof(extra_context));
fpsimd_context *fpOrig = fp;
sve_context *sveOrig = sve;

extra = reinterpret_cast<extra_context *>(&data[position]);
_GetNativeSigSimdContext((uint8_t*)extra->datap, extra->size, &fp, &sve);

// There should only be one block of each type.
_ASSERTE(fpOrig == nullptr || fp == fpOrig);
_ASSERTE(sveOrig == nullptr || sve == sveOrig);
break;
}

case 0:
_ASSERTE(ctx->size == 0);
done = true;
break;

default:
// Any other section.
_ASSERTE(ctx->size != 0);
break;
}

position += ctx->size;
}

if (fp)
{
*fp_ptr = fp;
}
if (sve)
{
// If this ever fires then we have an SVE context but no FP context. Given that V and Z
// registers overlap, then when propagating this data to other structures, the SVE
// context should be used to fill the FP data.
_ASSERTE(fp != nullptr);

*sve_ptr = sve;
}
}
#endif // HOST_64BIT && HOST_ARM64 && !TARGET_FREEBSD && !TARGET_OSX

/*++
Function :
CONTEXTFromNativeContext
Expand Down Expand Up @@ -917,7 +1031,9 @@ void CONTEXTFromNativeContext(const native_context_t *native, LPCONTEXT lpContex
}
}
#else // TARGET_OSX
const fpsimd_context* fp = GetConstNativeSigSimdContext(native);
const fpsimd_context* fp = nullptr;
const sve_context* sve = nullptr;
GetConstNativeSigSimdContext(native, &fp, &sve);
if (fp)
{
lpContext->Fpsr = fp->fpsr;
Expand All @@ -927,6 +1043,25 @@ void CONTEXTFromNativeContext(const native_context_t *native, LPCONTEXT lpContex
lpContext->V[i] = *(NEON128*) &fp->vregs[i];
}
}
if (sve)
{
//TODO-SVE: This only handles vector lengths of 128bits.

uint16_t vq = sve_vq_from_vl(sve->vl);

lpContext->Vl = sve->vl;

//Note: Size of ffr register is SVE_SIG_FFR_SIZE(vq) bytes.
lpContext->Ffr = *(WORD*) (((uint8_t*)sve) + SVE_SIG_FFR_OFFSET(vq));

for (int i = 0; i < 32; i++)
{
//Note: Size of a Z register is SVE_SIG_ZREGS_SIZE(vq) bytes.
lpContext->Z[i] = *(SVE128*) (((uint8_t*)sve) + SVE_SIG_ZREG_OFFSET(vq, i));
//Note: Size of a P register is SVE_SIG_PREGS_SIZE(vq) bytes.
lpContext->P[i] = *(WORD*) (((uint8_t*)sve) + SVE_SIG_PREG_OFFSET(vq, i));
}
}
#endif // TARGET_OSX
#elif defined(HOST_ARM)
const VfpSigFrame* fp = GetConstNativeSigSimdContext(native);
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/vm/arm64/asmconstants.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ ASMCONSTANTS_C_ASSERT(SIZEOF__GSCookie == sizeof(GSCookie));
#define SIZEOF__Frame 0x10
ASMCONSTANTS_C_ASSERT(SIZEOF__Frame == sizeof(Frame));

#define SIZEOF__CONTEXT 0x390
#define SIZEOF__CONTEXT 0x5e0
ASMCONSTANTS_C_ASSERT(SIZEOF__CONTEXT == sizeof(T_CONTEXT));


Expand Down
Loading