Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -171,17 +171,8 @@ internal unsafe ref struct AsyncDispatcherInfo
public static partial class AsyncHelpers
{
#if FEATURE_INTERPRETER
[LibraryImport(RuntimeHelpers.QCall, EntryPoint = "AsyncHelpers_ResumeInterpreterContinuation")]
[StackTraceHidden]
private static partial void AsyncHelpers_ResumeInterpreterContinuation(ObjectHandleOnStack cont, ref byte resultStorage);

[StackTraceHidden]
internal static Continuation? ResumeInterpreterContinuation(Continuation cont, ref byte resultStorage)
{
ObjectHandleOnStack contHandle = ObjectHandleOnStack.Create(ref cont);
AsyncHelpers_ResumeInterpreterContinuation(contHandle, ref resultStorage);
return cont;
}
[MethodImpl(MethodImplOptions.InternalCall)]
internal static extern Continuation? ResumeInterpreterContinuation(Continuation cont, ref byte resultStorage);
#endif

// This is the "magic" method on which other "Await" methods are built.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,18 +58,8 @@ class AsmOffsets

#if TARGET_64BIT
public const int OFFSETOF__REGDISPLAY__m_pCurrentContext = 0x8;
#if FEATURE_INTERPRETER
#if TARGET_AMD64 && !TARGET_UNIX
public const int SIZEOF__StackFrameIterator = 0x178;
public const int OFFSETOF__StackFrameIterator__m_AdjustedControlPC = 0x170;
#else
public const int SIZEOF__StackFrameIterator = 0x170;
public const int OFFSETOF__StackFrameIterator__m_AdjustedControlPC = 0x168;
#endif
#else
public const int SIZEOF__StackFrameIterator = 0x150;
public const int OFFSETOF__StackFrameIterator__m_AdjustedControlPC = 0x148;
#endif
public const int OFFSETOF__StackFrameIterator__m_isRuntimeWrappedExceptions = 0x132;
#elif TARGET_X86
public const int OFFSETOF__REGDISPLAY__m_pCurrentContext = 0x4;
Expand All @@ -78,13 +68,8 @@ class AsmOffsets
public const int OFFSETOF__StackFrameIterator__m_AdjustedControlPC = 0x3cc;
#else // TARGET_64BIT
public const int OFFSETOF__REGDISPLAY__m_pCurrentContext = 0x4;
#if FEATURE_INTERPRETER
public const int SIZEOF__StackFrameIterator = 0xd8;
public const int OFFSETOF__StackFrameIterator__m_AdjustedControlPC = 0xd4;
#else
public const int SIZEOF__StackFrameIterator = 0xc8;
public const int OFFSETOF__StackFrameIterator__m_AdjustedControlPC = 0xc4;
#endif
public const int OFFSETOF__StackFrameIterator__m_isRuntimeWrappedExceptions = 0xba;
#endif // TARGET_64BIT

Expand Down Expand Up @@ -134,18 +119,8 @@ class AsmOffsets

#if TARGET_64BIT
public const int OFFSETOF__REGDISPLAY__m_pCurrentContext = 0x8;
#if FEATURE_INTERPRETER
#if TARGET_UNIX
public const int SIZEOF__StackFrameIterator = 0x168;
public const int OFFSETOF__StackFrameIterator__m_AdjustedControlPC = 0x160;
#else // TARGET_UNIX
public const int SIZEOF__StackFrameIterator = 0x170;
public const int OFFSETOF__StackFrameIterator__m_AdjustedControlPC = 0x168;
#endif // TARGET_UNIX
#else
public const int SIZEOF__StackFrameIterator = 0x148;
public const int OFFSETOF__StackFrameIterator__m_AdjustedControlPC = 0x140;
#endif
public const int OFFSETOF__StackFrameIterator__m_isRuntimeWrappedExceptions = 0x12a;
#elif TARGET_X86
public const int OFFSETOF__REGDISPLAY__m_pCurrentContext = 0x4;
Expand All @@ -154,13 +129,8 @@ class AsmOffsets
public const int OFFSETOF__StackFrameIterator__m_AdjustedControlPC = 0x3c4;
#else // TARGET_64BIT
public const int OFFSETOF__REGDISPLAY__m_pCurrentContext = 0x4;
#if FEATURE_INTERPRETER
public const int SIZEOF__StackFrameIterator = 0xd0;
public const int OFFSETOF__StackFrameIterator__m_AdjustedControlPC = 0xcc;
#else
public const int SIZEOF__StackFrameIterator = 0xc0;
public const int OFFSETOF__StackFrameIterator__m_AdjustedControlPC = 0xbc;
#endif
public const int OFFSETOF__StackFrameIterator__m_isRuntimeWrappedExceptions = 0xb2;
#endif // TARGET_64BIT

Expand Down
35 changes: 33 additions & 2 deletions src/coreclr/pal/inc/unixasmmacrosarm.inc
Original file line number Diff line number Diff line change
Expand Up @@ -152,10 +152,35 @@ C_FUNC(\Name):
// d2
// d1
// d0 <- __PWTB_FloatArgumentRegisters
.macro PROLOG_WITH_TRANSITION_BLOCK extraLocals = 0, saveFpArgs = 1, pushArgRegs = 0
// Optional: Callee saved floating point registers (if pushCalleeSavedFloatRegs=1)
// d15
// d14
// d13
// d12
// d11
// d10
// d9
// d8 <- __PWTB_FloatCalleeSavedRegisters
//
// pushCalleeSavedFloatRegs - Optional parameter. If set to 1, the macro will also save
// the callee-saved floating point registers (d8-d15) to the stack.
// These registers are NOT restored by the EPILOG_WITH_TRANSITION_BLOCK variants.
//
.macro PROLOG_WITH_TRANSITION_BLOCK extraLocals = 0, saveFpArgs = 1, pushArgRegs = 0, pushCalleeSavedFloatRegs = 0

__PWTB_FloatArgumentRegisters = \extraLocals
__PWTB_FloatCalleeSavedRegisters = \extraLocals
__PWTB_SaveFPArgs = \saveFpArgs
__PWTB_PushCalleeSavedFloatRegs = \pushCalleeSavedFloatRegs

// If pushCalleeSavedFloatRegs is specified, reserve space for d8-d15 (8 registers * 8 bytes = 64 bytes)
.if (__PWTB_PushCalleeSavedFloatRegs == 1)
.if ((__PWTB_FloatCalleeSavedRegisters % 8) != 0)
__PWTB_FloatCalleeSavedRegisters = __PWTB_FloatCalleeSavedRegisters + 4
.endif
__PWTB_FloatArgumentRegisters = __PWTB_FloatCalleeSavedRegisters + 64
.else
__PWTB_FloatArgumentRegisters = \extraLocals
.endif

.if (__PWTB_SaveFPArgs == 1)
.if ((__PWTB_FloatArgumentRegisters % 8) != 0)
Expand Down Expand Up @@ -189,6 +214,12 @@ C_FUNC(\Name):
vstm r6, {d0-d7}
.endif

// Save callee-saved floating point registers if requested
.if (__PWTB_PushCalleeSavedFloatRegs == 1)
add r6, sp, #(__PWTB_FloatCalleeSavedRegisters)
vstm r6, {d8-d15}
.endif

CHECK_STACK_ALIGNMENT

END_PROLOGUE
Expand Down
40 changes: 35 additions & 5 deletions src/coreclr/pal/inc/unixasmmacrosarm64.inc
Original file line number Diff line number Diff line change
Expand Up @@ -198,13 +198,35 @@ C_FUNC(\Name\()_End):
// FloatRegisters::q2
// FloatRegisters::q1
// FloatRegisters::q0
.macro PROLOG_WITH_TRANSITION_BLOCK extraLocals = 0, SaveFPArgs = 1, SaveGPArgs = 1
// Optional: Callee saved floating point registers (if pushCalleeSavedFloatRegs=1)
// d15
// d14
// d13
// d12
// d11
// d10
// d9
// d8
//
// pushCalleeSavedFloatRegs - Optional parameter. If set to 1, the macro will also save
// the callee-saved floating point registers (d8-d15) to the stack.
// These registers are NOT restored by the EPILOG_WITH_TRANSITION_BLOCK variants.
//
.macro PROLOG_WITH_TRANSITION_BLOCK extraLocals = 0, SaveFPArgs = 1, SaveGPArgs = 1, pushCalleeSavedFloatRegs = 0

__PWTB_FloatArgumentRegisters = \extraLocals
__PWTB_FloatCalleeSavedRegisters = \extraLocals
__PWTB_SaveFPArgs = \SaveFPArgs
__PWTB_PushCalleeSavedFloatRegs = \pushCalleeSavedFloatRegs

.if ((__PWTB_FloatCalleeSavedRegisters % 16) != 0)
__PWTB_FloatCalleeSavedRegisters = __PWTB_FloatCalleeSavedRegisters + 8
.endif

.if ((__PWTB_FloatArgumentRegisters % 16) != 0)
__PWTB_FloatArgumentRegisters = __PWTB_FloatArgumentRegisters + 8
// If pushCalleeSavedFloatRegs is specified, reserve space for d8-d15 (8 registers * 8 bytes = 64 bytes)
.if (__PWTB_PushCalleeSavedFloatRegs == 1)
__PWTB_FloatArgumentRegisters = __PWTB_FloatCalleeSavedRegisters + 64
.else
__PWTB_FloatArgumentRegisters = __PWTB_FloatCalleeSavedRegisters
.endif

__PWTB_TransitionBlock = __PWTB_FloatArgumentRegisters
Expand Down Expand Up @@ -234,7 +256,15 @@ C_FUNC(\Name\()_End):
.endif

.if (__PWTB_SaveFPArgs == 1)
SAVE_FLOAT_ARGUMENT_REGISTERS sp, \extraLocals
SAVE_FLOAT_ARGUMENT_REGISTERS sp, __PWTB_FloatArgumentRegisters
.endif

// Save callee-saved floating point registers if requested
.if (__PWTB_PushCalleeSavedFloatRegs == 1)
PROLOG_SAVE_REG_PAIR d8, d9, __PWTB_FloatCalleeSavedRegisters
PROLOG_SAVE_REG_PAIR d10, d11, __PWTB_FloatCalleeSavedRegisters + 16
PROLOG_SAVE_REG_PAIR d12, d13, __PWTB_FloatCalleeSavedRegisters + 32
PROLOG_SAVE_REG_PAIR d14, d15, __PWTB_FloatCalleeSavedRegisters + 48
.endif

.endm
Expand Down
36 changes: 32 additions & 4 deletions src/coreclr/pal/inc/unixasmmacrosriscv64.inc
Original file line number Diff line number Diff line change
Expand Up @@ -260,14 +260,26 @@ C_FUNC(\Name):
// FPR_f8 / fs0
// Extra:
//
.macro PROLOG_WITH_TRANSITION_BLOCK extraParameters = 0, extraLocals = 0, SaveFPRegs = 1
// pushCalleeSavedFloatRegs - Optional parameter. If set to 1, the macro will also save
// the callee-saved floating point registers (fs0-fs11) to the stack.
// These registers are NOT restored by the EPILOG_WITH_TRANSITION_BLOCK variants.
//
.macro PROLOG_WITH_TRANSITION_BLOCK extraParameters = 0, extraLocals = 0, SaveFPRegs = 1, pushCalleeSavedFloatRegs = 0
__PWTB_SaveFPArgs = \SaveFPRegs
__PWTB_PushCalleeSavedFloatRegs = \pushCalleeSavedFloatRegs

__PWTB_FloatArgumentRegisters = \extraLocals
__PWTB_FloatCalleeSavedRegisters = \extraLocals

// Note, stack (see __PWTB_StackAlloc variable) must be 16 byte aligned.
.if ((__PWTB_FloatArgumentRegisters % 16) != 0)
__PWTB_FloatArgumentRegisters = __PWTB_FloatArgumentRegisters + 8
.if ((__PWTB_FloatCalleeSavedRegisters % 16) != 0)
__PWTB_FloatCalleeSavedRegisters = __PWTB_FloatCalleeSavedRegisters + 8
.endif

// If pushCalleeSavedFloatRegs is specified, reserve space for fs0-fs11 (12 registers * 8 bytes = 96 bytes)
.if (__PWTB_PushCalleeSavedFloatRegs == 1)
__PWTB_FloatArgumentRegisters = __PWTB_FloatCalleeSavedRegisters + 96
.else
__PWTB_FloatArgumentRegisters = __PWTB_FloatCalleeSavedRegisters
.endif

__PWTB_TransitionBlock = __PWTB_FloatArgumentRegisters
Expand Down Expand Up @@ -296,6 +308,22 @@ C_FUNC(\Name):
SAVE_FLOAT_ARGUMENT_REGISTERS sp, __PWTB_FloatArgumentRegisters
.endif

// Save callee-saved floating point registers if requested (fs0-fs11)
.if (__PWTB_PushCalleeSavedFloatRegs == 1)
fsd fs0, (__PWTB_FloatCalleeSavedRegisters)(sp)
fsd fs1, (__PWTB_FloatCalleeSavedRegisters + 8)(sp)
fsd fs2, (__PWTB_FloatCalleeSavedRegisters + 16)(sp)
fsd fs3, (__PWTB_FloatCalleeSavedRegisters + 24)(sp)
fsd fs4, (__PWTB_FloatCalleeSavedRegisters + 32)(sp)
fsd fs5, (__PWTB_FloatCalleeSavedRegisters + 40)(sp)
fsd fs6, (__PWTB_FloatCalleeSavedRegisters + 48)(sp)
fsd fs7, (__PWTB_FloatCalleeSavedRegisters + 56)(sp)
fsd fs8, (__PWTB_FloatCalleeSavedRegisters + 64)(sp)
fsd fs9, (__PWTB_FloatCalleeSavedRegisters + 72)(sp)
fsd fs10, (__PWTB_FloatCalleeSavedRegisters + 80)(sp)
fsd fs11, (__PWTB_FloatCalleeSavedRegisters + 88)(sp)
.endif

.endm

.macro EPILOG_WITH_TRANSITION_BLOCK_RETURN
Expand Down
53 changes: 52 additions & 1 deletion src/coreclr/vm/amd64/AsmHelpers.asm
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ extern IL_Rethrow_Impl:proc
ifdef FEATURE_INTERPRETER
extern ExecuteInterpretedMethod:proc
extern GetInterpThreadContextWithPossiblyMissingThreadOrCallStub:proc
extern CallInterpreterFuncletWorker:proc
endif

extern g_pPollGC:QWORD
Expand Down Expand Up @@ -559,7 +560,7 @@ ifdef FEATURE_INTERPRETER

NESTED_ENTRY InterpreterStub, _TEXT

PROLOG_WITH_TRANSITION_BLOCK
PROLOG_WITH_TRANSITION_BLOCK 0, <PushCalleeSavedFloatRegs>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My understanding is that, instead of using libunwind to obtain the set of callee saved registers at this interpreter entry location, we store them explicitly each time into the transition block. I'm trying to understand the benefits: do we have a strong motivation to do this, given that it would make interpreter entry even more expensive?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've tried to describe the motivations in the PR description. But let me share more details:

  • There is an ongoing effort to get rid of libunwind dependency, @am11 has been making changes towards that goal. If we don't capture callee saved floats in the transition block, then it seems the only other way to restore them properly is to unwind from a known native context.
  • The debugger single stepping machinery for interpreted code needs to start in an interpreter context. But it unwinds the stack from there in some cases, and if it leaves the first interpreted frame under an InterpreterFrame, it needs to move to the managed or native context of the caller of that interpreted frame. In non-debugger scenarios, it uses the saved SP, FP, IP and the first argument register to restore the context to the native context inside of the InterpExecMethod and then unwinds it until it finds the caller mentioned above. These registers are saved in the context by the stack frame iterator when it hits the InterpreterFrame for the first time, before it overwrites them with the interpreter context. But that doesn't happen in the debugger single stepping scenario, when we start the stack frame iterator in the middle of the interpreted code. I guess we could solve this in a different way if we had to.

The main motivation was the first one; the second one just adds further weight to it.
I am not worried about adding extra cost at the funclet entry, which is only called in exceptional cases. For AOT to interpreter transitions, my hope is that pushing the extra floating point callee saved registers won't have a visible perf effect given all the other machinery and cost of interpreted code execution. Since you have now done some perf benchmarking, it seems we have a baseline to see whether it has any visible effect or not.

A minor extra motivation is that it makes the stack frame iterator code cleaner, but that was more of a nice outcome than anything else.

Copy link
Member

@BrzVlad BrzVlad Feb 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the explanation. I was wondering mainly about the main motivation behind getting rid of libunwind. (And whether we want to do this at the expense of minor potential regressions)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

libunwind’s unw_step interprets DWARF CFI data to walk stack frames. That interpretation step is inherently more expensive than adjusting our own prologue/epilogue metadata and performing direct register updates for predictable, deterministic transitions, where a lightweight unwinding strategy is sufficient.

From a maintenance perspective, porting libunwind to new platforms is typically non-trivial and requires us to carry and service the external codebase (src/native/external/libunwind). Reducing this dependency would simplify our platform surface area.

Dependency footprint in terms of PAL_VirtualUnwind (which is the entrypoint to unw_step) call sites:

$ git grep 'PAL_VirtualUnwind' dotnet/release/8.0 -- src/coreclr | wc -l
      41
$ git grep 'PAL_VirtualUnwind' dotnet/release/9.0 -- src/coreclr | wc -l
      42
$ git grep 'PAL_VirtualUnwind' dotnet/release/10.0 -- src/coreclr | wc -l
      25
$ git grep 'PAL_VirtualUnwind' dotnet/main -- src/coreclr | wc -l
      25

We may not be able to eliminate all usages, but we can continue reducing the dependency and evaluate alternative strategies for the rest, wherever feasible.

Runtimes that control their code generation (Go, Java, ...) typically also control their unwinding strategy and do not rely on general OS unwinding for managed frames. At native–managed boundaries, behavior is usually constrained, and cross-boundary unwinding is either disallowed or treated as a fatal condition.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even if unwinding via libunwind is slower, it seems we would want to prioritize the normal execution path rather than the exceptional flow. If we have the option of using it on some platforms, to me it seems like we should keep this option open. Maybe we could keep the possibility of using it to populate the full register context only for a subset of frame types, like the interpreter transitions. Of course, these are only some thoughts from a high-level perspective; for what it's worth, this might not necessarily be feasible.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

> There is an ongoing effort to get rid of libunwind dependency, @am11 has been making changes towards that goal. If we don't capture callee saved floats in the transition block, then it seems the only other way to restore them properly is to unwind from a known native context.

The other pattern that can be used to get rid of libunwind dependencies is to try/catch the C++ exception in C++ and return non-exceptionally to the caller. #123482 is proposing that we do that for QCalls. You pay the cost of C++ try/catch and error return check vs. the cost of arch-specific asm helpers that must save a bunch of registers. (I understand that this alternative may not be suitable in this case.)

> For AOT to interpreter transitions, my hope is that pushing the extra floating point callee saved registers won't have visible perf effect

Some perf numbers would be nice.


__InterpreterStubArgumentRegistersOffset = __PWTB_ArgumentRegisters
; IR bytecode address
Expand Down Expand Up @@ -1286,6 +1287,56 @@ END_PROLOGUE
ret
NESTED_END CallJittedMethodRetU2, _TEXT

;==========================================================================
; Create a real TransitionBlock and call CallInterpreterFuncletWorker
; to execute an interpreter funclet (catch/finally/filter handler).
;
; extern "C" DWORD_PTR CallInterpreterFunclet(
; OBJECTREF throwable, // rcx
; void* pHandler, // rdx
; REGDISPLAY *pRD, // r8
; ExInfo *pExInfo, // r9
; bool isFilter // [rsp+28h]
; );
;==========================================================================
extern CallInterpreterFuncletWorker:proc

; Thunk that forwards its five incoming arguments to CallInterpreterFuncletWorker
; and appends a pointer to a freshly built TransitionBlock as a sixth argument.
NESTED_ENTRY CallInterpreterFunclet, _TEXT

; Build the transition block frame. The 16 bytes of extra locals are laid out
; directly above the outgoing argument homes and serve as the two outgoing
; stack-argument slots written below ([rsp+20h] and [rsp+28h], assuming the
; outgoing argument homes are 20h bytes - TODO confirm against
; SIZEOF_MAX_OUTGOING_ARGUMENT_HOMES). <PushCalleeSavedFloatRegs> makes the
; prolog also store xmm6-xmm15 into the frame.
PROLOG_WITH_TRANSITION_BLOCK 16, <PushCalleeSavedFloatRegs>

; Pass TransitionBlock* as last (6th) argument on stack
; Worker signature: CallInterpreterFuncletWorker(throwable, pHandler, pRD, pExInfo, isFilter, TransitionBlock*)
; Original args: rcx=throwable, rdx=pHandler, r8=pRD, r9=pExInfo, [rsp+__PWTB_ArgumentRegisters+20h]=isFilter

; Move isFilter to 5th param slot
; NOTE(review): a full qword is copied although isFilter is declared bool;
; presumably the worker only reads the low byte - confirm.
mov rax, [rsp + __PWTB_ArgumentRegisters + 20h] ; isFilter (5th param from original caller)
mov [rsp + 20h], rax ; pass isFilter as 5th param on stack

; Put TransitionBlock* as 6th param on stack
lea rax, [rsp + __PWTB_TransitionBlock]
mov [rsp + 28h], rax ; TransitionBlock* as 6th param

; rcx, rdx, r8, r9 remain unchanged (throwable, pHandler, pRD, pExInfo)

call CallInterpreterFuncletWorker

; rax is not written between the call and the epilog here, so presumably the
; worker's DWORD_PTR result is returned to the caller as-is - confirm against
; the epilog macro.
EPILOG_WITH_TRANSITION_BLOCK_RETURN

NESTED_END CallInterpreterFunclet, _TEXT

extern AsyncHelpers_ResumeInterpreterContinuationWorker:proc

; Thunk that builds a TransitionBlock and calls
; AsyncHelpers_ResumeInterpreterContinuationWorker with a pointer to it in r8.
; rcx and rdx are not written by the instructions in this body, so presumably
; the worker receives the caller's first two arguments unchanged in them -
; confirm that the prolog macro does not clobber them.
NESTED_ENTRY AsyncHelpers_ResumeInterpreterContinuation, _TEXT
; No extra locals; <PushCalleeSavedFloatRegs> makes the prolog also store
; xmm6-xmm15 into the frame.
PROLOG_WITH_TRANSITION_BLOCK 0, <PushCalleeSavedFloatRegs>

; Third argument: address of the TransitionBlock just built on the stack.
lea r8, [rsp + __PWTB_TransitionBlock]
call AsyncHelpers_ResumeInterpreterContinuationWorker

; rax is not written after the call, so presumably whatever the worker returns
; is passed back to the caller - confirm against the epilog macro.
EPILOG_WITH_TRANSITION_BLOCK_RETURN

NESTED_END AsyncHelpers_ResumeInterpreterContinuation, _TEXT

endif ; FEATURE_INTERPRETER

;==========================================================================
Expand Down
27 changes: 25 additions & 2 deletions src/coreclr/vm/amd64/AsmMacros.inc
Original file line number Diff line number Diff line change
Expand Up @@ -355,20 +355,30 @@ RESTORE_FLOAT_ARGUMENT_REGISTERS macro ofs
; xmm2
; xmm1
; xmm0 <- __PWTB_FloatArgumentRegisters
; Optional: Callee saved floating point registers
; xmm15
; .
; .
; xmm6
; extra locals + padding to qword align
; callee's r9
; callee's r8
; callee's rdx
; callee's rcx

PROLOG_WITH_TRANSITION_BLOCK macro extraLocals := <0>, stackAllocOnEntry := <0>, stackAllocSpill1, stackAllocSpill2, stackAllocSpill3
PROLOG_WITH_TRANSITION_BLOCK macro extraLocals := <0>, calleeSavedFloatRegs := <DoNotPushCalleeSavedFloatRegs>, stackAllocOnEntry := <0>, stackAllocSpill1, stackAllocSpill2, stackAllocSpill3

__PWTB_FloatArgumentRegisters = SIZEOF_MAX_OUTGOING_ARGUMENT_HOMES + extraLocals

if (__PWTB_FloatArgumentRegisters mod 16) ne 0
__PWTB_FloatArgumentRegisters = __PWTB_FloatArgumentRegisters + 8
endif

ifidn <calleeSavedFloatRegs>, <PushCalleeSavedFloatRegs>
__PWTB_FloatCalleeSavedRegisters = __PWTB_FloatArgumentRegisters
__PWTB_FloatArgumentRegisters = __PWTB_FloatArgumentRegisters + 10 * 16
endif

__PWTB_StackAlloc = __PWTB_FloatArgumentRegisters + 4 * 16 + 8
__PWTB_TransitionBlock = __PWTB_StackAlloc
__PWTB_ArgumentRegisters = __PWTB_StackAlloc + 9 * 8
Expand Down Expand Up @@ -403,6 +413,19 @@ PROLOG_WITH_TRANSITION_BLOCK macro extraLocals := <0>, stackAllocOnEntry := <0>,
SAVE_ARGUMENT_REGISTERS __PWTB_ArgumentRegisters
SAVE_FLOAT_ARGUMENT_REGISTERS __PWTB_FloatArgumentRegisters

ifidn <calleeSavedFloatRegs>, <PushCalleeSavedFloatRegs>
movdqa [rsp + __PWTB_FloatCalleeSavedRegisters], xmm6
movdqa [rsp + __PWTB_FloatCalleeSavedRegisters + 10h], xmm7
movdqa [rsp + __PWTB_FloatCalleeSavedRegisters + 20h], xmm8
movdqa [rsp + __PWTB_FloatCalleeSavedRegisters + 30h], xmm9
movdqa [rsp + __PWTB_FloatCalleeSavedRegisters + 40h], xmm10
movdqa [rsp + __PWTB_FloatCalleeSavedRegisters + 50h], xmm11
movdqa [rsp + __PWTB_FloatCalleeSavedRegisters + 60h], xmm12
movdqa [rsp + __PWTB_FloatCalleeSavedRegisters + 70h], xmm13
movdqa [rsp + __PWTB_FloatCalleeSavedRegisters + 80h], xmm14
movdqa [rsp + __PWTB_FloatCalleeSavedRegisters + 90h], xmm15
endif

if stackAllocOnEntry ge 3*8
mov stackAllocSpill3, [rsp + __PWTB_StackAlloc + 28h]
save_reg_postrsp r13, __PWTB_StackAlloc + 28h
Expand Down Expand Up @@ -490,9 +513,9 @@ POP_COOP_PINVOKE_FRAME macro
; need to capture the complete register state including FP callee-saved registers.
;
; Stack layout (from high to low address after prologue):
; Outgoing argument homes (32 bytes)
; Return address (m_ReturnAddress)
; CalleeSavedRegisters (r15, r14, r13, r12, rbp, rbx, rsi, rdi - 64 bytes) <- TransitionBlock starts here
; Outgoing argument homes (32 bytes)
; FloatArgumentRegisters (xmm0-xmm3, 64 bytes)
; FP Callee-saved registers (xmm6-xmm15, 160 bytes)
; Shadow space for call (32 bytes)
Expand Down
Loading
Loading