From fbfc3fbc19ee0abcd745a3c68b89c395eba7ca13 Mon Sep 17 00:00:00 2001
From: Jan Kotas <jkotas@microsoft.com>
Date: Mon, 19 May 2025 15:07:20 -0700
Subject: [PATCH 1/2] Move vzeroupper emit back to JIT

vzeroupper is AVX instruction and so it cannot be executed unconditionally in static asm helpers

Fixes #115672
---
 src/coreclr/jit/codegen.h                     |  8 +-
 src/coreclr/jit/codegencommon.cpp             |  4 +-
 src/coreclr/jit/codegenxarch.cpp              | 96 +++++++++++--------
 .../Runtime/amd64/ExceptionHandling.S         |  6 --
 .../Runtime/amd64/ExceptionHandling.asm       |  6 --
 src/coreclr/vm/amd64/AsmHelpers.asm           |  6 --
 src/coreclr/vm/amd64/asmhelpers.S             |  6 --
 7 files changed, 65 insertions(+), 67 deletions(-)
diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h
index 25eb7a0c25fbfc..53567b8e3648af 100644
--- a/src/coreclr/jit/codegen.h
+++ b/src/coreclr/jit/codegen.h
@@ -515,8 +515,12 @@ class CodeGen final : public CodeGenInterface
 #if defined(TARGET_XARCH)
 
     // Save/Restore callee saved float regs to stack
-    void genPreserveCalleeSavedFltRegs(unsigned lclFrameSize);
-    void genRestoreCalleeSavedFltRegs(unsigned lclFrameSize);
+    void genPreserveCalleeSavedFltRegs();
+    void genRestoreCalleeSavedFltRegs();
+
+    // Generate vzeroupper instruction to clear AVX state if necessary
+    void genClearAvxStateInProlog();
+    void genClearAvxStateInEpilog();
 
 #endif // TARGET_XARCH
 
diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp
index 4496e006e9b541..a56f8edcc9e391 100644
--- a/src/coreclr/jit/codegencommon.cpp
+++ b/src/coreclr/jit/codegencommon.cpp
@@ -5285,8 +5285,10 @@ void CodeGen::genFnProlog()
 #endif // TARGET_ARMARCH
 
 #if defined(TARGET_XARCH)
+    genClearAvxStateInProlog();
+
     // Preserve callee saved float regs to stack.
-    genPreserveCalleeSavedFltRegs(compiler->compLclFrameSize);
+    genPreserveCalleeSavedFltRegs();
 #endif // defined(TARGET_XARCH)
 
 #ifdef TARGET_AMD64
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 01f639dc7ddc14..0283b780b1496c 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -10452,8 +10452,10 @@ void CodeGen::genFnEpilog(BasicBlock* block)
     }
 #endif
 
+    genClearAvxStateInEpilog();
+
     // Restore float registers that were saved to stack before SP is modified.
-    genRestoreCalleeSavedFltRegs(compiler->compLclFrameSize);
+    genRestoreCalleeSavedFltRegs();
 
 #ifdef JIT32_GCENCODER
     // When using the JIT32 GC encoder, we do not start the OS-reported portion of the epilog until after
@@ -10913,6 +10915,8 @@ void CodeGen::genFuncletProlog(BasicBlock* block)
 
     // This is the end of the OS-reported prolog for purposes of unwinding
     compiler->unwindEndProlog();
+
+    genClearAvxStateInProlog();
 }
 
 /*****************************************************************************
@@ -10933,6 +10937,8 @@ void CodeGen::genFuncletEpilog()
 
     ScopedSetVariable<bool> _setGeneratingEpilog(&compiler->compGeneratingEpilog, true);
 
+    genClearAvxStateInEpilog();
+
     inst_RV_IV(INS_add, REG_SPBASE, genFuncletInfo.fiSpDelta, EA_PTRSIZE);
     instGen_Return(0);
 }
@@ -11030,6 +11036,8 @@ void CodeGen::genFuncletProlog(BasicBlock* block)
     // Add a padding for 16-byte alignment
     inst_RV_IV(INS_sub, REG_SPBASE, 12, EA_PTRSIZE);
 #endif
+
+    genClearAvxStateInProlog();
 }
 
 /*****************************************************************************
@@ -11048,6 +11056,8 @@ void CodeGen::genFuncletEpilog()
 
     ScopedSetVariable<bool> _setGeneratingEpilog(&compiler->compGeneratingEpilog, true);
 
+    genClearAvxStateInEpilog();
+
 #ifdef UNIX_X86_ABI
     // Revert a padding that was added for 16-byte alignment
     inst_RV_IV(INS_add, REG_SPBASE, 12, EA_PTRSIZE);
@@ -11337,40 +11347,21 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu
 // Save compCalleeFPRegsPushed with the smallest register number saved at [RSP+offset], working
 // down the stack to the largest register number stored at [RSP+offset-(genCountBits(regMask)-1)*XMM_REG_SIZE]
 // Here offset = 16-byte aligned offset after pushing integer registers.
-//
-// Params
-//   lclFrameSize - Fixed frame size excluding callee pushed int regs.
-//             non-funclet: this will be compLclFrameSize.
-//             funclet frames: this will be FuncletInfo.fiSpDelta.
-void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
+void CodeGen::genPreserveCalleeSavedFltRegs()
 {
     regMaskTP regMask = compiler->compCalleeFPRegsSavedMask;
 
     // Only callee saved floating point registers should be in regMask
     assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask);
 
-    if (GetEmitter()->ContainsCallNeedingVzeroupper() && !GetEmitter()->Contains256bitOrMoreAVX())
-    {
-        // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
-        //   Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
-        //   between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
-        //   VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
-        //   register) and before any call to an unknown function.
-
-        // This method contains a call that needs vzeroupper but also doesn't use 256-bit or higher
-        // AVX itself. Thus we can optimize to only emitting a single vzeroupper in the function prologue
-        // This reduces the overall amount of codegen, particularly for more common paths not using any
-        // SIMD or floating-point.
-
-        instGen(INS_vzeroupper);
-    }
-
     // fast path return
     if (regMask == RBM_NONE)
     {
         return;
     }
 
+    unsigned lclFrameSize = compiler->compLclFrameSize;
+
 #ifdef TARGET_AMD64
     unsigned firstFPRegPadding = compiler->lvaIsCalleeSavedIntRegCountEven() ? REGSIZE_BYTES : 0;
     unsigned offset            = lclFrameSize - firstFPRegPadding - XMM_REGSIZE_BYTES;
@@ -11400,35 +11391,21 @@ void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
 // Save/Restore compCalleeFPRegsPushed with the smallest register number saved at [RSP+offset], working
 // down the stack to the largest register number stored at [RSP+offset-(genCountBits(regMask)-1)*XMM_REG_SIZE]
 // Here offset = 16-byte aligned offset after pushing integer registers.
-//
-// Params
-//   lclFrameSize - Fixed frame size excluding callee pushed int regs.
-//             non-funclet: this will be compLclFrameSize.
-//             funclet frames: this will be FuncletInfo.fiSpDelta.
-void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
+void CodeGen::genRestoreCalleeSavedFltRegs()
 {
     regMaskTP regMask = compiler->compCalleeFPRegsSavedMask;
 
     // Only callee saved floating point registers should be in regMask
     assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask);
 
-    if (GetEmitter()->Contains256bitOrMoreAVX())
-    {
-        // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
-        //   Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
-        //   between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
-        //   VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
-        //   register) and before any call to an unknown function.
-
-        instGen(INS_vzeroupper);
-    }
-
     // fast path return
     if (regMask == RBM_NONE)
     {
         return;
     }
 
+    unsigned lclFrameSize = compiler->compLclFrameSize;
+
 #ifdef TARGET_AMD64
     unsigned    firstFPRegPadding = compiler->lvaIsCalleeSavedIntRegCountEven() ? REGSIZE_BYTES : 0;
     instruction copyIns           = ins_Copy(TYP_FLOAT);
@@ -11470,6 +11447,45 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
     }
 }
 
+//-----------------------------------------------------------------------------------
+// genClearAvxStateInProlog: Generate vzeroupper instruction to clear AVX state if necessary in a prolog
+//
+void CodeGen::genClearAvxStateInProlog()
+{
+    if (GetEmitter()->ContainsCallNeedingVzeroupper() && !GetEmitter()->Contains256bitOrMoreAVX())
+    {
+        // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
+        //   Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
+        //   between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
+        //   VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
+        //   register) and before any call to an unknown function.
+
+        // This method contains a call that needs vzeroupper but also doesn't use 256-bit or higher
+        // AVX itself. Thus we can optimize to only emitting a single vzeroupper in the function prologue
+        // This reduces the overall amount of codegen, particularly for more common paths not using any
+        // SIMD or floating-point.
+
+        instGen(INS_vzeroupper);
+    }
+}
+
+//-----------------------------------------------------------------------------------
+// genClearAvxStateInEpilog: Generate vzeroupper instruction to clear AVX state if necessary in an epilog
+//
+void CodeGen::genClearAvxStateInEpilog()
+{
+    if (GetEmitter()->Contains256bitOrMoreAVX())
+    {
+        // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
+        //   Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
+        //   between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
+        //   VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
+        //   register) and before any call to an unknown function.
+
+        instGen(INS_vzeroupper);
+    }
+}
+
 //-----------------------------------------------------------------------------------
 // instGen_MemoryBarrier: Emit a MemoryBarrier instruction
 //
diff --git a/src/coreclr/nativeaot/Runtime/amd64/ExceptionHandling.S b/src/coreclr/nativeaot/Runtime/amd64/ExceptionHandling.S
index 9df1f73b3ab410..d2bce874cecaca 100644
--- a/src/coreclr/nativeaot/Runtime/amd64/ExceptionHandling.S
+++ b/src/coreclr/nativeaot/Runtime/amd64/ExceptionHandling.S
@@ -245,9 +245,6 @@ NESTED_END RhpRethrow, _TEXT
 
     alloc_stack     stack_alloc_size
 
-    // Mirror clearing of AVX state done by regular method prologs
-    vzeroupper
-
     END_PROLOGUE
 .endm
 
@@ -255,9 +252,6 @@ NESTED_END RhpRethrow, _TEXT
 // Epilogue of all funclet calling helpers (RhpCallXXXXFunclet)
 //
 .macro FUNCLET_CALL_EPILOGUE
-    // Mirror clearing of AVX state done by regular method epilogs
-    vzeroupper
-
     free_stack      stack_alloc_size
 
     pop_nonvol_reg rbp
diff --git a/src/coreclr/nativeaot/Runtime/amd64/ExceptionHandling.asm b/src/coreclr/nativeaot/Runtime/amd64/ExceptionHandling.asm
index b8f0582b4f0456..e42f6fd3e2d8aa 100644
--- a/src/coreclr/nativeaot/Runtime/amd64/ExceptionHandling.asm
+++ b/src/coreclr/nativeaot/Runtime/amd64/ExceptionHandling.asm
@@ -308,9 +308,6 @@ FUNCLET_CALL_PROLOGUE macro localsCount, alignStack
 
     alloc_stack     stack_alloc_size
 
-    ;; Mirror clearing of AVX state done by regular method prologs
-    vzeroupper
-
     save_xmm128_postrsp xmm6,  (arguments_scratch_area_size + 0 * 10h)
     save_xmm128_postrsp xmm7,  (arguments_scratch_area_size + 1 * 10h)
     save_xmm128_postrsp xmm8,  (arguments_scratch_area_size + 2 * 10h)
@@ -329,9 +326,6 @@ endm
 ;; Epilogue of all funclet calling helpers (RhpCallXXXXFunclet)
 ;;
 FUNCLET_CALL_EPILOGUE macro
-    ;; Mirror clearing of AVX state done by regular method epilogs
-    vzeroupper
-
     movdqa  xmm6,  [rsp + arguments_scratch_area_size + 0 * 10h]
     movdqa  xmm7,  [rsp + arguments_scratch_area_size + 1 * 10h]
     movdqa  xmm8,  [rsp + arguments_scratch_area_size + 2 * 10h]
diff --git a/src/coreclr/vm/amd64/AsmHelpers.asm b/src/coreclr/vm/amd64/AsmHelpers.asm
index b80a2e51c69809..03734a72daf823 100644
--- a/src/coreclr/vm/amd64/AsmHelpers.asm
+++ b/src/coreclr/vm/amd64/AsmHelpers.asm
@@ -523,9 +523,6 @@ FUNCLET_CALL_PROLOGUE macro localsCount, alignStack
 
         alloc_stack     stack_alloc_size
 
-        ;; Mirror clearing of AVX state done by regular method prologs
-        vzeroupper
-
         save_xmm128_postrsp xmm6,  (arguments_scratch_area_size + 0 * 10h)
         save_xmm128_postrsp xmm7,  (arguments_scratch_area_size + 1 * 10h)
         save_xmm128_postrsp xmm8,  (arguments_scratch_area_size + 2 * 10h)
@@ -544,9 +541,6 @@ endm
 ;; Epilogue of all funclet calling helpers (CallXXXXFunclet)
 ;;
 FUNCLET_CALL_EPILOGUE macro
-        ;; Mirror clearing of AVX state done by regular method epilogs
-        vzeroupper
-
         movdqa  xmm6,  [rsp + arguments_scratch_area_size + 0 * 10h]
         movdqa  xmm7,  [rsp + arguments_scratch_area_size + 1 * 10h]
         movdqa  xmm8,  [rsp + arguments_scratch_area_size + 2 * 10h]
diff --git a/src/coreclr/vm/amd64/asmhelpers.S b/src/coreclr/vm/amd64/asmhelpers.S
index c90c535a493840..e2b98cfdd46e27 100644
--- a/src/coreclr/vm/amd64/asmhelpers.S
+++ b/src/coreclr/vm/amd64/asmhelpers.S
@@ -378,9 +378,6 @@ LEAF_END ThisPtrRetBufPrecodeWorker, _TEXT
 
         alloc_stack     stack_alloc_size
 
-        // Mirror clearing of AVX state done by regular method prologs
-        vzeroupper
-
         END_PROLOGUE
 .endm
 
@@ -388,9 +385,6 @@ LEAF_END ThisPtrRetBufPrecodeWorker, _TEXT
 // Epilogue of all funclet calling helpers (CallXXXXFunclet)
 //
 .macro FUNCLET_CALL_EPILOGUE
-        // Mirror clearing of AVX state done by regular method epilogs
-        vzeroupper
-
         free_stack      stack_alloc_size
 
         pop_nonvol_reg rbp

From 9aa22d9f113928fdadd435d6cb989b115cb9faf0 Mon Sep 17 00:00:00 2001
From: Jan Kotas <jkotas@microsoft.com>
Date: Mon, 19 May 2025 16:06:49 -0700
Subject: [PATCH 2/2] Fix formatting

---
 src/coreclr/jit/codegenxarch.cpp | 6 ++++--
 src/coreclr/jit/morph.cpp        | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 615e969e8ef3c8..1c17e0b4e1d497 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -4879,7 +4879,8 @@ void CodeGen::genCodeForShift(GenTree* tree)
         {
             int shiftByValue = (int)shiftBy->AsIntConCommon()->IconValue();
 
-            if (tree->OperIsRotate() && compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2) && !tree->gtSetFlags())
+            if (tree->OperIsRotate() && compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2) &&
+                !tree->gtSetFlags())
             {
                 // If we have a contained source operand, we must emit rorx.
                 // We may also use rorx for 64bit values when a mov would otherwise be required,
@@ -4906,7 +4907,8 @@ void CodeGen::genCodeForShift(GenTree* tree)
             return;
         }
     }
-    else if (tree->OperIsShift() && compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2) && !tree->gtSetFlags())
+    else if (tree->OperIsShift() && compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2) &&
+             !tree->gtSetFlags())
     {
         // Emit shlx, sarx, shrx if BMI2 is available instead of mov+shl, mov+sar, mov+shr.
         switch (tree->OperGet())
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index 2d2338adae6dd9..b52c8e48cbe0e4 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -9212,7 +9212,7 @@ GenTree* Compiler::fgOptimizeRelationalComparisonWithConst(GenTreeOp* cmp)
             // LE_UN/GT_UN(expr, int.MaxValue) => EQ/NE(RSZ(expr, 32), 0).
             else if (opts.OptimizationEnabled() && (op1->TypeIs(TYP_LONG) && (op2Value == UINT_MAX)))
             {
-                oper = (oper == GT_GT) ? GT_NE : GT_EQ;
+                oper            = (oper == GT_GT) ? GT_NE : GT_EQ;
                 GenTree* icon32 = gtNewIconNode(32, TYP_INT);
                 icon32->SetMorphed(this);