8 changes: 6 additions & 2 deletions src/coreclr/jit/codegen.h
@@ -515,8 +515,12 @@ class CodeGen final : public CodeGenInterface
#if defined(TARGET_XARCH)

// Save/Restore callee saved float regs to stack
void genPreserveCalleeSavedFltRegs(unsigned lclFrameSize);
void genRestoreCalleeSavedFltRegs(unsigned lclFrameSize);
void genPreserveCalleeSavedFltRegs();
void genRestoreCalleeSavedFltRegs();

// Generate vzeroupper instruction to clear AVX state if necessary
void genClearAvxStateInProlog();
void genClearAvxStateInEpilog();

#endif // TARGET_XARCH

4 changes: 3 additions & 1 deletion src/coreclr/jit/codegencommon.cpp
@@ -5285,8 +5285,10 @@ void CodeGen::genFnProlog()
#endif // TARGET_ARMARCH

#if defined(TARGET_XARCH)
genClearAvxStateInProlog();

// Preserve callee saved float regs to stack.
genPreserveCalleeSavedFltRegs(compiler->compLclFrameSize);
genPreserveCalleeSavedFltRegs();
#endif // defined(TARGET_XARCH)

#ifdef TARGET_AMD64
102 changes: 60 additions & 42 deletions src/coreclr/jit/codegenxarch.cpp
@@ -4879,7 +4879,8 @@ void CodeGen::genCodeForShift(GenTree* tree)
{
int shiftByValue = (int)shiftBy->AsIntConCommon()->IconValue();

if (tree->OperIsRotate() && compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2) && !tree->gtSetFlags())
if (tree->OperIsRotate() && compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2) &&
Author review note: Changes from jit format
!tree->gtSetFlags())
{
// If we have a contained source operand, we must emit rorx.
// We may also use rorx for 64bit values when a mov would otherwise be required,
@@ -4906,7 +4907,8 @@ void CodeGen::genCodeForShift(GenTree* tree)
return;
}
}
else if (tree->OperIsShift() && compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2) && !tree->gtSetFlags())
else if (tree->OperIsShift() && compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2) &&
!tree->gtSetFlags())
{
// Emit shlx, sarx, shrx if BMI2 is available instead of mov+shl, mov+sar, mov+shr.
switch (tree->OperGet())
@@ -10452,8 +10454,10 @@ void CodeGen::genFnEpilog(BasicBlock* block)
}
#endif

genClearAvxStateInEpilog();

// Restore float registers that were saved to stack before SP is modified.
genRestoreCalleeSavedFltRegs(compiler->compLclFrameSize);
genRestoreCalleeSavedFltRegs();

#ifdef JIT32_GCENCODER
// When using the JIT32 GC encoder, we do not start the OS-reported portion of the epilog until after
@@ -10913,6 +10917,8 @@ void CodeGen::genFuncletProlog(BasicBlock* block)

// This is the end of the OS-reported prolog for purposes of unwinding
compiler->unwindEndProlog();

genClearAvxStateInProlog();
}

/*****************************************************************************
@@ -10933,6 +10939,8 @@ void CodeGen::genFuncletEpilog()

ScopedSetVariable<bool> _setGeneratingEpilog(&compiler->compGeneratingEpilog, true);

genClearAvxStateInEpilog();

inst_RV_IV(INS_add, REG_SPBASE, genFuncletInfo.fiSpDelta, EA_PTRSIZE);
instGen_Return(0);
}
@@ -11030,6 +11038,8 @@ void CodeGen::genFuncletProlog(BasicBlock* block)
// Add a padding for 16-byte alignment
inst_RV_IV(INS_sub, REG_SPBASE, 12, EA_PTRSIZE);
#endif

genClearAvxStateInProlog();
}

/*****************************************************************************
Expand All @@ -11048,6 +11058,8 @@ void CodeGen::genFuncletEpilog()

ScopedSetVariable<bool> _setGeneratingEpilog(&compiler->compGeneratingEpilog, true);

genClearAvxStateInEpilog();

#ifdef UNIX_X86_ABI
// Revert a padding that was added for 16-byte alignment
inst_RV_IV(INS_add, REG_SPBASE, 12, EA_PTRSIZE);
@@ -11337,40 +11349,21 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu
// Save compCalleeFPRegsPushed with the smallest register number saved at [RSP+offset], working
// down the stack to the largest register number stored at [RSP+offset-(genCountBits(regMask)-1)*XMM_REG_SIZE]
// Here offset = 16-byte aligned offset after pushing integer registers.
//
// Params
// lclFrameSize - Fixed frame size excluding callee pushed int regs.
// non-funclet: this will be compLclFrameSize.
// funclet frames: this will be FuncletInfo.fiSpDelta.
void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
void CodeGen::genPreserveCalleeSavedFltRegs()
{
regMaskTP regMask = compiler->compCalleeFPRegsSavedMask;

// Only callee saved floating point registers should be in regMask
assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask);

if (GetEmitter()->ContainsCallNeedingVzeroupper() && !GetEmitter()->Contains256bitOrMoreAVX())
{
// The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
// Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
// between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
// VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
// register) and before any call to an unknown function.

// This method contains a call that needs vzeroupper but also doesn't use 256-bit or higher
// AVX itself. Thus we can optimize to only emitting a single vzeroupper in the function prologue
// This reduces the overall amount of codegen, particularly for more common paths not using any
// SIMD or floating-point.

instGen(INS_vzeroupper);
}

// fast path return
if (regMask == RBM_NONE)
{
return;
}

unsigned lclFrameSize = compiler->compLclFrameSize;

#ifdef TARGET_AMD64
unsigned firstFPRegPadding = compiler->lvaIsCalleeSavedIntRegCountEven() ? REGSIZE_BYTES : 0;
unsigned offset = lclFrameSize - firstFPRegPadding - XMM_REGSIZE_BYTES;
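For reference, the save layout described in the comment above works out as follows on AMD64. This is an illustrative arithmetic sketch only — hypothetical frame values, not part of this PR — mirroring the offset computation shown in the diff (assuming REGSIZE_BYTES = 8 and XMM_REGSIZE_BYTES = 16 on this target):

```cpp
#include <cstdio>

// Illustrative sketch (hypothetical values, not JIT code): the smallest-numbered
// callee-saved XMM register is stored at [RSP+offset], and each subsequent
// register 16 bytes further down the stack, matching the comment above.
int main()
{
    const unsigned REGSIZE_BYTES     = 8;  // pushed integer register size (AMD64)
    const unsigned XMM_REGSIZE_BYTES = 16; // saved XMM register size

    unsigned lclFrameSize      = 0x60;  // hypothetical compLclFrameSize
    bool     intRegCountIsEven = true;  // stand-in for lvaIsCalleeSavedIntRegCountEven()
    unsigned firstFPRegPadding = intRegCountIsEven ? REGSIZE_BYTES : 0;

    // Same computation as in the diff above.
    unsigned offset = lclFrameSize - firstFPRegPadding - XMM_REGSIZE_BYTES;

    for (int reg = 6; reg <= 7; reg++) // assume xmm6 and xmm7 are being saved
    {
        printf("xmm%d -> [rsp+0x%X]\n", reg, offset);
        offset -= XMM_REGSIZE_BYTES;
    }
    return 0;
}
```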
@@ -11400,35 +11393,21 @@ void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
// Save/Restore compCalleeFPRegsPushed with the smallest register number saved at [RSP+offset], working
// down the stack to the largest register number stored at [RSP+offset-(genCountBits(regMask)-1)*XMM_REG_SIZE]
// Here offset = 16-byte aligned offset after pushing integer registers.
//
// Params
// lclFrameSize - Fixed frame size excluding callee pushed int regs.
// non-funclet: this will be compLclFrameSize.
// funclet frames: this will be FuncletInfo.fiSpDelta.
void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
void CodeGen::genRestoreCalleeSavedFltRegs()
{
regMaskTP regMask = compiler->compCalleeFPRegsSavedMask;

// Only callee saved floating point registers should be in regMask
assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask);

if (GetEmitter()->Contains256bitOrMoreAVX())
{
// The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
// Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
// between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
// VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
// register) and before any call to an unknown function.

instGen(INS_vzeroupper);
}

// fast path return
if (regMask == RBM_NONE)
{
return;
}

unsigned lclFrameSize = compiler->compLclFrameSize;

#ifdef TARGET_AMD64
unsigned firstFPRegPadding = compiler->lvaIsCalleeSavedIntRegCountEven() ? REGSIZE_BYTES : 0;
instruction copyIns = ins_Copy(TYP_FLOAT);
@@ -11470,6 +11449,45 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
}
}

//-----------------------------------------------------------------------------------
// genClearAvxStateInProlog: Generate vzeroupper instruction to clear AVX state if necessary in a prolog
//
void CodeGen::genClearAvxStateInProlog()
{
if (GetEmitter()->ContainsCallNeedingVzeroupper() && !GetEmitter()->Contains256bitOrMoreAVX())
{
// The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
// Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
// between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
// VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
// register) and before any call to an unknown function.

// This method contains a call that needs vzeroupper but also doesn't use 256-bit or higher
// AVX itself. Thus we can optimize to only emitting a single vzeroupper in the function prologue
// This reduces the overall amount of codegen, particularly for more common paths not using any
// SIMD or floating-point.

instGen(INS_vzeroupper);
}
}

//-----------------------------------------------------------------------------------
// genClearAvxStateInEpilog: Generate vzeroupper instruction to clear AVX state if necessary in an epilog
//
void CodeGen::genClearAvxStateInEpilog()
{
if (GetEmitter()->Contains256bitOrMoreAVX())
{
// The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
// Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
// between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
// VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
// register) and before any call to an unknown function.

instGen(INS_vzeroupper);
}
}

//-----------------------------------------------------------------------------------
// instGen_MemoryBarrier: Emit a MemoryBarrier instruction
//
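Stepping back from the diff: the Intel guidance quoted in the new genClearAvxStateInProlog/genClearAvxStateInEpilog helpers above concerns the penalty some cores pay when legacy SSE encodings execute while the upper halves of the YMM registers are dirty. The sketch below is a standalone intrinsics illustration of that scenario, not JIT or runtime code; the function name and data layout are made up for the example:

```cpp
#include <immintrin.h>

// Illustrative only: 256-bit AVX work dirties the upper YMM state; executing a
// vzeroupper (here via the _mm256_zeroupper intrinsic) before leaving the
// AVX-using region is what the JIT's genClearAvxStateInProlog/Epilog helpers
// automate for managed prologs and epilogs.
float SumEight(const float* data) // data must point to at least 8 floats
{
    __m256 acc = _mm256_loadu_ps(data);         // 256-bit AVX: upper YMM state now dirty
    __m128 lo  = _mm256_castps256_ps128(acc);   // lower 128 bits
    __m128 hi  = _mm256_extractf128_ps(acc, 1); // upper 128 bits
    __m128 sum = _mm_add_ps(lo, hi);

    _mm256_zeroupper(); // clear the dirty upper state before any legacy SSE code runs

    // Lower 128-bit values such as 'sum' survive vzeroupper; code compiled without
    // AVX that runs after this point no longer pays the mixed-state penalty.
    sum = _mm_hadd_ps(sum, sum);
    sum = _mm_hadd_ps(sum, sum);
    return _mm_cvtss_f32(sum);
}
```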
2 changes: 1 addition & 1 deletion src/coreclr/jit/morph.cpp
@@ -9212,7 +9212,7 @@ GenTree* Compiler::fgOptimizeRelationalComparisonWithConst(GenTreeOp* cmp)
// LE_UN/GT_UN(expr, int.MaxValue) => EQ/NE(RSZ(expr, 32), 0).
else if (opts.OptimizationEnabled() && (op1->TypeIs(TYP_LONG) && (op2Value == UINT_MAX)))
{
oper = (oper == GT_GT) ? GT_NE : GT_EQ;
oper = (oper == GT_GT) ? GT_NE : GT_EQ;
GenTree* icon32 = gtNewIconNode(32, TYP_INT);
icon32->SetMorphed(this);

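The context in fgOptimizeRelationalComparisonWithConst above notes that an unsigned LE/GT comparison of a TYP_LONG value against UINT_MAX is rewritten as EQ/NE(RSZ(expr, 32), 0). A minimal standalone sketch of why that equivalence holds (illustrative C++, not the JIT's implementation):

```cpp
#include <cassert>
#include <cstdint>

// Illustrative only: an unsigned comparison of a 64-bit value against
// UINT_MAX can be folded into a test of the upper 32 bits:
//   (x <= 0xFFFFFFFF)  <=>  ((x >> 32) == 0)   // LE_UN => EQ
//   (x >  0xFFFFFFFF)  <=>  ((x >> 32) != 0)   // GT_UN => NE
static bool FitsInUInt32(uint64_t x)
{
    return (x >> 32) == 0; // equivalent to x <= UINT32_MAX
}

int main()
{
    assert(FitsInUInt32(0xFFFFFFFFull));   // upper 32 bits clear -> true
    assert(!FitsInUInt32(0x100000000ull)); // upper 32 bits set   -> false
    return 0;
}
```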
6 changes: 0 additions & 6 deletions src/coreclr/nativeaot/Runtime/amd64/ExceptionHandling.S
@@ -245,19 +245,13 @@ NESTED_END RhpRethrow, _TEXT

alloc_stack stack_alloc_size

// Mirror clearing of AVX state done by regular method prologs
vzeroupper

END_PROLOGUE
.endm

//
// Epilogue of all funclet calling helpers (RhpCallXXXXFunclet)
//
.macro FUNCLET_CALL_EPILOGUE
// Mirror clearing of AVX state done by regular method epilogs
vzeroupper

free_stack stack_alloc_size

pop_nonvol_reg rbp
6 changes: 0 additions & 6 deletions src/coreclr/nativeaot/Runtime/amd64/ExceptionHandling.asm
@@ -308,9 +308,6 @@ FUNCLET_CALL_PROLOGUE macro localsCount, alignStack

alloc_stack stack_alloc_size

;; Mirror clearing of AVX state done by regular method prologs
vzeroupper

save_xmm128_postrsp xmm6, (arguments_scratch_area_size + 0 * 10h)
save_xmm128_postrsp xmm7, (arguments_scratch_area_size + 1 * 10h)
save_xmm128_postrsp xmm8, (arguments_scratch_area_size + 2 * 10h)
@@ -329,9 +326,6 @@ endm
;; Epilogue of all funclet calling helpers (RhpCallXXXXFunclet)
;;
FUNCLET_CALL_EPILOGUE macro
;; Mirror clearing of AVX state done by regular method epilogs
vzeroupper

movdqa xmm6, [rsp + arguments_scratch_area_size + 0 * 10h]
movdqa xmm7, [rsp + arguments_scratch_area_size + 1 * 10h]
movdqa xmm8, [rsp + arguments_scratch_area_size + 2 * 10h]
6 changes: 0 additions & 6 deletions src/coreclr/vm/amd64/AsmHelpers.asm
@@ -506,9 +506,6 @@ FUNCLET_CALL_PROLOGUE macro localsCount, alignStack

alloc_stack stack_alloc_size

;; Mirror clearing of AVX state done by regular method prologs
vzeroupper

save_xmm128_postrsp xmm6, (arguments_scratch_area_size + 0 * 10h)
save_xmm128_postrsp xmm7, (arguments_scratch_area_size + 1 * 10h)
save_xmm128_postrsp xmm8, (arguments_scratch_area_size + 2 * 10h)
@@ -527,9 +524,6 @@ endm
;; Epilogue of all funclet calling helpers (CallXXXXFunclet)
;;
FUNCLET_CALL_EPILOGUE macro
;; Mirror clearing of AVX state done by regular method epilogs
vzeroupper

movdqa xmm6, [rsp + arguments_scratch_area_size + 0 * 10h]
movdqa xmm7, [rsp + arguments_scratch_area_size + 1 * 10h]
movdqa xmm8, [rsp + arguments_scratch_area_size + 2 * 10h]
6 changes: 0 additions & 6 deletions src/coreclr/vm/amd64/asmhelpers.S
@@ -359,19 +359,13 @@ LEAF_END ThisPtrRetBufPrecodeWorker, _TEXT

alloc_stack stack_alloc_size

// Mirror clearing of AVX state done by regular method prologs
vzeroupper

END_PROLOGUE
.endm

//
// Epilogue of all funclet calling helpers (CallXXXXFunclet)
//
.macro FUNCLET_CALL_EPILOGUE
// Mirror clearing of AVX state done by regular method epilogs
vzeroupper

free_stack stack_alloc_size

pop_nonvol_reg rbp