From fbfc3fbc19ee0abcd745a3c68b89c395eba7ca13 Mon Sep 17 00:00:00 2001 From: Jan Kotas Date: Mon, 19 May 2025 15:07:20 -0700 Subject: [PATCH 1/2] Move vzeroupper emit back to JIT vzeroupper is AVX instruction and so it cannot be executed unconditionally in static asm helpers Fixes #115672 --- src/coreclr/jit/codegen.h | 8 +- src/coreclr/jit/codegencommon.cpp | 4 +- src/coreclr/jit/codegenxarch.cpp | 96 +++++++++++-------- .../Runtime/amd64/ExceptionHandling.S | 6 -- .../Runtime/amd64/ExceptionHandling.asm | 6 -- src/coreclr/vm/amd64/AsmHelpers.asm | 6 -- src/coreclr/vm/amd64/asmhelpers.S | 6 -- 7 files changed, 65 insertions(+), 67 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 25eb7a0c25fbfc..53567b8e3648af 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -515,8 +515,12 @@ class CodeGen final : public CodeGenInterface #if defined(TARGET_XARCH) // Save/Restore callee saved float regs to stack - void genPreserveCalleeSavedFltRegs(unsigned lclFrameSize); - void genRestoreCalleeSavedFltRegs(unsigned lclFrameSize); + void genPreserveCalleeSavedFltRegs(); + void genRestoreCalleeSavedFltRegs(); + + // Generate vzeroupper instruction to clear AVX state if necessary + void genClearAvxStateInProlog(); + void genClearAvxStateInEpilog(); #endif // TARGET_XARCH diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index 4496e006e9b541..a56f8edcc9e391 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -5285,8 +5285,10 @@ void CodeGen::genFnProlog() #endif // TARGET_ARMARCH #if defined(TARGET_XARCH) + genClearAvxStateInProlog(); + // Preserve callee saved float regs to stack. - genPreserveCalleeSavedFltRegs(compiler->compLclFrameSize); + genPreserveCalleeSavedFltRegs(); #endif // defined(TARGET_XARCH) #ifdef TARGET_AMD64 diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 01f639dc7ddc14..0283b780b1496c 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -10452,8 +10452,10 @@ void CodeGen::genFnEpilog(BasicBlock* block) } #endif + genClearAvxStateInEpilog(); + // Restore float registers that were saved to stack before SP is modified. - genRestoreCalleeSavedFltRegs(compiler->compLclFrameSize); + genRestoreCalleeSavedFltRegs(); #ifdef JIT32_GCENCODER // When using the JIT32 GC encoder, we do not start the OS-reported portion of the epilog until after @@ -10913,6 +10915,8 @@ void CodeGen::genFuncletProlog(BasicBlock* block) // This is the end of the OS-reported prolog for purposes of unwinding compiler->unwindEndProlog(); + + genClearAvxStateInProlog(); } /***************************************************************************** @@ -10933,6 +10937,8 @@ void CodeGen::genFuncletEpilog() ScopedSetVariable _setGeneratingEpilog(&compiler->compGeneratingEpilog, true); + genClearAvxStateInEpilog(); + inst_RV_IV(INS_add, REG_SPBASE, genFuncletInfo.fiSpDelta, EA_PTRSIZE); instGen_Return(0); } @@ -11030,6 +11036,8 @@ void CodeGen::genFuncletProlog(BasicBlock* block) // Add a padding for 16-byte alignment inst_RV_IV(INS_sub, REG_SPBASE, 12, EA_PTRSIZE); #endif + + genClearAvxStateInProlog(); } /***************************************************************************** @@ -11048,6 +11056,8 @@ void CodeGen::genFuncletEpilog() ScopedSetVariable _setGeneratingEpilog(&compiler->compGeneratingEpilog, true); + genClearAvxStateInEpilog(); + #ifdef UNIX_X86_ABI // Revert a padding that was added for 16-byte alignment inst_RV_IV(INS_add, REG_SPBASE, 12, EA_PTRSIZE); @@ -11337,40 +11347,21 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu // Save compCalleeFPRegsPushed with the smallest register number saved at [RSP+offset], working // down the stack to the largest register number stored at [RSP+offset-(genCountBits(regMask)-1)*XMM_REG_SIZE] // Here offset = 16-byte aligned offset after pushing integer registers. -// -// Params -// lclFrameSize - Fixed frame size excluding callee pushed int regs. -// non-funclet: this will be compLclFrameSize. -// funclet frames: this will be FuncletInfo.fiSpDelta. -void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize) +void CodeGen::genPreserveCalleeSavedFltRegs() { regMaskTP regMask = compiler->compCalleeFPRegsSavedMask; // Only callee saved floating point registers should be in regMask assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask); - if (GetEmitter()->ContainsCallNeedingVzeroupper() && !GetEmitter()->Contains256bitOrMoreAVX()) - { - // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states: - // Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean - // between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a - // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX - // register) and before any call to an unknown function. - - // This method contains a call that needs vzeroupper but also doesn't use 256-bit or higher - // AVX itself. Thus we can optimize to only emitting a single vzeroupper in the function prologue - // This reduces the overall amount of codegen, particularly for more common paths not using any - // SIMD or floating-point. - - instGen(INS_vzeroupper); - } - // fast path return if (regMask == RBM_NONE) { return; } + unsigned lclFrameSize = compiler->compLclFrameSize; + #ifdef TARGET_AMD64 unsigned firstFPRegPadding = compiler->lvaIsCalleeSavedIntRegCountEven() ? REGSIZE_BYTES : 0; unsigned offset = lclFrameSize - firstFPRegPadding - XMM_REGSIZE_BYTES; @@ -11400,35 +11391,21 @@ void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize) // Save/Restore compCalleeFPRegsPushed with the smallest register number saved at [RSP+offset], working // down the stack to the largest register number stored at [RSP+offset-(genCountBits(regMask)-1)*XMM_REG_SIZE] // Here offset = 16-byte aligned offset after pushing integer registers. -// -// Params -// lclFrameSize - Fixed frame size excluding callee pushed int regs. -// non-funclet: this will be compLclFrameSize. -// funclet frames: this will be FuncletInfo.fiSpDelta. -void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize) +void CodeGen::genRestoreCalleeSavedFltRegs() { regMaskTP regMask = compiler->compCalleeFPRegsSavedMask; // Only callee saved floating point registers should be in regMask assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask); - if (GetEmitter()->Contains256bitOrMoreAVX()) - { - // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states: - // Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean - // between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a - // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX - // register) and before any call to an unknown function. - - instGen(INS_vzeroupper); - } - // fast path return if (regMask == RBM_NONE) { return; } + unsigned lclFrameSize = compiler->compLclFrameSize; + #ifdef TARGET_AMD64 unsigned firstFPRegPadding = compiler->lvaIsCalleeSavedIntRegCountEven() ? REGSIZE_BYTES : 0; instruction copyIns = ins_Copy(TYP_FLOAT); @@ -11470,6 +11447,45 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize) } } +//----------------------------------------------------------------------------------- +// genClearAvxStateInProlog: Generate vzeroupper instruction to clear AVX state if necessary in a prolog +// +void CodeGen::genClearAvxStateInProlog() +{ + if (GetEmitter()->ContainsCallNeedingVzeroupper() && !GetEmitter()->Contains256bitOrMoreAVX()) + { + // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states: + // Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean + // between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a + // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX + // register) and before any call to an unknown function. + + // This method contains a call that needs vzeroupper but also doesn't use 256-bit or higher + // AVX itself. Thus we can optimize to only emitting a single vzeroupper in the function prologue + // This reduces the overall amount of codegen, particularly for more common paths not using any + // SIMD or floating-point. + + instGen(INS_vzeroupper); + } +} + +//----------------------------------------------------------------------------------- +// genClearAvxStateInEpilog: Generate vzeroupper instruction to clear AVX state if necessary in an epilog +// +void CodeGen::genClearAvxStateInEpilog() +{ + if (GetEmitter()->Contains256bitOrMoreAVX()) + { + // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states: + // Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean + // between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a + // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX + // register) and before any call to an unknown function. + + instGen(INS_vzeroupper); + } +} + //----------------------------------------------------------------------------------- // instGen_MemoryBarrier: Emit a MemoryBarrier instruction // diff --git a/src/coreclr/nativeaot/Runtime/amd64/ExceptionHandling.S b/src/coreclr/nativeaot/Runtime/amd64/ExceptionHandling.S index 9df1f73b3ab410..d2bce874cecaca 100644 --- a/src/coreclr/nativeaot/Runtime/amd64/ExceptionHandling.S +++ b/src/coreclr/nativeaot/Runtime/amd64/ExceptionHandling.S @@ -245,9 +245,6 @@ NESTED_END RhpRethrow, _TEXT alloc_stack stack_alloc_size - // Mirror clearing of AVX state done by regular method prologs - vzeroupper - END_PROLOGUE .endm @@ -255,9 +252,6 @@ NESTED_END RhpRethrow, _TEXT // Epilogue of all funclet calling helpers (RhpCallXXXXFunclet) // .macro FUNCLET_CALL_EPILOGUE - // Mirror clearing of AVX state done by regular method epilogs - vzeroupper - free_stack stack_alloc_size pop_nonvol_reg rbp diff --git a/src/coreclr/nativeaot/Runtime/amd64/ExceptionHandling.asm b/src/coreclr/nativeaot/Runtime/amd64/ExceptionHandling.asm index b8f0582b4f0456..e42f6fd3e2d8aa 100644 --- a/src/coreclr/nativeaot/Runtime/amd64/ExceptionHandling.asm +++ b/src/coreclr/nativeaot/Runtime/amd64/ExceptionHandling.asm @@ -308,9 +308,6 @@ FUNCLET_CALL_PROLOGUE macro localsCount, alignStack alloc_stack stack_alloc_size - ;; Mirror clearing of AVX state done by regular method prologs - vzeroupper - save_xmm128_postrsp xmm6, (arguments_scratch_area_size + 0 * 10h) save_xmm128_postrsp xmm7, (arguments_scratch_area_size + 1 * 10h) save_xmm128_postrsp xmm8, (arguments_scratch_area_size + 2 * 10h) @@ -329,9 +326,6 @@ endm ;; Epilogue of all funclet calling helpers (RhpCallXXXXFunclet) ;; FUNCLET_CALL_EPILOGUE macro - ;; Mirror clearing of AVX state done by regular method epilogs - vzeroupper - movdqa xmm6, [rsp + arguments_scratch_area_size + 0 * 10h] movdqa xmm7, [rsp + arguments_scratch_area_size + 1 * 10h] movdqa xmm8, [rsp + arguments_scratch_area_size + 2 * 10h] diff --git a/src/coreclr/vm/amd64/AsmHelpers.asm b/src/coreclr/vm/amd64/AsmHelpers.asm index b80a2e51c69809..03734a72daf823 100644 --- a/src/coreclr/vm/amd64/AsmHelpers.asm +++ b/src/coreclr/vm/amd64/AsmHelpers.asm @@ -523,9 +523,6 @@ FUNCLET_CALL_PROLOGUE macro localsCount, alignStack alloc_stack stack_alloc_size - ;; Mirror clearing of AVX state done by regular method prologs - vzeroupper - save_xmm128_postrsp xmm6, (arguments_scratch_area_size + 0 * 10h) save_xmm128_postrsp xmm7, (arguments_scratch_area_size + 1 * 10h) save_xmm128_postrsp xmm8, (arguments_scratch_area_size + 2 * 10h) @@ -544,9 +541,6 @@ endm ;; Epilogue of all funclet calling helpers (CallXXXXFunclet) ;; FUNCLET_CALL_EPILOGUE macro - ;; Mirror clearing of AVX state done by regular method epilogs - vzeroupper - movdqa xmm6, [rsp + arguments_scratch_area_size + 0 * 10h] movdqa xmm7, [rsp + arguments_scratch_area_size + 1 * 10h] movdqa xmm8, [rsp + arguments_scratch_area_size + 2 * 10h] diff --git a/src/coreclr/vm/amd64/asmhelpers.S b/src/coreclr/vm/amd64/asmhelpers.S index c90c535a493840..e2b98cfdd46e27 100644 --- a/src/coreclr/vm/amd64/asmhelpers.S +++ b/src/coreclr/vm/amd64/asmhelpers.S @@ -378,9 +378,6 @@ LEAF_END ThisPtrRetBufPrecodeWorker, _TEXT alloc_stack stack_alloc_size - // Mirror clearing of AVX state done by regular method prologs - vzeroupper - END_PROLOGUE .endm @@ -388,9 +385,6 @@ LEAF_END ThisPtrRetBufPrecodeWorker, _TEXT // Epilogue of all funclet calling helpers (CallXXXXFunclet) // .macro FUNCLET_CALL_EPILOGUE - // Mirror clearing of AVX state done by regular method epilogs - vzeroupper - free_stack stack_alloc_size pop_nonvol_reg rbp From 9aa22d9f113928fdadd435d6cb989b115cb9faf0 Mon Sep 17 00:00:00 2001 From: Jan Kotas Date: Mon, 19 May 2025 16:06:49 -0700 Subject: [PATCH 2/2] Fix formatting --- src/coreclr/jit/codegenxarch.cpp | 6 ++++-- src/coreclr/jit/morph.cpp | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 615e969e8ef3c8..1c17e0b4e1d497 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -4879,7 +4879,8 @@ void CodeGen::genCodeForShift(GenTree* tree) { int shiftByValue = (int)shiftBy->AsIntConCommon()->IconValue(); - if (tree->OperIsRotate() && compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2) && !tree->gtSetFlags()) + if (tree->OperIsRotate() && compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2) && + !tree->gtSetFlags()) { // If we have a contained source operand, we must emit rorx. // We may also use rorx for 64bit values when a mov would otherwise be required, @@ -4906,7 +4907,8 @@ void CodeGen::genCodeForShift(GenTree* tree) return; } } - else if (tree->OperIsShift() && compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2) && !tree->gtSetFlags()) + else if (tree->OperIsShift() && compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2) && + !tree->gtSetFlags()) { // Emit shlx, sarx, shrx if BMI2 is available instead of mov+shl, mov+sar, mov+shr. switch (tree->OperGet()) diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 2d2338adae6dd9..b52c8e48cbe0e4 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -9212,7 +9212,7 @@ GenTree* Compiler::fgOptimizeRelationalComparisonWithConst(GenTreeOp* cmp) // LE_UN/GT_UN(expr, int.MaxValue) => EQ/NE(RSZ(expr, 32), 0). else if (opts.OptimizationEnabled() && (op1->TypeIs(TYP_LONG) && (op2Value == UINT_MAX))) { - oper = (oper == GT_GT) ? GT_NE : GT_EQ; + oper = (oper == GT_GT) ? GT_NE : GT_EQ; GenTree* icon32 = gtNewIconNode(32, TYP_INT); icon32->SetMorphed(this);