Use simd for small prolog zeroing (ia32/x64) #32442

Closed
wants to merge 4 commits
250 changes: 197 additions & 53 deletions src/coreclr/src/jit/codegencommon.cpp
@@ -6306,73 +6306,217 @@ void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg,
}
#endif // TARGET_ARM64
noway_assert(uCntBytes == 0);

#elif defined(TARGET_XARCH)
/*
Generate the following code:

lea edi, [ebp/esp-OFFS]
mov ecx, <size>
xor eax, eax
rep stosd
*/
// As we output multiple instructions for SIMD zeroing, we want to balance code size with
// throughput, so cap the max size at 6 * max SIMD length
assert(compiler->getSIMDSupportLevel() >= SIMD_SSE2_Supported);
#ifdef TARGET_64BIT
int initMaxSIMDSize =
6 * (compiler->getSIMDSupportLevel() >= SIMD_AVX2_Supported ? YMM_REGSIZE_BYTES : XMM_REGSIZE_BYTES);
#else // !TARGET_64BIT
int initMaxSIMDSize = 8 * XMM_REGSIZE_BYTES;
#endif // TARGET_64BIT
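// (i.e. up to 6 * 32 = 192 bytes with AVX2 or 6 * 16 = 96 bytes with only SSE2 on x64,
// and 8 * 16 = 128 bytes on x86)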
if ((untrLclHi - untrLclLo) <= initMaxSIMDSize)
{
// Generate the following code:
//
// xor rax, rax
// vxorps ymm0, ymm0
// vmovdqu ymmword ptr [ebp/esp-OFFS], ymm0
// ...
// vmovdqu xmmword ptr [ebp/esp-OFFS], xmm0
// mov qword ptr [ebp/esp-OFFS], rax

// zero out the whole thing rounded up to a single stack slot size
unsigned blkSize = roundUp((untrLclHi - untrLclLo), (unsigned)sizeof(int));
unsigned i = 0;
emitter* emit = GetEmitter();
// Grab a non-argument, non-callee saved XMM reg
#ifdef UNIX_AMD64_ABI
// System V x64 first temp reg is xmm8
regNumber zeroSIMDReg = genRegNumFromMask(RBM_XMM8);
#else
// Windows first temp reg is xmm4
regNumber zeroSIMDReg = genRegNumFromMask(RBM_XMM4);
#endif // UNIX_AMD64_ABI
regNumber frameReg = genFramePointerReg();
regNumber zeroReg = REG_NA;
#ifdef TARGET_64BIT
// Need at least 2x YMM for the alignment cutover; however, due to alignment 2x takes more
// instructions, so use a higher cutover point.
unsigned ymmCutOver = YMM_REGSIZE_BYTES * 3;
bool AV2Support = compiler->getSIMDSupportLevel() >= SIMD_AVX2_Supported;

if (blkSize >= XMM_REGSIZE_BYTES && (blkSize < ymmCutOver || !AV2Support))
{
emit->emitIns_R_R(INS_xorps, EA_16BYTE, zeroSIMDReg, zeroSIMDReg);
}
else if (blkSize >= YMM_REGSIZE_BYTES)
{
noway_assert(AV2Support);
emit->emitIns_R_R(INS_xorps, EA_32BYTE, zeroSIMDReg, zeroSIMDReg);
}
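// At this point, if blkSize is at least XMM-sized, zeroSIMDReg holds zero at the widest
// width we will store: XMM below the YMM cutover (or without AVX2), otherwise YMM.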

if (blkSize >= ymmCutOver && AV2Support)
{
// We need to 32 byte align the YMM stores as there is a significant penalty
// if they cross a page boundary (and a minor one if they cross a cache line)
assert((genRegMask(initReg) & intRegState.rsCalleeRegArgMaskLiveIn) == 0); // initReg is not a live
// incoming argument reg
// Get block start
emit->emitIns_R_AR(INS_lea, EA_PTRSIZE, initReg, frameReg, untrLclLo);
// Zero first 32 bytes (may overlap with next 32 byte alignment)
emit->emitIns_AR_R(INS_movdqu, EA_ATTR(XMM_REGSIZE_BYTES), zeroSIMDReg, frameReg, untrLclLo);
emit->emitIns_AR_R(INS_movdqu, EA_ATTR(XMM_REGSIZE_BYTES), zeroSIMDReg, frameReg,
untrLclLo + XMM_REGSIZE_BYTES);
// Get next offset for next 32 byte alignment
// Add 32 to hit or go over the next alignment
emit->emitIns_R_I(INS_add, EA_PTRSIZE, initReg, 32);
// Clear all the unaligned bits
emit->emitIns_R_I(INS_and, EA_PTRSIZE, initReg, -32);
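// e.g. if initReg holds an address ending in 0x1C: add 32 -> 0x3C, and -32 -> 0x20,
// the next 32-byte boundary; if it was already aligned (0x20 -> 0x40), the skipped
// 32 bytes were covered by the two unconditional XMM stores above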
// Now do the aligned YMM clears; we do one fewer so the last two stores are XMM, for alignment
noway_assert(blkSize > YMM_REGSIZE_BYTES * 2);
unsigned ymmBlkSize = blkSize - YMM_REGSIZE_BYTES;
for (unsigned regSize = YMM_REGSIZE_BYTES; i + regSize <= ymmBlkSize; i += regSize)
{
emit->emitIns_AR_R(INS_movdqu, EA_ATTR(regSize), zeroSIMDReg, initReg, i);
}

noway_assert(regSet.rsRegsModified(RBM_EDI));
noway_assert(i > YMM_REGSIZE_BYTES);
}
#else // !TARGET_64BIT
if (blkSize >= XMM_REGSIZE_BYTES)
{
emit->emitIns_R_R(INS_xorps, EA_16BYTE, zeroSIMDReg, zeroSIMDReg);
}
#endif // TARGET_64BIT
if ((blkSize % XMM_REGSIZE_BYTES) != 0)
{
assert((genRegMask(initReg) & intRegState.rsCalleeRegArgMaskLiveIn) == 0); // initReg is not a live
// incoming argument reg
zeroReg = genGetZeroReg(initReg, pInitRegZeroed);
}
for (unsigned regSize = XMM_REGSIZE_BYTES; i + regSize <= blkSize; i += regSize)
{
emit->emitIns_AR_R(INS_movdqu, EA_ATTR(regSize), zeroSIMDReg, frameReg, untrLclLo + i);
}
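// e.g. with blkSize = 40 this loop emits two 16-byte stores (i = 0 and 16), leaving the
// trailing 8 bytes to the GPR stores below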

#ifdef UNIX_AMD64_ABI
// For register arguments we may have to save ECX and RDI on Amd64 System V OSes
if (intRegState.rsCalleeRegArgMaskLiveIn & RBM_RCX)
{
noway_assert(regSet.rsRegsModified(RBM_R12));
inst_RV_RV(INS_mov, REG_R12, REG_RCX);
regSet.verifyRegUsed(REG_R12);
for (; i + REGSIZE_BYTES <= blkSize; i += REGSIZE_BYTES)
{
noway_assert((blkSize % XMM_REGSIZE_BYTES) != 0);
emit->emitIns_AR_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, zeroReg, frameReg, untrLclLo + i);
}
#ifdef TARGET_64BIT
assert(i == blkSize || (i + sizeof(int) == blkSize));
if (i != blkSize)
{
noway_assert((blkSize % XMM_REGSIZE_BYTES) != 0);
emit->emitIns_AR_R(ins_Store(TYP_INT), EA_4BYTE, zeroReg, frameReg, untrLclLo + i);
i += sizeof(int);
}
#endif // TARGET_64BIT
assert(i == blkSize);
}

if (intRegState.rsCalleeRegArgMaskLiveIn & RBM_RDI)
else
{
noway_assert(regSet.rsRegsModified(RBM_R13));
inst_RV_RV(INS_mov, REG_R13, REG_RDI);
regSet.verifyRegUsed(REG_R13);
}
// Generate the following code:
//
// lea edi, [ebp/esp-OFFS]
// mov ecx, <size>
// xor eax, eax
// rep stosb

unsigned blkSize = (untrLclHi - untrLclLo);
noway_assert(blkSize > XMM_REGSIZE_BYTES * 2);
noway_assert(regSet.rsRegsModified(RBM_EDI));
#ifdef UNIX_AMD64_ABI
// For register arguments we may have to save ECX and RDI on Amd64 System V OSes
if (intRegState.rsCalleeRegArgMaskLiveIn & RBM_RCX)
{
noway_assert(regSet.rsRegsModified(RBM_R12));
inst_RV_RV(INS_mov, REG_R12, REG_RCX);
regSet.verifyRegUsed(REG_R12);
}

if (intRegState.rsCalleeRegArgMaskLiveIn & RBM_RDI)
{
noway_assert(regSet.rsRegsModified(RBM_R13));
inst_RV_RV(INS_mov, REG_R13, REG_RDI);
regSet.verifyRegUsed(REG_R13);
}
#else // !UNIX_AMD64_ABI
// For register arguments we may have to save ECX
if (intRegState.rsCalleeRegArgMaskLiveIn & RBM_ECX)
{
noway_assert(regSet.rsRegsModified(RBM_ESI));
inst_RV_RV(INS_mov, REG_ESI, REG_ECX);
regSet.verifyRegUsed(REG_ESI);
}
#endif // !UNIX_AMD64_ABI
noway_assert((intRegState.rsCalleeRegArgMaskLiveIn & RBM_EAX) == 0);

GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_EDI, genFramePointerReg(), untrLclLo);
regSet.verifyRegUsed(REG_EDI);
emitter* emit = GetEmitter();
regNumber frameReg = genFramePointerReg();
#ifdef TARGET_64BIT
// We need to 32 byte align the rep stosd as there is a penalty if it is not 32 byte aligned
// Grab a non-argument, non-callee saved XMM reg
#ifdef UNIX_AMD64_ABI
// System V x64 first temp reg is xmm8
regNumber zeroSIMDReg = genRegNumFromMask(RBM_XMM8);
#else // !UNIX_AMD64_ABI
// Windows first temp reg is xmm4
regNumber zeroSIMDReg = genRegNumFromMask(RBM_XMM4);
#endif // UNIX_AMD64_ABI
// Zero xmm reg
emit->emitIns_R_R(INS_xorps, EA_16BYTE, zeroSIMDReg, zeroSIMDReg);
// Get block start
emit->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_EDI, frameReg, untrLclLo);
// Zero first 32 bytes (may overlap with next 32 byte alignment)
emit->emitIns_AR_R(INS_movdqu, EA_ATTR(XMM_REGSIZE_BYTES), zeroSIMDReg, frameReg, untrLclLo);
emit->emitIns_AR_R(INS_movdqu, EA_ATTR(XMM_REGSIZE_BYTES), zeroSIMDReg, frameReg,
untrLclLo + XMM_REGSIZE_BYTES);
// Stash current block start
emit->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_ECX, REG_EDI);
// Add 32 to hit or go over the next alignment
emit->emitIns_R_I(INS_add, EA_PTRSIZE, REG_EDI, 32);
// Clear all the unaligned bits, RDI now contains the aligned address
emit->emitIns_R_I(INS_and, EA_PTRSIZE, REG_EDI, -32);
// Copy the aligned address
emit->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_EAX, REG_EDI);
// Subtract the unaligned address from the aligned one to get the amount to subtract from the count
emit->emitIns_R_R(INS_sub, EA_PTRSIZE, REG_EAX, REG_ECX);
// Output count of bytes to clear
inst_RV_IV(INS_mov, REG_ECX, blkSize, EA_4BYTE);
// Subtract the unaligned bytes already cleared
emit->emitIns_R_R(INS_sub, EA_PTRSIZE, REG_ECX, REG_EAX);
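// ECX is now blkSize - (alignedAddr - blockStart): the bytes left to clear from the
// 32-byte-aligned RDI to the end of the block; the bytes skipped at the start were
// already zeroed by the two XMM stores above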
#else // !TARGET_64BIT
emit->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_EDI, frameReg, untrLclLo);
#endif // TARGET_64BIT
regSet.verifyRegUsed(REG_EDI);

inst_RV_IV(INS_mov, REG_ECX, (untrLclHi - untrLclLo) / sizeof(int), EA_4BYTE);
instGen_Set_Reg_To_Zero(EA_PTRSIZE, REG_EAX);
instGen(INS_r_stosd);
instGen_Set_Reg_To_Zero(EA_PTRSIZE, REG_EAX);
instGen(INS_r_stosb);

#ifdef UNIX_AMD64_ABI
// Move back the argument registers
if (intRegState.rsCalleeRegArgMaskLiveIn & RBM_RCX)
{
inst_RV_RV(INS_mov, REG_RCX, REG_R12);
}

if (intRegState.rsCalleeRegArgMaskLiveIn & RBM_RDI)
{
inst_RV_RV(INS_mov, REG_RDI, REG_R13);
}
#else // !UNIX_AMD64_ABI
// Move back the argument registers
if (intRegState.rsCalleeRegArgMaskLiveIn & RBM_ECX)
{
inst_RV_RV(INS_mov, REG_ECX, REG_ESI);
}
#endif // !UNIX_AMD64_ABI

}
#else // TARGET*
#error Unsupported or unset target architecture
#endif // TARGET*
}