Skip to content

Commit

Permalink
SUB reg, reg, reg
Browse files Browse the repository at this point in the history
enable NDD on genCodeForBinary

consolidate TakesLegacyPromotedEvexPrefix logic.

ensure register encoding is correct under legacy-promoted-evex encoding.

Make sure the overflow check is correctly emitted.

simplify the compiler setup logic.

emitInsNddBinary

make sure REX will not be added when EVEX is present.

resolve comment and clean up.

enable more NDD instructions.

bug fixes

enable imul

add emitter unit tests, and fix encoding error for CMOVcc

bug fixes:
1. make sure RWR_RRD_SHF has correct register update mode
2. make sure shift instructions will get correct opcode in RR path.

refactor emitInsBinary

clean up

clean up and refactor some code

Adding updated coredistools.dll built with LLVM 19.1.0

make sure the code size estimation is correct for some apx promoted instructions.

add tuning knob to EVEX.ND feature.

flip the Evex.nd knob.

put NDD control knob to the correct place.
  • Loading branch information
Ruihan-Yin committed Nov 19, 2024
1 parent 42c6cfc commit 2740ca6
Show file tree
Hide file tree
Showing 10 changed files with 1,067 additions and 130 deletions.
166 changes: 148 additions & 18 deletions src/coreclr/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -770,11 +770,19 @@ void CodeGen::genCodeForNegNot(GenTree* tree)
GenTree* operand = tree->gtGetOp1();
assert(operand->isUsedFromReg());
regNumber operandReg = genConsumeReg(operand);
instruction ins = genGetInsForOper(tree->OperGet(), targetType);

inst_Mov(targetType, targetReg, operandReg, /* canSkip */ true);
if (JitConfig.JitEnableApxNDD() && GetEmitter()->IsApxNDDEncodableInstruction(ins) && (targetReg != operandReg))
{
GetEmitter()->emitIns_R_R(ins, emitTypeSize(operand), targetReg, operandReg, INS_OPTS_EVEX_nd);
}
else
{
inst_Mov(targetType, targetReg, operandReg, /* canSkip */ true);

instruction ins = genGetInsForOper(tree->OperGet(), targetType);
inst_RV(ins, targetReg, targetType);
instruction ins = genGetInsForOper(tree->OperGet(), targetType);
inst_RV(ins, targetReg, targetType);
}
}

genProduceReg(tree);
Expand Down Expand Up @@ -1189,12 +1197,49 @@ void CodeGen::genCodeForBinary(GenTreeOp* treeNode)
// reg3 = reg3 op reg2
else
{
var_types op1Type = op1->TypeGet();
inst_Mov(op1Type, targetReg, op1reg, /* canSkip */ false);
regSet.verifyRegUsed(targetReg);
gcInfo.gcMarkRegPtrVal(targetReg, op1Type);
dst = treeNode;
src = op2;
if (JitConfig.JitEnableApxNDD() && emit->IsApxNDDEncodableInstruction(ins) && !varTypeIsFloating(treeNode))
{
// TODO-xarch-apx:
// APX can provide optimal code gen in this case using NDD feature:
// reg3 = op1 op op2 without extra mov

// see if it can be optimized by inc/dec
if (oper == GT_ADD && op2->isContainedIntOrIImmed() && !treeNode->gtOverflowEx())
{
if (op2->IsIntegralConst(1))
{
emit->emitIns_R_R(INS_inc, emitTypeSize(treeNode), targetReg, op1reg, INS_OPTS_EVEX_nd);
genProduceReg(treeNode);
return;
}
else if (op2->IsIntegralConst(-1))
{
emit->emitIns_R_R(INS_dec, emitTypeSize(treeNode), targetReg, op1reg, INS_OPTS_EVEX_nd);
genProduceReg(treeNode);
return;
}
}

assert(op1reg != targetReg);
assert(op2reg != targetReg);
emit->emitInsBinary(ins, emitTypeSize(treeNode), op1, op2, targetReg);
if (treeNode->gtOverflowEx())
{
assert(oper == GT_ADD || oper == GT_SUB);
genCheckOverflow(treeNode);
}
genProduceReg(treeNode);
return;
}
else
{
var_types op1Type = op1->TypeGet();
inst_Mov(op1Type, targetReg, op1reg, /* canSkip */ false);
regSet.verifyRegUsed(targetReg);
gcInfo.gcMarkRegPtrVal(targetReg, op1Type);
dst = treeNode;
src = op2;
}
}

// try to use an inc or dec
Expand All @@ -1213,6 +1258,7 @@ void CodeGen::genCodeForBinary(GenTreeOp* treeNode)
return;
}
}

regNumber r = emit->emitInsBinary(ins, emitTypeSize(treeNode), dst, src);
noway_assert(r == targetReg);

Expand Down Expand Up @@ -1326,6 +1372,25 @@ void CodeGen::genCodeForMul(GenTreeOp* treeNode)
}
assert(regOp->isUsedFromReg());

if (JitConfig.JitEnableApxNDD() && emit->IsApxNDDEncodableInstruction(ins) && regOp->GetRegNum() != mulTargetReg)
{
// use NDD form to optimize this form:
// mov targetReg, regOp
// imul targetReg, rmOp
// to imul targetReg, regOp rmOp.
emit->emitInsBinary(ins, size, regOp, rmOp, mulTargetReg);
if (requiresOverflowCheck)
{
// Overflow checking is only used for non-floating point types
noway_assert(!varTypeIsFloating(treeNode));

genCheckOverflow(treeNode);
}
genProduceReg(treeNode);
return;
}


// Setup targetReg when neither of the source operands was a matching register
inst_Mov(targetType, mulTargetReg, regOp->GetRegNum(), /* canSkip */ true);

Expand Down Expand Up @@ -4873,6 +4938,24 @@ void CodeGen::genCodeForShift(GenTree* tree)
genProduceReg(tree);
return;
}


if (JitConfig.JitEnableApxNDD() && GetEmitter()->IsApxNDDEncodableInstruction(ins) && (tree->GetRegNum() != operandReg))
{
ins = genMapShiftInsToShiftByConstantIns(ins, shiftByValue);
// If APX is available, we can use NDD to optimize the case when LSRA failed to avoid explicit mov.
// this case might be rarely hit.
if (shiftByValue == 1)
{
GetEmitter()->emitIns_R_R(ins, emitTypeSize(tree), tree->GetRegNum(), operandReg, INS_OPTS_EVEX_nd);
}
else
{
GetEmitter()->emitIns_R_R_I(ins, emitTypeSize(tree), tree->GetRegNum(), operandReg, shiftByValue, INS_OPTS_EVEX_nd);
}
genProduceReg(tree);
return;
}
#endif
// First, move the operand to the destination register and
// later on perform the shift in-place.
Expand Down Expand Up @@ -4919,6 +5002,15 @@ void CodeGen::genCodeForShift(GenTree* tree)
// The operand to be shifted must not be in ECX
noway_assert(operandReg != REG_RCX);

if (JitConfig.JitEnableApxNDD() && GetEmitter()->IsApxNDDEncodableInstruction(ins) && (tree->GetRegNum() != operandReg))
{
// If APX is available, we can use NDD to optimize the case when LSRA failed to avoid explicit mov.
// this case might be rarely hit.
GetEmitter()->emitIns_R_R(ins, emitTypeSize(tree), tree->GetRegNum(), operandReg, INS_OPTS_EVEX_nd);
genProduceReg(tree);
return;
}

inst_Mov(targetType, tree->GetRegNum(), operandReg, /* canSkip */ true);
inst_RV(ins, tree->GetRegNum(), targetType);
}
Expand Down Expand Up @@ -9064,8 +9156,22 @@ void CodeGen::genAmd64EmitterUnitTestsApx()
genDefineTempLabel(genCreateTempLabel());

// This test suite needs REX2 enabled.
assert(theEmitter->emitComp->DoJitStressRex2Encoding());
// assert(theEmitter->emitComp->DoJitStressRex2Encoding());

theEmitter->emitIns_R_R(INS_add, EA_1BYTE, REG_EAX, REG_ECX);
theEmitter->emitIns_R_R(INS_add, EA_2BYTE, REG_EAX, REG_ECX);
theEmitter->emitIns_R_R(INS_add, EA_4BYTE, REG_EAX, REG_ECX);
theEmitter->emitIns_R_R(INS_add, EA_8BYTE, REG_EAX, REG_ECX);
theEmitter->emitIns_R_R(INS_or, EA_4BYTE, REG_EAX, REG_ECX);
theEmitter->emitIns_R_R(INS_adc, EA_4BYTE, REG_EAX, REG_ECX);
theEmitter->emitIns_R_R(INS_sbb, EA_4BYTE, REG_EAX, REG_ECX);
theEmitter->emitIns_R_R(INS_and, EA_4BYTE, REG_EAX, REG_ECX);
theEmitter->emitIns_R_R(INS_sub, EA_4BYTE, REG_EAX, REG_ECX);
theEmitter->emitIns_R_R(INS_xor, EA_4BYTE, REG_EAX, REG_ECX);
theEmitter->emitIns_R_R(INS_cmp, EA_4BYTE, REG_EAX, REG_ECX);
theEmitter->emitIns_R_R(INS_test, EA_4BYTE, REG_EAX, REG_ECX);
theEmitter->emitIns_R_R(INS_bsf, EA_4BYTE, REG_EAX, REG_ECX);
theEmitter->emitIns_R_R(INS_bsr, EA_4BYTE, REG_EAX, REG_ECX);
theEmitter->emitIns_R_R(INS_add, EA_1BYTE, REG_EAX, REG_ECX);
theEmitter->emitIns_R_R(INS_add, EA_2BYTE, REG_EAX, REG_ECX);
theEmitter->emitIns_R_R(INS_add, EA_4BYTE, REG_EAX, REG_ECX);
Expand All @@ -9081,7 +9187,7 @@ void CodeGen::genAmd64EmitterUnitTestsApx()
theEmitter->emitIns_R_R(INS_bsf, EA_4BYTE, REG_EAX, REG_ECX);
theEmitter->emitIns_R_R(INS_bsr, EA_4BYTE, REG_EAX, REG_ECX);

theEmitter->emitIns_R_R(INS_cmovo, EA_4BYTE, REG_EAX, REG_ECX);
// theEmitter->emitIns_R_R(INS_cmovo, EA_4BYTE, REG_EAX, REG_ECX);

theEmitter->emitIns_Mov(INS_mov, EA_4BYTE, REG_EAX, REG_ECX, false);
theEmitter->emitIns_Mov(INS_movsx, EA_2BYTE, REG_EAX, REG_ECX, false);
Expand All @@ -9101,13 +9207,12 @@ void CodeGen::genAmd64EmitterUnitTestsApx()
theEmitter->emitIns_R_I(INS_xor, EA_4BYTE, REG_EAX, 0x05);
theEmitter->emitIns_R_I(INS_cmp, EA_4BYTE, REG_EAX, 0x05);
theEmitter->emitIns_R_I(INS_test, EA_4BYTE, REG_EAX, 0x05);

theEmitter->emitIns_R_I(INS_mov, EA_4BYTE, REG_EAX, 0xE0);

// JIT tend to compress imm64 to imm32 if higher half is all-zero, make sure this test checks the path for imm64.
// // JIT tend to compress imm64 to imm32 if higher half is all-zero, make sure this test checks the path for imm64.
theEmitter->emitIns_R_I(INS_mov, EA_8BYTE, REG_RAX, 0xFFFF000000000000);

// shf reg, cl
// // shf reg, cl
theEmitter->emitIns_R(INS_rol, EA_4BYTE, REG_EAX);
theEmitter->emitIns_R(INS_ror, EA_4BYTE, REG_EAX);
theEmitter->emitIns_R(INS_rcl, EA_4BYTE, REG_EAX);
Expand Down Expand Up @@ -9193,8 +9298,8 @@ void CodeGen::genAmd64EmitterUnitTestsApx()
theEmitter->emitIns_S_I(INS_shl_N, EA_4BYTE, 0, 0, 4);
theEmitter->emitIns_S(INS_shl_1, EA_4BYTE, 0, 4);

// theEmitter->emitIns_R_S(INS_movsx, EA_2BYTE, REG_ECX, 1, 2);
// theEmitter->emitIns_R_S(INS_movzx, EA_2BYTE, REG_EAX, 1, 2);
theEmitter->emitIns_R_S(INS_movsx, EA_2BYTE, REG_ECX, 1, 2);
theEmitter->emitIns_R_S(INS_movzx, EA_2BYTE, REG_EAX, 1, 2);
theEmitter->emitIns_R_S(INS_cmovo, EA_4BYTE, REG_EAX, 1, 2);

theEmitter->emitIns_R(INS_pop, EA_PTRSIZE, REG_EAX);
Expand Down Expand Up @@ -9226,8 +9331,8 @@ void CodeGen::genAmd64EmitterUnitTestsApx()
theEmitter->emitIns_R_R_I(INS_shrd, EA_2BYTE, REG_EAX, REG_ECX, 5);
// TODO-XArch-apx: S_R_I path only accepts SSE or VEX instructions,
// so I assume shld/shrd will not be taking the first argument from stack.
// theEmitter->emitIns_S_R_I(INS_shld, EA_2BYTE, 1, 2, REG_EAX, 5);
// theEmitter->emitIns_S_R_I(INS_shrd, EA_2BYTE, 1, 2, REG_EAX, 5);
theEmitter->emitIns_S_R_I(INS_shld, EA_2BYTE, 1, 2, REG_EAX, 5);
theEmitter->emitIns_S_R_I(INS_shrd, EA_2BYTE, 1, 2, REG_EAX, 5);

theEmitter->emitIns_AR_R(INS_cmpxchg, EA_2BYTE, REG_EAX, REG_EDX, 2);

Expand All @@ -9244,6 +9349,31 @@ void CodeGen::genAmd64EmitterUnitTestsApx()

theEmitter->emitIns_R(INS_div, EA_8BYTE, REG_EDX);
theEmitter->emitIns_R(INS_mulEAX, EA_8BYTE, REG_EDX);

theEmitter->emitIns_R_R_R(INS_add, EA_8BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd);
theEmitter->emitIns_R_R_R(INS_sub, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd);
theEmitter->emitIns_R_R_R(INS_or, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd);
theEmitter->emitIns_R_R_R(INS_and, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd);
theEmitter->emitIns_R_R_R(INS_xor, EA_1BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd);

theEmitter->emitIns_R_R_I(INS_or, EA_2BYTE, REG_R10, REG_EAX, 10565, INS_OPTS_EVEX_nd);
theEmitter->emitIns_R_R_I(INS_or, EA_8BYTE, REG_R10, REG_EAX, 10, INS_OPTS_EVEX_nd);
theEmitter->emitIns_R_R_S(INS_or, EA_8BYTE, REG_R10, REG_EAX, 0, 1, INS_OPTS_EVEX_nd);

theEmitter->emitIns_R_R(INS_neg, EA_2BYTE, REG_R10, REG_ECX, INS_OPTS_EVEX_nd);

theEmitter->emitIns_R_R(INS_shl, EA_2BYTE, REG_R11, REG_EAX, INS_OPTS_EVEX_nd);
theEmitter->emitIns_R_R(INS_shl_1, EA_2BYTE, REG_R11, REG_EAX, INS_OPTS_EVEX_nd);
theEmitter->emitIns_R_R_I(INS_shl_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd);
theEmitter->emitIns_R_R_I(INS_shl_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd);

theEmitter->emitIns_R_R(INS_inc, EA_2BYTE, REG_R11, REG_ECX, INS_OPTS_EVEX_nd);
theEmitter->emitIns_R_R(INS_dec, EA_2BYTE, REG_R11, REG_ECX, INS_OPTS_EVEX_nd);

theEmitter->emitIns_R_R_R(INS_cmovo, EA_4BYTE, REG_R12, REG_R11, REG_EAX, INS_OPTS_EVEX_nd);

theEmitter->emitIns_R_R_R(INS_imul, EA_4BYTE, REG_R12, REG_R11, REG_ECX, INS_OPTS_EVEX_nd);
theEmitter->emitIns_R_R_S(INS_imul, EA_4BYTE, REG_R12, REG_R11, 0, 1, INS_OPTS_EVEX_nd);
}

#endif // defined(DEBUG) && defined(TARGET_AMD64)
Expand Down
3 changes: 2 additions & 1 deletion src/coreclr/jit/compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2297,12 +2297,13 @@ void Compiler::compSetProcessor()
codeGen->GetEmitter()->SetUseEvexEncoding(true);
// TODO-XArch-AVX512 : Revisit other flags to be set once avx512 instructions are added.
}
if (canUseRex2Encoding() || DoJitStressRex2Encoding())
if (canUseApxEncodings())
{
// TODO-Xarch-apx:
// At this stage, since no machine will pass the CPUID check for APX, we need a special stress mode that
// enables REX2 on incompatible platform, `DoJitStressRex2Encoding` is expected to be removed eventually.
codeGen->GetEmitter()->SetUseRex2Encoding(true);
codeGen->GetEmitter()->SetUsePromotedEVEXEncoding(true);
}
}
#endif // TARGET_XARCH
Expand Down
24 changes: 21 additions & 3 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -9946,12 +9946,12 @@ class Compiler
}

//------------------------------------------------------------------------
// canUseRex2Encoding - Answer the question: Is Rex2 encoding supported on this target.
// canUseApxEncodings - Answer the question: are Apx encodings - rex2 and promoted EVEX supported on this target.
//
// Returns:
// `true` if Rex2 encoding is supported, `false` if not.
// `true` if Apx encodings are supported, `false` if not.
//
bool canUseRex2Encoding() const
bool canUseApxEncodings() const
{
#ifdef DEBUG
if (JitConfig.JitBypassAPXCheck())
Expand Down Expand Up @@ -10015,6 +10015,24 @@ class Compiler

return false;
}

//------------------------------------------------------------------------
// DoJitStressPromotedEvexEncoding - Answer the question: do we force promoted EVEX encoding?
//
// Returns:
// `true` if user requests promoted EVEX encoding.
//
bool DoJitStressPromotedEvexEncoding() const
{
    // Defaults to "not requested"; only DEBUG builds consult the
    // JitStressPromotedEVEXEncoding config value.
    bool stressRequested = false;

#ifdef DEBUG
    stressRequested = (JitConfig.JitStressPromotedEVEXEncoding() != 0);
#endif // DEBUG

    return stressRequested;
}
#endif // TARGET_XARCH

/*
Expand Down
22 changes: 21 additions & 1 deletion src/coreclr/jit/emit.h
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,7 @@ class emitter
SetUseVEXEncoding(false);
SetUseEvexEncoding(false);
SetUseRex2Encoding(false);
SetUsePromotedEVEXEncoding(false);
#endif // TARGET_XARCH

emitDataSecCur = nullptr;
Expand Down Expand Up @@ -793,7 +794,10 @@ class emitter
// For normal and embedded broadcast intrinsics, EVEX.L'L has the same semantic, vector length.
// For embedded rounding, EVEX.L'L semantic changes to indicate the rounding mode.
// Multiple bits in _idEvexbContext are used to inform emitter to specially handle the EVEX.L'L bits.
unsigned _idEvexbContext : 2;
unsigned _idCustom5 : 2;

#define _idEvexbContext _idCustom5 /* Evex.b: embedded broadcast, embedded rounding, embedded SAE */
#define _idEvexNdContext _idCustom5 /* bits used for the APX-EVEX.nd context for promoted legacy instructions */
#endif // TARGET_XARCH

#ifdef TARGET_ARM64
Expand Down Expand Up @@ -1728,6 +1732,17 @@ class emitter
assert(!idIsEvexZContextSet());
_idEvexZContext = 1;
}

bool idIsEvexNdContextSet() const
{
    // Any non-zero value in the EVEX.nd context bits counts as "set".
    const bool ndContextSet = (_idEvexNdContext != 0);
    return ndContextSet;
}

// Sets the EVEX.nd context bits to 1.
// Asserts that the context has not been set already.
// NOTE(review): _idEvexNdContext and _idEvexbContext are both #defined to
// _idCustom5 in this file, so the assert also fires when the EVEX.b context
// is non-zero - confirm the two are never needed on the same instruction.
void idSetEvexNdContext()
{
assert(!idIsEvexNdContextSet());
_idEvexNdContext = 1;
}
#endif

#ifdef TARGET_ARMARCH
Expand Down Expand Up @@ -2531,7 +2546,12 @@ class emitter
CORINFO_FIELD_HANDLE emitSimdMaskConst(simdmask_t constValue);
#endif // FEATURE_MASKED_HW_INTRINSICS
#endif // FEATURE_SIMD

#if defined(TARGET_XARCH)
regNumber emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src, regNumber targetReg = REG_NA);
#else
regNumber emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src);
#endif
regNumber emitInsTernary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src1, GenTree* src2);
void emitInsLoadInd(instruction ins, emitAttr attr, regNumber dstReg, GenTreeIndir* mem);
void emitInsStoreInd(instruction ins, emitAttr attr, GenTreeStoreInd* mem);
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/emitfmtsxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ IF_DEF(RRW_RRW, IS_R1_RW|IS_R2_RW, NONE) // r/w
IF_DEF(RRD_RRD_CNS, IS_R1_RD|IS_R2_RD, SCNS) // read reg1, read reg2, const
IF_DEF(RWR_RRD_CNS, IS_R1_WR|IS_R2_RD, SCNS) // write reg1, read reg2, const
IF_DEF(RRW_RRD_CNS, IS_R1_RW|IS_R2_RD, SCNS) // r/w reg1, read reg2, const
IF_DEF(RWR_RRD_SHF, IS_R1_WR|IS_R2_RD, SCNS) // write reg1, read reg2, shift

IF_DEF(RRD_RRD_RRD, IS_R1_RD|IS_R2_RD|IS_R3_RD, NONE) // read reg1, read reg2, read reg3
IF_DEF(RWR_RRD_RRD, IS_R1_WR|IS_R2_RD|IS_R3_RD, NONE) // write reg1, read reg2, read reg3
Expand Down
Loading

0 comments on commit 2740ca6

Please sign in to comment.