diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index bd80187eea6a4..e8723723cda4c 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -433,12 +433,13 @@ void CodeGen::instGen_Set_Reg_To_Imm(emitAttr size, else { // For section constant, the immediate will be relocatable - GetEmitter()->emitIns_R_I(INS_mov, size, reg, imm DEBUGARG(targetHandle) DEBUGARG(gtFlags)); + GetEmitter()->emitIns_R_I(INS_mov, size, reg, imm, + INS_OPTS_NONE DEBUGARG(targetHandle) DEBUGARG(gtFlags)); } } else { - GetEmitter()->emitIns_R_I(INS_mov, size, reg, imm DEBUGARG(targetHandle) DEBUGARG(gtFlags)); + GetEmitter()->emitIns_R_I(INS_mov, size, reg, imm, INS_OPTS_NONE DEBUGARG(targetHandle) DEBUGARG(gtFlags)); } } regSet.verifyRegUsed(reg); @@ -769,12 +770,18 @@ void CodeGen::genCodeForNegNot(GenTree* tree) { GenTree* operand = tree->gtGetOp1(); assert(operand->isUsedFromReg()); - regNumber operandReg = genConsumeReg(operand); + regNumber operandReg = genConsumeReg(operand); + instruction ins = genGetInsForOper(tree->OperGet(), targetType); - inst_Mov(targetType, targetReg, operandReg, /* canSkip */ true); - - instruction ins = genGetInsForOper(tree->OperGet(), targetType); - inst_RV(ins, targetReg, targetType); + if (GetEmitter()->DoJitUseApxNDD(ins) && (targetReg != operandReg)) + { + GetEmitter()->emitIns_R_R(ins, emitTypeSize(operand), targetReg, operandReg, INS_OPTS_EVEX_nd); + } + else + { + inst_Mov(targetType, targetReg, operandReg, /* canSkip */ true); + inst_RV(ins, targetReg, targetType); + } } genProduceReg(tree); @@ -1189,12 +1196,49 @@ void CodeGen::genCodeForBinary(GenTreeOp* treeNode) // reg3 = reg3 op reg2 else { - var_types op1Type = op1->TypeGet(); - inst_Mov(op1Type, targetReg, op1reg, /* canSkip */ false); - regSet.verifyRegUsed(targetReg); - gcInfo.gcMarkRegPtrVal(targetReg, op1Type); - dst = treeNode; - src = op2; + if (emit->DoJitUseApxNDD(ins) && !varTypeIsFloating(treeNode)) + { + // 
TODO-xarch-apx: + // APX can provide optimal code gen in this case using NDD feature: + // reg3 = op1 op op2 without extra mov + + // see if it can be optimized by inc/dec + if (oper == GT_ADD && op2->isContainedIntOrIImmed() && !treeNode->gtOverflowEx()) + { + if (op2->IsIntegralConst(1)) + { + emit->emitIns_R_R(INS_inc, emitTypeSize(treeNode), targetReg, op1reg, INS_OPTS_EVEX_nd); + genProduceReg(treeNode); + return; + } + else if (op2->IsIntegralConst(-1)) + { + emit->emitIns_R_R(INS_dec, emitTypeSize(treeNode), targetReg, op1reg, INS_OPTS_EVEX_nd); + genProduceReg(treeNode); + return; + } + } + + assert(op1reg != targetReg); + assert(op2reg != targetReg); + emit->emitInsBinary(ins, emitTypeSize(treeNode), op1, op2, targetReg); + if (treeNode->gtOverflowEx()) + { + assert(oper == GT_ADD || oper == GT_SUB); + genCheckOverflow(treeNode); + } + genProduceReg(treeNode); + return; + } + else + { + var_types op1Type = op1->TypeGet(); + inst_Mov(op1Type, targetReg, op1reg, /* canSkip */ false); + regSet.verifyRegUsed(targetReg); + gcInfo.gcMarkRegPtrVal(targetReg, op1Type); + dst = treeNode; + src = op2; + } } // try to use an inc or dec @@ -1213,6 +1257,7 @@ void CodeGen::genCodeForBinary(GenTreeOp* treeNode) return; } } + regNumber r = emit->emitInsBinary(ins, emitTypeSize(treeNode), dst, src); noway_assert(r == targetReg); @@ -1326,6 +1371,24 @@ void CodeGen::genCodeForMul(GenTreeOp* treeNode) } assert(regOp->isUsedFromReg()); + if (emit->DoJitUseApxNDD(ins) && regOp->GetRegNum() != mulTargetReg) + { + // use NDD form to optimize this form: + // mov targetReg, regOp + // imul targetReg, rmOp + // to imul targetReg, regOp rmOp. 
+ emit->emitInsBinary(ins, size, regOp, rmOp, mulTargetReg); + if (requiresOverflowCheck) + { + // Overflow checking is only used for non-floating point types + noway_assert(!varTypeIsFloating(treeNode)); + + genCheckOverflow(treeNode); + } + genProduceReg(treeNode); + return; + } + // Setup targetReg when neither of the source operands was a matching register inst_Mov(targetType, mulTargetReg, regOp->GetRegNum(), /* canSkip */ true); @@ -4438,23 +4501,23 @@ void CodeGen::genCodeForLockAdd(GenTreeOp* node) if (imm == 1) { // inc [addr] - GetEmitter()->emitIns_AR(INS_inc, size, addr->GetRegNum(), 0); + GetEmitter()->emitIns_AR(INS_inc_no_evex, size, addr->GetRegNum(), 0); } else if (imm == -1) { // dec [addr] - GetEmitter()->emitIns_AR(INS_dec, size, addr->GetRegNum(), 0); + GetEmitter()->emitIns_AR(INS_dec_no_evex, size, addr->GetRegNum(), 0); } else { // add [addr], imm - GetEmitter()->emitIns_I_AR(INS_add, size, imm, addr->GetRegNum(), 0); + GetEmitter()->emitIns_I_AR(INS_add_no_evex, size, imm, addr->GetRegNum(), 0); } } else { // add [addr], data - GetEmitter()->emitIns_AR_R(INS_add, size, data->GetRegNum(), addr->GetRegNum(), 0); + GetEmitter()->emitIns_AR_R(INS_add_no_evex, size, data->GetRegNum(), addr->GetRegNum(), 0); } } @@ -4481,7 +4544,7 @@ void CodeGen::genLockedInstructions(GenTreeOp* node) if (node->OperIs(GT_XORR, GT_XAND)) { - const instruction ins = node->OperIs(GT_XORR) ? INS_or : INS_and; + const instruction ins = node->OperIs(GT_XORR) ? INS_or_no_evex : INS_and_no_evex; if (node->IsUnusedValue()) { @@ -4873,6 +4936,24 @@ void CodeGen::genCodeForShift(GenTree* tree) genProduceReg(tree); return; } + + if (GetEmitter()->DoJitUseApxNDD(ins) && (tree->GetRegNum() != operandReg)) + { + ins = genMapShiftInsToShiftByConstantIns(ins, shiftByValue); + // If APX is available, we can use NDD to optimize the case when LSRA failed to avoid explicit mov. + // this case might be rarely hit. 
+ if (shiftByValue == 1) + { + GetEmitter()->emitIns_R_R(ins, emitTypeSize(tree), tree->GetRegNum(), operandReg, INS_OPTS_EVEX_nd); + } + else + { + GetEmitter()->emitIns_R_R_I(ins, emitTypeSize(tree), tree->GetRegNum(), operandReg, shiftByValue, + INS_OPTS_EVEX_nd); + } + genProduceReg(tree); + return; + } #endif // First, move the operand to the destination register and // later on perform the shift in-place. @@ -4919,6 +5000,15 @@ void CodeGen::genCodeForShift(GenTree* tree) // The operand to be shifted must not be in ECX noway_assert(operandReg != REG_RCX); + if (GetEmitter()->DoJitUseApxNDD(ins) && (tree->GetRegNum() != operandReg)) + { + // If APX is available, we can use NDD to optimize the case when LSRA failed to avoid explicit mov. + // this case might be rarely hit. + GetEmitter()->emitIns_R_R(ins, emitTypeSize(tree), tree->GetRegNum(), operandReg, INS_OPTS_EVEX_nd); + genProduceReg(tree); + return; + } + inst_Mov(targetType, tree->GetRegNum(), operandReg, /* canSkip */ true); inst_RV(ins, tree->GetRegNum(), targetType); } @@ -9270,6 +9360,87 @@ void CodeGen::genAmd64EmitterUnitTestsApx() theEmitter->emitIns_S(INS_neg, EA_2BYTE, 0, 0); theEmitter->emitIns_S(INS_not, EA_2BYTE, 0, 0); + + // APX-EVEX + + theEmitter->emitIns_R_R_R(INS_add, EA_8BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_R(INS_sub, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_R(INS_or, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_R(INS_and, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_R(INS_xor, EA_1BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R_I(INS_or, EA_2BYTE, REG_R10, REG_EAX, 10565, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_I(INS_or, EA_8BYTE, REG_R10, REG_EAX, 10, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_S(INS_or, EA_8BYTE, REG_R10, REG_EAX, 0, 1, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R(INS_neg, 
EA_2BYTE, REG_R10, REG_ECX, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R(INS_shl, EA_2BYTE, REG_R11, REG_EAX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R(INS_shl_1, EA_2BYTE, REG_R11, REG_EAX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_I(INS_shl_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_I(INS_shl_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R(INS_inc, EA_2BYTE, REG_R11, REG_ECX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R(INS_dec, EA_2BYTE, REG_R11, REG_ECX, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R_R(INS_cmovo, EA_4BYTE, REG_R12, REG_R11, REG_EAX, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R_R(INS_imul, EA_4BYTE, REG_R12, REG_R11, REG_ECX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_S(INS_imul, EA_4BYTE, REG_R12, REG_R11, 0, 1, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R(INS_add, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_sub, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_and, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_or, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_xor, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_inc, EA_4BYTE, REG_R12, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_dec, EA_4BYTE, REG_R12, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_I(INS_add, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_I(INS_sub, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_I(INS_and, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_I(INS_or, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_I(INS_xor, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_S(INS_add, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_sub, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_and, EA_4BYTE, REG_R12, 0, 1, 
INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_or, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_xor, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R(INS_neg, EA_2BYTE, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_shl, EA_2BYTE, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_shl_1, EA_2BYTE, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_I(INS_shl_N, EA_2BYTE, REG_R11, 7, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_I(INS_shl_N, EA_2BYTE, REG_R11, 7, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_R(INS_imul, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_imul, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_I(INS_imul_15, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R(INS_imulEAX, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_mulEAX, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_div, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_idiv, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_R(INS_tzcnt_evex, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_lzcnt_evex, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_popcnt_evex, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_S(INS_tzcnt_evex, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_lzcnt_evex, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_popcnt_evex, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_R_R(INS_add, EA_2BYTE, REG_R12, REG_R13, REG_R11, + (insOpts)(INS_OPTS_EVEX_nf | INS_OPTS_EVEX_nd)); + + theEmitter->emitIns_R_R_R(INS_andn, EA_8BYTE, REG_R11, REG_R13, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R_R(INS_bextr, EA_8BYTE, REG_R11, REG_R13, REG_R11, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_R(INS_blsi, EA_8BYTE, REG_R11, 
REG_R13, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_blsmsk, EA_8BYTE, REG_R11, REG_R13, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_blsr, EA_8BYTE, REG_R11, 0, 1); } #endif // defined(DEBUG) && defined(TARGET_AMD64) @@ -11314,7 +11485,7 @@ void CodeGen::instGen_MemoryBarrier(BarrierKind barrierKind) if (barrierKind == BARRIER_FULL) { instGen(INS_lock); - GetEmitter()->emitIns_I_AR(INS_or, EA_4BYTE, 0, REG_SPBASE, 0); + GetEmitter()->emitIns_I_AR(INS_or_no_evex, EA_4BYTE, 0, REG_SPBASE, 0); } } diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index a3d854e17eff0..0de153935f77e 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -2299,6 +2299,7 @@ void Compiler::compSetProcessor() if (canUseApxEncoding()) { codeGen->GetEmitter()->SetUseRex2Encoding(true); + codeGen->GetEmitter()->SetUsePromotedEVEXEncoding(true); } } #endif // TARGET_XARCH diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 5160d287cb311..36bca23166a49 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -3942,7 +3942,7 @@ class Compiler // false: we can add new tracked variables. 
// true: We cannot add new 'tracked' variable - bool lvaTrackedFixed = false; + bool lvaTrackedFixed = false; unsigned lvaCount; // total number of locals, which includes function arguments, // special arguments, IL local variables, and JIT temporary variables @@ -6849,7 +6849,7 @@ class Compiler unsigned acdCount = 0; // Get the index to use as part of the AddCodeDsc key for sharing throw blocks - unsigned bbThrowIndex(BasicBlock* blk, AcdKeyDesignator* dsg); + unsigned bbThrowIndex(BasicBlock* blk, AcdKeyDesignator* dsg); struct AddCodeDscKey { @@ -6857,7 +6857,7 @@ class Compiler AddCodeDscKey(): acdKind(SCK_NONE), acdData(0) {} AddCodeDscKey(SpecialCodeKind kind, BasicBlock* block, Compiler* comp); AddCodeDscKey(AddCodeDsc* add); - + static bool Equals(const AddCodeDscKey& x, const AddCodeDscKey& y) { return (x.acdData == y.acdData) && (x.acdKind == y.acdKind); @@ -9992,13 +9992,30 @@ class Compiler // JitStressEvexEncoding- Answer the question: Is Evex stress knob set // // Returns: - // `true` if user requests REX2 encoding. + // `true` if user requests EVEX encoding. // bool JitStressEvexEncoding() const { #ifdef DEBUG return JitConfig.JitStressEvexEncoding() || JitConfig.JitStressRex2Encoding(); #endif // DEBUG + return false; + } + + //------------------------------------------------------------------------ + // DoJitStressPromotedEvexEncoding- Answer the question: Do we force promoted EVEX encoding. + // + // Returns: + // `true` if user requests promoted EVEX encoding. 
+ // + bool DoJitStressPromotedEvexEncoding() const + { +#ifdef DEBUG + if (JitConfig.JitStressPromotedEvexEncoding() && compOpportunisticallyDependsOn(InstructionSet_APX)) + { + return true; + } +#endif // DEBUG return false; } diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index dc0f977b60862..6c0429cd3a0bf 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -471,6 +471,7 @@ class emitter SetUseVEXEncoding(false); SetUseEvexEncoding(false); SetUseRex2Encoding(false); + SetUsePromotedEVEXEncoding(false); #endif // TARGET_XARCH emitDataSecCur = nullptr; @@ -793,8 +794,15 @@ class emitter // For normal and embedded broadcast intrinsics, EVEX.L'L has the same semantic, vector length. // For embedded rounding, EVEX.L'L semantic changes to indicate the rounding mode. // Multiple bits in _idEvexbContext are used to inform emitter to specially handle the EVEX.L'L bits. - unsigned _idEvexbContext : 2; -#endif // TARGET_XARCH + unsigned _idCustom5 : 1; + unsigned _idCustom6 : 1; + +#define _idEvexbContext \ + (_idCustom6 << 1) | _idCustom5 /* Evex.b: embedded broadcast, embedded rounding, embedded SAE \ + */ +#define _idEvexNdContext _idCustom5 /* bits used for the APX-EVEX.nd context for promoted legacy instructions */ +#define _idEvexNfContext _idCustom6 /* bits used for the APX-EVEX.nf context for promoted legacy/vex instructions */ +#endif // TARGET_XARCH #ifdef TARGET_ARM64 unsigned _idLclVar : 1; // access a local on stack @@ -1657,38 +1665,17 @@ class emitter #ifdef TARGET_XARCH bool idIsEvexbContextSet() const { - return _idEvexbContext != 0; + return idGetEvexbContext() != 0; } void idSetEvexbContext(insOpts instOptions) { assert(!idIsEvexbContextSet()); + assert(idGetEvexbContext() == 0); + unsigned value = static_cast(instOptions & INS_OPTS_EVEX_b_MASK); - switch (instOptions & INS_OPTS_EVEX_b_MASK) - { - case INS_OPTS_EVEX_eb_er_rd: - { - _idEvexbContext = 1; - break; - } - - case INS_OPTS_EVEX_er_ru: - { - _idEvexbContext = 2; - 
break; - } - - case INS_OPTS_EVEX_er_rz: - { - _idEvexbContext = 3; - break; - } - - default: - { - unreached(); - } - } + _idCustom5 = ((value >> 0) & 1); + _idCustom6 = ((value >> 1) & 1); } unsigned idGetEvexbContext() const @@ -1728,6 +1715,28 @@ class emitter assert(!idIsEvexZContextSet()); _idEvexZContext = 1; } + + bool idIsEvexNdContextSet() const + { + return _idEvexNdContext != 0; + } + + void idSetEvexNdContext() + { + assert(!idIsEvexNdContextSet()); + _idEvexNdContext = 1; + } + + bool idIsEvexNfContextSet() const + { + return _idEvexNfContext != 0; + } + + void idSetEvexNfContext() + { + assert(!idIsEvexNfContextSet()); + _idEvexNfContext = 1; + } #endif #ifdef TARGET_ARMARCH @@ -2531,7 +2540,12 @@ class emitter CORINFO_FIELD_HANDLE emitSimdMaskConst(simdmask_t constValue); #endif // FEATURE_MASKED_HW_INTRINSICS #endif // FEATURE_SIMD + +#if defined(TARGET_XARCH) + regNumber emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src, regNumber targetReg = REG_NA); +#else regNumber emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src); +#endif regNumber emitInsTernary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src1, GenTree* src2); void emitInsLoadInd(instruction ins, emitAttr attr, regNumber dstReg, GenTreeIndir* mem); void emitInsStoreInd(instruction ins, emitAttr attr, GenTreeStoreInd* mem); diff --git a/src/coreclr/jit/emitfmtsxarch.h b/src/coreclr/jit/emitfmtsxarch.h index f893fce8d07ee..a94a7c1b3e7d5 100644 --- a/src/coreclr/jit/emitfmtsxarch.h +++ b/src/coreclr/jit/emitfmtsxarch.h @@ -140,6 +140,7 @@ IF_DEF(RRW_RRW, IS_R1_RW|IS_R2_RW, NONE) // r/w IF_DEF(RRD_RRD_CNS, IS_R1_RD|IS_R2_RD, SCNS) // read reg1, read reg2, const IF_DEF(RWR_RRD_CNS, IS_R1_WR|IS_R2_RD, SCNS) // write reg1, read reg2, const IF_DEF(RRW_RRD_CNS, IS_R1_RW|IS_R2_RD, SCNS) // r/w reg1, read reg2, const +IF_DEF(RWR_RRD_SHF, IS_R1_WR|IS_R2_RD, SCNS) // write reg1, read reg2, shift IF_DEF(RRD_RRD_RRD, IS_R1_RD|IS_R2_RD|IS_R3_RD, NONE) 
// read reg1, read reg2, read reg3 IF_DEF(RWR_RRD_RRD, IS_R1_WR|IS_R2_RD|IS_R3_RD, NONE) // write reg1, read reg2, read reg3 diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 70f54f021c937..e1b352b39edc5 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -236,6 +236,18 @@ bool emitter::HasRex2Encoding(instruction ins) const return (flags & Encoding_REX2) != 0; } +bool emitter::HasApxNdd(instruction ins) const +{ + insFlags flags = CodeGenInterface::instInfo[ins]; + return (flags & INS_Flags_Has_NDD) != 0; +} + +bool emitter::HasApxNf(instruction ins) const +{ + insFlags flags = CodeGenInterface::instInfo[ins]; + return (flags & INS_Flags_Has_NF) != 0; +} + bool emitter::IsVexEncodableInstruction(instruction ins) const { if (!UseVEXEncoding()) @@ -286,6 +298,8 @@ bool emitter::IsEvexEncodableInstruction(instruction ins) const // bool emitter::IsRex2EncodableInstruction(instruction ins) const { + // TODO-Xarch-apx: we have special stress mode for REX2 on non-compatible machine, that will + // force UseRex2Encoding return true regardless of the CPUID results. if (!UseRex2Encoding()) { return false; @@ -293,6 +307,106 @@ bool emitter::IsRex2EncodableInstruction(instruction ins) const return HasRex2Encoding(ins); } +//------------------------------------------------------------------------ +// IsApxNDDEncodableInstruction: Answer the question- does this instruction have apx ndd form. +// +// Arguments: +// ins - The instruction to check. +// +// Returns: +// `true` if ins has apx ndd form. +// +bool emitter::IsApxNDDEncodableInstruction(instruction ins) const +{ + if (!UsePromotedEVEXEncoding()) + { + return false; + } + + return HasApxNdd(ins); +} + +//------------------------------------------------------------------------ +// IsApxNFEncodableInstruction: Answer the question - does this instruction have Evex.nf supported +// +// Arguments: +// ins - The instruction to check. 
+// +// Returns: +// `true` if ins is Evex.nf supported. +// +bool emitter::IsApxNFEncodableInstruction(instruction ins) const +{ + if (!UsePromotedEVEXEncoding()) + { + return false; + } + + return HasApxNf(ins); +} + +//------------------------------------------------------------------------ +// IsApxExtendedEvexInstruction: Answer the question - does this instruction have apx extended evex form. +// +// Arguments: +// ins - The instruction to check. +// +// Returns: +// `true` if ins has apx extended evex form. +// +bool emitter::IsApxExtendedEvexInstruction(instruction ins) const +{ + if (!UsePromotedEVEXEncoding()) + { + return false; + } + + return HasApxNdd(ins) || HasApxNf(ins); +} + +//------------------------------------------------------------------------ +// IsShiftInstruction: Answer the question- is this instruction a shift instruction. +// +// Arguments: +// ins - The instruction to check. +// +// Returns: +// `true` if ins is a shift instruction. +// +bool emitter::IsShiftInstruction(instruction ins) const +{ + switch (ins) + { + case INS_rcl_1: + case INS_rcr_1: + case INS_rol_1: + case INS_ror_1: + case INS_shl_1: + case INS_shr_1: + case INS_sar_1: + + case INS_rcl: + case INS_rcr: + case INS_rol: + case INS_ror: + case INS_shl: + case INS_shr: + case INS_sar: + + case INS_rcl_N: + case INS_rcr_N: + case INS_rol_N: + case INS_ror_N: + case INS_shl_N: + case INS_shr_N: + case INS_sar_N: + return true; + + default: + return false; + } +} + //------------------------------------------------------------------------ // IsLegacyMap1: Answer the question- Is this instruction on legacy-map-1 // @@ -324,7 +438,7 @@ bool emitter::IsLegacyMap1(code_t code) const if ((code & 0xFF00FF00) == 0x0F000000) { - // 4-byte, need to check if PP is a prefix. 
+ // 4-byte, need to check if PP is prefixs BYTE prefix = (BYTE)((code & 0xFF0000) >> 16); return ((prefix == 0xF2) || (prefix == 0xF3) || (prefix == 0x66)); } @@ -647,6 +761,24 @@ bool emitter::IsRexW1EvexInstruction(instruction ins) return false; } +//------------------------------------------------------------------------ +// DoJitUseApxNDD: Answer the question: does JIT use APX NDD feature on the given instruction? +// +// Arguments: +// ins - instruction to test +// +// Return Value: +// true if JIT allows APX NDD to be applied on the instructions. +// +bool emitter::DoJitUseApxNDD(instruction ins) const +{ +#if !defined(TARGET_AMD64) + return false; +#else + return JitConfig.EnableApxNDD() && IsApxNDDEncodableInstruction(ins); +#endif +} + #ifdef TARGET_64BIT //------------------------------------------------------------------------ // AreUpperBitsZero: check if some previously emitted @@ -1257,6 +1389,179 @@ insOpts emitter::GetEmbRoundingMode(uint8_t mode) const } } +//------------------------------------------------------------------------ +// emitHandleGCrefRegs: Update GC ref related registers' liveness. +// +// Arguments: +// dst - Destination buffer. +// id - instruction descriptor to the GC ref instruction. 
+// +void emitter::emitHandleGCrefRegs(BYTE* dst, instrDesc* id) +{ + regNumber reg1 = id->idReg1(); // dst and src1 + regNumber reg2 = id->idReg2(); // src2 + switch (id->idInsFmt()) + { + case IF_RRD_RRD: + break; + + case IF_RWR_RRD: + { + if (emitSyncThisObjReg != REG_NA && emitIGisInProlog(emitCurIG) && reg2 == (int)REG_ARG_0) + { + // We're relocating "this" in the prolog + assert(emitComp->lvaIsOriginalThisArg(0)); + assert(emitComp->lvaTable[0].lvRegister); + assert(emitComp->lvaTable[0].GetRegNum() == reg1); + + if (emitFullGCinfo) + { + emitGCregLiveSet(id->idGCref(), genRegMask(reg1), dst, true); + break; + } + else + { + /* If emitFullGCinfo==false, the we don't use any + regPtrDsc's and so explicitly note the location + of "this" in GCEncode.cpp + */ + } + } + + emitGCregLiveUpd(id->idGCref(), reg1, dst); + break; + } + + case IF_RRW_RRD: + case IF_RWR_RRD_RRD: + { + regNumber targetReg = reg1; // dst + + // if the instructions is encoded in NDD form, + // src registers will be the 2nd and 3rd register on id. + if (id->idInsFmt() == IF_RWR_RRD_RRD) + { + reg1 = id->idReg2(); // src1 + reg2 = id->idReg3(); // src2 + } + + switch (id->idIns()) + { + /* + This must be one of the following cases: + + xor reg, reg to assign NULL + + and r1 , r2 if (ptr1 && ptr2) ... + or r1 , r2 if (ptr1 || ptr2) ... + + add r1 , r2 to compute a normal byref + sub r1 , r2 to compute a strange byref (VC only) + + */ + case INS_xor: + assert(reg1 == reg2); + emitGCregLiveUpd(id->idGCref(), targetReg, dst); + break; + + case INS_or: + case INS_and: + emitGCregDeadUpd(targetReg, dst); + break; + + case INS_add: + case INS_sub: + case INS_sub_hide: + assert(id->idGCref() == GCT_BYREF); + +#if 0 +#ifdef DEBUG + // Due to elided register moves, we can't have the following assert. 
+ // For example, consider: + // t85 = LCL_VAR byref V01 arg1 rdx (last use) REG rdx + // /--* t85 byref + // * STORE_LCL_VAR byref V40 tmp31 rdx REG rdx + // Here, V01 is type `long` on entry, then is stored as a byref. But because + // the register allocator assigned the same register, no instruction was + // generated, and we only (currently) make gcref/byref changes in emitter GC info + // when an instruction is generated. We still generate correct GC info, as this + // instruction, if writing a GC ref even through reading a long, will go live here. + // These situations typically occur due to unsafe casting, such as with Span. + + regMaskTP regMask; + regMask = genRegMask(reg1) | genRegMask(reg2); + + // r1/r2 could have been a GCREF as GCREF + int=BYREF + // or BYREF+/-int=BYREF + assert(((regMask & emitThisGCrefRegs) && (ins == INS_add)) || + ((regMask & emitThisByrefRegs) && (ins == INS_add || ins == INS_sub || ins == INS_sub_hide))); +#endif // DEBUG +#endif // 0 + + // Mark r1 as holding a byref + emitGCregLiveUpd(GCT_BYREF, targetReg, dst); + break; + + default: +#ifdef DEBUG + emitDispIns(id, false, false, false); +#endif + assert(!"unexpected GC reg update instruction"); + } + + break; + } + + case IF_RRW_RRW: + { + // This must be "xchg reg1, reg2" + assert(id->idIns() == INS_xchg); + + // If we got here, the GC-ness of the registers doesn't match, so we have to "swap" them in the GC + // register pointer mask. 
+ + GCtype gc1, gc2; + + gc1 = emitRegGCtype(reg1); + gc2 = emitRegGCtype(reg2); + + if (gc1 != gc2) + { + // Kill the GC-info about the GC registers + + if (needsGC(gc1)) + { + emitGCregDeadUpd(reg1, dst); + } + + if (needsGC(gc2)) + { + emitGCregDeadUpd(reg2, dst); + } + + // Now, swap the info + + if (needsGC(gc1)) + { + emitGCregLiveUpd(gc1, reg2, dst); + } + + if (needsGC(gc2)) + { + emitGCregLiveUpd(gc2, reg1, dst); + } + } + break; + } + + default: +#ifdef DEBUG + emitDispIns(id, false, false, false); +#endif + assert(!"unexpected GC ref instruction format"); + } +} + //------------------------------------------------------------------------ // encodeRegAsIval: Encodes a register as an ival for use by a SIMD instruction // @@ -1343,9 +1648,23 @@ bool emitter::TakesEvexPrefix(const instrDesc* id) const return true; } + if (id->idIsEvexNfContextSet() && IsBMIInstruction(ins)) + { + // Only a few BMI instructions shall be promoted to APX-EVEX due to NF feature. + // TODO-XArch-APX: convert the check into forms like Has* as above. + return true; + } + #if defined(DEBUG) if (emitComp->DoJitStressEvexEncoding()) { + if (IsBMIInstruction(ins)) + { + // The Encoding_EVEX on some BMI instructions is tagged due to APX, + // they cannot be stressed with JitStressEvexEncoding. + return false; + } + // Requires the EVEX encoding due to STRESS mode and no change in semantics // // Some instructions, like VCMPEQW return the value in a SIMD register for @@ -1354,6 +1673,12 @@ bool emitter::TakesEvexPrefix(const instrDesc* id) const // check above so we need to still return false here to preserve semantics. return !HasKMaskRegisterDest(ins); } + + if (IsApxExtendedEvexInstruction(ins) && emitComp->DoJitStressPromotedEvexEncoding()) + { + // This path will be hit when we stress APX-EVEX and encode VEX with Extended EVEX. 
+ return (IsBMIInstruction(ins) && HasApxNf(ins)); + } #endif // DEBUG if ((ins == INS_pslldq) || (ins == INS_psrldq)) @@ -1408,6 +1733,52 @@ bool emitter::TakesRex2Prefix(const instrDesc* id) const return false; } +//------------------------------------------------------------------------ +// TakesApxExtendedEvexPrefix: Checks if the instruction should be legacy-promoted-EVEX encoded. +// +// Arguments: +// instruction -- processor instruction to check +// +// Return Value: +// true if this instruction requires a legacy-promoted-EVEX prefix. +// +bool emitter::TakesApxExtendedEvexPrefix(const instrDesc* id) const +{ + // TODO-XArch-APX: + // Isolating legacy-promoted-EVEX case out from VEX/EVEX-promoted-EVEX, + // as the latter ones are relatively simple, providing EGPRs functionality, + instruction ins = id->idIns(); + if (!IsApxExtendedEvexInstruction(ins)) + { + return false; + } + + if (IsAvx512OrPriorInstruction(ins)) + { + // This check should reject any instruction not from legacy map-0 or 1. + return false; + } + + if (id->idIsEvexNdContextSet()) + { + return true; + } + + if (id->idIsEvexNfContextSet()) + { + return true; + } + +#if defined(DEBUG) + if (emitComp->DoJitStressPromotedEvexEncoding()) + { + return true; + } +#endif // DEBUG + + return false; +} + // Intel AVX-512 encoding is defined in "Intel 64 and ia-32 architectures software developer's manual volume 2", Section // 2.6. 
 // Add base EVEX prefix without setting W, R, X, or B bits
@@ -1442,6 +1813,10 @@ bool emitter::TakesRex2Prefix(const instrDesc* id) const
 #define LPRIMEBIT_IN_BYTE_EVEX_PREFIX 0x0000004000000000ULL
 #define ZBIT_IN_BYTE_EVEX_PREFIX 0x0000008000000000ULL
 
+#define MAP4_IN_BYTE_EVEX_PREFIX 0x4000000000000ULL
+#define ND_BIT_IN_BYTE_EVEX_PREFIX 0x1000000000ULL
+#define NF_BIT_IN_BYTE_EVEX_PREFIX 0x400000000ULL
+#define EXTENDED_EVEX_PP_BITS 0x10000000000ULL
 //------------------------------------------------------------------------
 // AddEvexPrefix: Add default EVEX prefix with only LL' bits set.
 //
@@ -1451,12 +1826,23 @@ bool emitter::TakesRex2Prefix(const instrDesc* id) const
 //    attr -- operand size
 //
 // Return Value:
-//    encoded code with Evex prefix.
+//    encoded code with EVEX prefix.
 //
 emitter::code_t emitter::AddEvexPrefix(const instrDesc* id, code_t code, emitAttr attr)
 {
     // Only AVX512 instructions require EVEX prefix
-    assert(IsEvexEncodableInstruction(id->idIns()));
+    // After APX, some instructions in legacy or VEX space will be promoted to EVEX.
+    instruction ins = id->idIns();
+    assert(IsEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins));
+
+    if (instrIsExtendedReg3opImul(ins))
+    {
+        // the only case imul(0x68) will need EVEX prefix is when the EVEX.NF feature is enabled.
+        // the imul(0x68) opcode comes with a ModR/M.REG byte to indicate implicit register use;
+        // when it is using extended registers (>= REG_R8), it comes with a built-in REX prefix,
+        // so remove that first and add the counterpart in EVEX.
+        code &= 0xFFFFFFFF;
+    }
 
     // Shouldn't have already added EVEX prefix
     assert(!hasEvexPrefix(code));
@@ -1465,6 +1851,48 @@ emitter::code_t emitter::AddEvexPrefix(const instrDesc* id, code_t code, emitAtt
 
     code |= DEFAULT_BYTE_EVEX_PREFIX;
 
+    if (IsApxExtendedEvexInstruction(ins))
+    {
+        if (!HasEvexEncoding(ins))
+        {
+            // Legacy-promoted instructions are not labeled with Encoding_EVEX.
+ code |= MAP4_IN_BYTE_EVEX_PREFIX; + } + + // TODO-XArch-APX: + // verify if it is actually safe to reuse the EVEX.ND with EVEX.B on instrDesc. + if (id->idIsEvexNdContextSet()) + { + code |= ND_BIT_IN_BYTE_EVEX_PREFIX; + } + + if (id->idIsEvexNfContextSet()) + { + code |= NF_BIT_IN_BYTE_EVEX_PREFIX; + } + + if (attr == EA_2BYTE) + { + code |= EXTENDED_EVEX_PP_BITS; + } + + if (instrIsExtendedReg3opImul(ins)) + { + // EVEX.R3 + // TODO-XArch-APX: + // A few side notes: based on how JIT defined IMUL, we may need to extend + // the definition to `IMUL_31` to cover EGPRs. And it can be defined in a + // similar way that opcodes comes with built-in REX2 prefix, and convert + // it to EVEX when needed with some helper functions. + code &= 0xFF7FFFFFFFFFFFFFULL; + } + + return code; + } + + // No APX-NDD instructions should reach code below. + assert(!IsApxExtendedEvexInstruction(ins)); + if (attr == EA_32BYTE) { // Set EVEX.L'L bits to 01 in case of instructions that operate on 256-bits. @@ -2008,6 +2436,14 @@ emitter::code_t emitter::AddRexWPrefix(const instrDesc* id, code_t code) } } #ifdef TARGET_AMD64 + else if (TakesApxExtendedEvexPrefix(id)) + { + // If the instruction is not VEX/EVEX encodable, and has EVEX prefix, + // then it is legacy promoted EVEX. + assert(hasEvexPrefix(code)); + assert(IsApxExtendedEvexInstruction(ins)); + return emitter::code_t(code | 0x0000800000000000ULL); + } else if (hasRex2Prefix(code)) { return emitter::code_t(code | 0x000800000000ULL); @@ -2046,13 +2482,18 @@ emitter::code_t emitter::AddRexRPrefix(const instrDesc* id, code_t code) return code & 0xFF7FFFFFFFFFFFULL; } } -#ifdef TARGET_AMD64 + else if (TakesApxExtendedEvexPrefix(id)) + { + assert(hasEvexPrefix(code)); + assert(IsApxExtendedEvexInstruction(ins)); + // R-bit is added in bit-inverted form. 
+ return code & 0xFF7FFFFFFFFFFFFFULL; + } else if (TakesRex2Prefix(id)) { assert(IsRex2EncodableInstruction(ins)); return code |= 0xD50400000000ULL; // REX2.B3 } -#endif // TARGET_AMD64 return code | 0x4400000000ULL; } @@ -2082,13 +2523,18 @@ emitter::code_t emitter::AddRexXPrefix(const instrDesc* id, code_t code) return code & 0xFFBFFFFFFFFFFFULL; } } -#ifdef TARGET_AMD64 + else if (TakesApxExtendedEvexPrefix(id)) + { + assert(hasEvexPrefix(code)); + assert(IsApxExtendedEvexInstruction(ins)); + // X-bit is added in bit-inverted form. + return code & 0xFFBFFFFFFFFFFFFFULL; + } else if (TakesRex2Prefix(id)) { assert(IsRex2EncodableInstruction(ins)); return code |= 0xD50200000000ULL; // REX2.B3 } -#endif // TARGET_AMD64 return code | 0x4200000000ULL; } @@ -2118,13 +2564,17 @@ emitter::code_t emitter::AddRexBPrefix(const instrDesc* id, code_t code) return code & 0xFFDFFFFFFFFFFFULL; } } -#ifdef TARGET_AMD64 + else if (TakesApxExtendedEvexPrefix(id)) + { + assert(IsApxExtendedEvexInstruction(ins)); + // R-bit is added in bit-inverted form. + return code & 0xFFDFFFFFFFFFFFFFULL; + } else if (TakesRex2Prefix(id)) { assert(IsRex2EncodableInstruction(ins)); return code |= 0xD50100000000ULL; // REX2.B3 } -#endif // TARGET_AMD64 return code | 0x4100000000ULL; } @@ -2207,7 +2657,7 @@ bool isPrefix(BYTE b) // emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) const { - assert(IsEvexEncodableInstruction(ins)); + assert(IsEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)); code_t evexPrefix = (code >> 32) & 0xFFFFFFFF; code &= 0x00000000FFFFFFFFLL; @@ -2233,6 +2683,14 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co case 0x66: { // None of the existing BMI instructions should be EVEX encoded. + // After APX, BMI instructions can be EVEX encoded with NF feature. + if (IsBMIInstruction(ins)) + { + // if BMI instructions reaches this part, then it should be APX-EVEX. 
+ // although the opcode of all the BMI instructions are defined with 0x66, + // but it should not, skip this check. + break; + } assert(!IsBMIInstruction(ins)); evexPrefix |= (0x01 << 8); break; @@ -2298,6 +2756,12 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co case 0x0F: { + if (((evexPrefix >> 16) & 0x07) == 0x04) + { + // MAP index equal to 4 indicates this instruction is a promoted legacy instruction. + // the MAP ID has been set when EVEX prefix is added. + break; + } evexPrefix |= (0x01 << 16); break; } @@ -2758,6 +3222,11 @@ unsigned emitter::emitGetRexPrefixSize(instrDesc* id, instruction ins) return 0; } + if (TakesApxExtendedEvexPrefix(id)) + { + return 0; + } + if (TakesRex2Prefix(id)) { return 0; @@ -2868,10 +3337,20 @@ unsigned emitter::emitGetAdjustedSize(instrDesc* id, code_t code) const adjustedSize++; } #ifdef TARGET_AMD64 - else if (IsRex2EncodableInstruction(ins)) + else if (IsRex2EncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)) { unsigned prefixAdjustedSize = 0; - if (TakesRex2Prefix(id)) + if (TakesApxExtendedEvexPrefix(id)) + { + prefixAdjustedSize = 4; + // If the opcode will be prefixed by EVEX, then all the map-1-legacy instructions can remove the escape + // prefix + if (IsLegacyMap1(code)) + { + prefixAdjustedSize -= 1; + } + } + else if (TakesRex2Prefix(id)) { prefixAdjustedSize = 2; // If the opcode will be prefixed by REX2, then all the map-1-legacy instructions can remove the escape @@ -2882,15 +3361,14 @@ unsigned emitter::emitGetAdjustedSize(instrDesc* id, code_t code) const } } - adjustedSize = prefixAdjustedSize; - emitAttr attr = id->idOpSize(); - - if ((attr == EA_2BYTE) && (ins != INS_movzx) && (ins != INS_movsx)) + if ((attr == EA_2BYTE) && (ins != INS_movzx) && (ins != INS_movsx) && !TakesApxExtendedEvexPrefix(id)) { // Most 16-bit operand instructions will need a 0x66 prefix. 
- adjustedSize++; + prefixAdjustedSize++; } + + adjustedSize = prefixAdjustedSize; } #endif // TARGET_AMD64 else @@ -2942,6 +3420,14 @@ unsigned emitter::emitGetPrefixSize(instrDesc* id, code_t code, bool includeRexP if (includeRexPrefixSize && hasRexPrefix(code)) { + if (instrIsExtendedReg3opImul(id->idIns()) && TakesApxExtendedEvexPrefix(id)) + { + // there is a special case when calculating the size of IMUL with APX-EVEX, + // IMUL_08 or beyond will have a built-in REX prefix with its opcode, + // so it will hit this branch, but when IMUL is encoded with APX-EVEX, + // the size of REX is included in the prefix size, where should be calculated outside. + return 0; + } return 1; } @@ -3583,7 +4069,7 @@ inline unsigned emitter::insEncodeReg012(const instrDesc* id, regNumber reg, emi { // We are assuming that we only use/encode SPL, BPL, SIL and DIL // not the corresponding AH, CH, DH, or BH - *code = hasRex2Prefix(*code) ? *code : AddRexPrefix(ins, *code); // REX + *code = (hasRex2Prefix(*code) || hasEvexPrefix(*code)) ? *code : AddRexPrefix(ins, *code); // REX } #endif // TARGET_AMD64 @@ -3623,7 +4109,7 @@ inline unsigned emitter::insEncodeReg345(const instrDesc* id, regNumber reg, emi } if (false /*reg >= REG_R16 && reg <= REG_R31*/) { - // seperate the encoding for REX2.R3/R4, REX2.R3 will be handled in `AddRexRPrefix`. + // Seperate the encoding for REX2.R3/R4, REX2.R3 will be handled in `AddRexRPrefix`. assert(TakesRex2Prefix(id)); *code |= 0x004000000000ULL; // REX2.R4 } @@ -3632,7 +4118,7 @@ inline unsigned emitter::insEncodeReg345(const instrDesc* id, regNumber reg, emi { // We are assuming that we only use/encode SPL, BPL, SIL and DIL // not the corresponding AH, CH, DH, or BH - *code = hasRex2Prefix(*code) ? *code : AddRexPrefix(ins, *code); // REX + *code = (hasRex2Prefix(*code) || hasEvexPrefix(*code)) ? 
*code : AddRexPrefix(ins, *code); // REX } #endif // TARGET_AMD64 @@ -3652,7 +4138,7 @@ inline emitter::code_t emitter::insEncodeReg3456(const instrDesc* id, regNumber instruction ins = id->idIns(); assert(reg < REG_STK); - assert(IsVexOrEvexEncodableInstruction(ins)); + assert(IsVexOrEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)); assert(hasVexOrEvexPrefix(code)); // Get 4-bit register encoding @@ -3699,6 +4185,25 @@ inline emitter::code_t emitter::insEncodeReg3456(const instrDesc* id, regNumber return code ^ regBits; } } + else + { + assert(TakesApxExtendedEvexPrefix(id)); + assert(hasEvexPrefix(code)); +#if defined(TARGET_AMD64) + // TODO-XARCH-AVX512 I don't like that we redefine regBits on the EVEX case. + // Rather see these paths cleaned up. + regBits = HighAwareRegEncoding(reg); + + if (false /*reg >= REG_R16 && reg <= REG_R31*/) + { + // Have to set the EVEX V' bit + code = AddEvexVPrimePrefix(code); + } +#endif + // Shift count = 5-bytes of opcode + 0-2 bits for EVEX + regBits <<= 43; + return code ^ regBits; + } return code ^ regBits; } @@ -3734,7 +4239,7 @@ inline unsigned emitter::insEncodeRegSIB(const instrDesc* id, regNumber reg, cod } if (false /*reg >= REG_R16 && reg <= REG_R31*/) { - // seperate the encoding for REX2.X3/X4, REX2.X3 will be handled in `AddRexXPrefix`. + // Separate the encoding for REX2.X3/X4, REX2.X3 will be handled in `AddRexXPrefix`. assert(TakesRex2Prefix(id)); *code |= 0x002000000000ULL; // REX2.X4 } @@ -4130,7 +4635,9 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id) if ((code & 0xFF00) != 0) { - sz += IsAvx512OrPriorInstruction(ins) ? emitInsSize(id, code, includeRexPrefixSize) : 5; + sz += (IsAvx512OrPriorInstruction(ins) || TakesApxExtendedEvexPrefix(id)) + ? 
emitInsSize(id, code, includeRexPrefixSize) + : 5; } else { @@ -4258,7 +4765,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, assert(emitComp->lvaTempsHaveLargerOffsetThanVars()); // Check whether we can use compressed displacement if EVEX. - if (TakesEvexPrefix(id)) + if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) { bool compressedFitsInByte = false; TryEvexCompressDisp8Byte(id, ssize_t(offs), &compressedFitsInByte); @@ -4302,7 +4809,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, #endif // !FEATURE_FIXED_OUT_ARGS bool useSmallEncoding = false; - if (TakesEvexPrefix(id)) + if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) { TryEvexCompressDisp8Byte(id, ssize_t(offs), &useSmallEncoding); } @@ -4469,7 +4976,7 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code) } else { - if (TakesEvexPrefix(id)) + if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) { dsp = TryEvexCompressDisp8Byte(id, dsp, &dspInByte); } @@ -5414,17 +5921,37 @@ void emitter::emitInsStoreLcl(instruction ins, emitAttr attr, GenTreeLclVarCommo // attr - the instruction operand size // dst - the destination and first source operand // src - the second source operand +// targetReg - target register of this binary node (only used for APX-NDD form) // // Assumptions: // i) caller of this routine needs to call genConsumeReg() // ii) caller of this routine needs to call genProduceReg() -regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src) +regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src, regNumber targetReg) { // We can only have one memory operand and only src can be a constant operand // However, the handling for a given operand type (mem, cns, or other) is fairly // consistent regardless of whether they are src or dst. 
As such, we will find // the type of each operand and only check them against src/dst where relevant. + const bool useNDD = UsePromotedEVEXEncoding() && (targetReg != REG_NA); +#if !defined(TARGET_AMD64) + // APX does not support 32-bit system. + assert(!useNDD); +#else + if (useNDD) + { + assert(IsApxNDDEncodableInstruction(ins)); + // targetReg has to be an actual register if using NDD. + assert(targetReg < REG_STK); + // make sure target register is not either of the src registers. + assert(dst->isUsedFromReg()); + regNumber dstreg = dst->GetRegNum(); + regNumber srcreg = src->isUsedFromReg() ? src->GetRegNum() : REG_NA; + assert(targetReg != dstreg); + assert(targetReg != srcreg); + } +#endif + GenTree* memOp = nullptr; GenTree* cnsOp = nullptr; GenTree* otherOp = nullptr; @@ -5436,6 +5963,9 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G assert(dst->isUsedFromMemory() || (dst->GetRegNum() == REG_NA) || instrIs3opImul(ins)); assert(!src->isUsedFromMemory()); + // APX code cannot hit this path. + assert(!useNDD); + memOp = dst; if (src->isContained()) @@ -5543,6 +6073,9 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G assert(otherOp == nullptr); assert(src->IsCnsIntOrI()); + // APX code cannot hit this path. + assert(!useNDD); + id = emitNewInstrAmdCns(attr, memIndir->Offset(), (int)src->AsIntConCommon()->IconValue()); } else @@ -5560,6 +6093,13 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G assert(id != nullptr); id->idIns(ins); // Set the instruction. + if (useNDD) + { + assert(memOp == src); + id->idReg1(targetReg); + id->idReg2(dst->GetRegNum()); + id->idSetEvexNdContext(); + } // Determine the instruction format insFormat fmt = IF_NONE; @@ -5575,12 +6115,13 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G } else { - fmt = emitInsModeFormat(ins, IF_RRD_ARD); + fmt = useNDD ? 
emitInsModeFormat(ins, IF_RWR_RRD_ARD) : emitInsModeFormat(ins, IF_RRD_ARD); } } else { assert(memOp == dst); + assert(!useNDD); if (cnsOp != nullptr) { @@ -5619,6 +6160,7 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G else { assert(memOp == dst); + assert(!useNDD); if (cnsOp != nullptr) { @@ -5641,7 +6183,7 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G dispIns(id); emitCurIGsize += sz; - return (memOp == src) ? dst->GetRegNum() : REG_NA; + return (memOp == src) ? (useNDD ? targetReg : dst->GetRegNum()) : REG_NA; } } } @@ -5689,15 +6231,24 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G } else { - // src is a stack based local variable - // dst is a register - emitIns_R_S(ins, attr, dst->GetRegNum(), varNum, offset); + if (useNDD) + { + emitIns_R_R_S(ins, attr, targetReg, dst->GetRegNum(), varNum, offset, INS_OPTS_EVEX_nd); + return targetReg; + } + else + { + // src is a stack based local variable + // dst is a register + emitIns_R_S(ins, attr, dst->GetRegNum(), varNum, offset); + } } } else { assert(memOp == dst); assert((dst->GetRegNum() == REG_NA) || dst->IsRegOptional()); + assert(!useNDD); if (cnsOp != nullptr) { @@ -5729,10 +6280,20 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G { assert(!dst->isContained()); GenTreeIntConCommon* intCns = src->AsIntConCommon(); - emitIns_R_I(ins, attr, dst->GetRegNum(), intCns->IconValue()); + + if (useNDD) + { + emitIns_R_R_I(ins, attr, targetReg, dst->GetRegNum(), (int)intCns->IconValue(), INS_OPTS_EVEX_nd); + return targetReg; + } + else + { + emitIns_R_I(ins, attr, dst->GetRegNum(), intCns->IconValue()); + } } else { + assert(!useNDD); assert(src->IsCnsFltOrDbl()); GenTreeDblCon* dblCns = src->AsDblCon(); @@ -5751,7 +6312,15 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G } else { - emitIns_R_R(ins, attr, dst->GetRegNum(), 
src->GetRegNum()); + if (useNDD) + { + emitIns_R_R_R(ins, attr, targetReg, dst->GetRegNum(), src->GetRegNum(), INS_OPTS_EVEX_nd); + return targetReg; + } + else + { + emitIns_R_R(ins, attr, dst->GetRegNum(), src->GetRegNum()); + } } } @@ -5902,7 +6471,7 @@ void emitter::emitInsRMW(instruction ins, emitAttr attr, GenTreeStoreInd* storeI * Add an instruction referencing a single register. */ -void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg) +void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg, insOpts instOptions /* = INS_OPTS_NONE */) { emitAttr size = EA_SIZE(attr); @@ -5978,6 +6547,8 @@ void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg) id->idInsFmt(fmt); id->idReg1(reg); + SetEvexNfIfNeeded(id, instOptions); + // Vex bytes sz += emitGetAdjustedSize(id, insEncodeMRreg(id, reg, attr, insCodeMR(ins))); @@ -6050,10 +6621,11 @@ void emitter::emitStoreSimd12ToLclOffset(unsigned varNum, unsigned offset, regNu * Add an instruction referencing a register and a constant. */ -void emitter::emitIns_R_I(instruction ins, - emitAttr attr, - regNumber reg, - ssize_t val DEBUGARG(size_t targetHandle) DEBUGARG(GenTreeFlags gtFlags)) +void emitter::emitIns_R_I(instruction ins, + emitAttr attr, + regNumber reg, + ssize_t val, + insOpts instOptions DEBUGARG(size_t targetHandle) DEBUGARG(GenTreeFlags gtFlags)) { emitAttr size = EA_SIZE(attr); @@ -6193,6 +6765,8 @@ void emitter::emitIns_R_I(instruction ins, id->idDebugOnlyInfo()->idMemCookie = targetHandle; #endif + SetEvexNfIfNeeded(id, instOptions); + if (isSimdInsAndValInByte) { bool includeRexPrefixSize = true; @@ -6206,8 +6780,14 @@ void emitter::emitIns_R_I(instruction ins, sz += emitInsSize(id, insCodeMI(ins), includeRexPrefixSize); } - sz += emitGetAdjustedSize(id, insCodeMI(ins)); +#ifdef TARGET_AMD64 + if (reg == REG_EAX && !instrIs3opImul(ins) && TakesApxExtendedEvexPrefix(id)) + { + // ACC form is not promoted into EVEX space, need to emit with MI form. 
+
+        sz += 1;
+    }
+#endif // TARGET_AMD64
 
 // Do we need a REX prefix for AMD64? We need one if we are using any extended register (REX.R), or if we have a
 // 64-bit sized operand (REX.W). Note that IMUL in our encoding is special, with a "built-in", implicit, target
@@ -6981,6 +7561,14 @@ void emitter::emitIns_R_R(instruction ins, emitAttr attr, regNumber reg1, regNum
     id->idReg1(reg1);
     id->idReg2(reg2);
 
+    SetEvexNdIfNeeded(id, instOptions);
+    SetEvexNfIfNeeded(id, instOptions);
+
+    if (id->idIsEvexNdContextSet() && IsApxNDDEncodableInstruction(ins))
+    {
+        id->idInsFmt(IF_RWR_RRD);
+    }
+
     if ((instOptions & INS_OPTS_EVEX_b_MASK) != INS_OPTS_NONE)
     {
         // if EVEX.b needs to be set in this path, then it should be embedded rounding.
@@ -7034,6 +7622,30 @@
     assert((instOptions & INS_OPTS_EVEX_b_MASK) == 0);
     SetEvexEmbMaskIfNeeded(id, instOptions);
+    SetEvexNdIfNeeded(id, instOptions);
+
+    if (id->idIsEvexNdContextSet() && IsApxNDDEncodableInstruction(ins))
+    {
+        // need to fix the instruction opcode for legacy instructions, as they have a different opcode for the RI form.
+        code = insCodeMI(ins);
+        // need to fix the instruction format for NDD legacy instructions.
+ insFormat fmt; + switch (ins) + { + case INS_shl_N: + case INS_shr_N: + case INS_sar_N: + case INS_ror_N: + case INS_rol_N: + fmt = IF_RWR_RRD_SHF; + break; + + default: + fmt = IF_RWR_RRD_CNS; + break; + } + id->idInsFmt(fmt); + } UNATIVE_OFFSET sz = emitInsSizeRR(id, code, ival); id->idCodeSize(sz); @@ -7045,7 +7657,7 @@ void emitter::emitIns_R_R_I( void emitter::emitIns_AR(instruction ins, emitAttr attr, regNumber base, int offs) { assert(ins == INS_prefetcht0 || ins == INS_prefetcht1 || ins == INS_prefetcht2 || ins == INS_prefetchnta || - ins == INS_inc || ins == INS_dec); + ins == INS_inc || ins == INS_dec || ins == INS_inc_no_evex || ins == INS_dec_no_evex); instrDesc* id = emitNewInstrAmd(attr, offs); @@ -7398,8 +8010,8 @@ void emitter::emitIns_R_R_C(instruction ins, void emitter::emitIns_R_R_R( instruction ins, emitAttr attr, regNumber targetReg, regNumber reg1, regNumber reg2, insOpts instOptions) { - assert(IsAvx512OrPriorInstruction(ins)); - assert(IsThreeOperandAVXInstruction(ins) || IsKInstruction(ins)); + assert(IsAvx512OrPriorInstruction(ins) || IsApxExtendedEvexInstruction(ins)); + assert(IsThreeOperandAVXInstruction(ins) || IsKInstruction(ins) || IsApxExtendedEvexInstruction(ins)); instrDesc* id = emitNewInstr(attr); id->idIns(ins); @@ -7415,6 +8027,14 @@ void emitter::emitIns_R_R_R( id->idSetEvexbContext(instOptions); } SetEvexEmbMaskIfNeeded(id, instOptions); + SetEvexNdIfNeeded(id, instOptions); + SetEvexNfIfNeeded(id, instOptions); + + if (id->idIsEvexNdContextSet() && IsApxNDDEncodableInstruction(ins)) + { + // need to fix the instructions format for NDD legacy instructions. 
+        id->idInsFmt(IF_RWR_RRD_RRD);
+    }
 
     UNATIVE_OFFSET sz = emitInsSizeRR(id, insCodeRM(ins));
     id->idCodeSize(sz);
@@ -7426,8 +8046,8 @@ void emitter::emitIns_R_R_R(
 void emitter::emitIns_R_R_S(
     instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs, insOpts instOptions)
 {
-    assert(IsAvx512OrPriorInstruction(ins));
-    assert(IsThreeOperandAVXInstruction(ins));
+    assert(IsAvx512OrPriorInstruction(ins) || IsApxExtendedEvexInstruction(ins));
+    assert(IsThreeOperandAVXInstruction(ins) || IsApxExtendedEvexInstruction(ins));
 
     instrDesc* id = emitNewInstr(attr);
 
@@ -7439,6 +8059,12 @@ void emitter::emitIns_R_R_S(
     SetEvexBroadcastIfNeeded(id, instOptions);
     SetEvexEmbMaskIfNeeded(id, instOptions);
+    SetEvexNdIfNeeded(id, instOptions);
+
+    if (id->idIsEvexNdContextSet() && IsApxNDDEncodableInstruction(ins))
+    {
+        id->idInsFmt(IF_RWR_RRD_SRD);
+    }
 
 #ifdef DEBUG
     id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs;
@@ -9689,6 +10315,7 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber ireg, int va
 
     SetEvexBroadcastIfNeeded(id, instOptions);
     SetEvexEmbMaskIfNeeded(id, instOptions);
+    SetEvexNfIfNeeded(id, instOptions);
 
     UNATIVE_OFFSET sz = emitInsSizeSV(id, insCodeRM(ins), varx, offs);
     id->idCodeSize(sz);
@@ -11393,6 +12020,13 @@ void emitter::emitDispEmbRounding(instrDesc* id) const
     {
         return;
     }
+
+    if (IsApxExtendedEvexInstruction(id->idIns()))
+    {
+        // APX-EVEX.nd shares the same bit(s) with EVEX.b;
+        // for the NDD case, we don't need to display anything special.
+        return;
+    }
     assert(!id->idHasMem());
     unsigned roundingMode = id->idGetEvexbContext();
     if (roundingMode == 1)
@@ -11573,6 +12207,14 @@ void emitter::emitDispIns(
 
     /* Display the instruction name */
 
+#ifdef TARGET_AMD64
+    if (IsApxNFEncodableInstruction(id->idIns()) && id->idIsEvexNfContextSet())
+    {
+        // print the EVEX.NF indication in pseudo-prefix style.
+ printf("{nf} "); + } +#endif // TARGET_AMD64 + sstr = codeGen->genInsDisplayName(id); printf(" %-9s", sstr); @@ -12323,6 +12965,20 @@ void emitter::emitDispIns( break; } + case INS_rol: + case INS_ror: + case INS_rcl: + case INS_rcr: + case INS_shl: + case INS_shr: + case INS_sar: + { + printf("%s", emitRegName(id->idReg1(), attr)); + printf(", %s", emitRegName(id->idReg2(), attr)); + emitDispShift(ins, (BYTE)0); + break; + } + default: { printf("%s", emitRegName(id->idReg1(), attr)); @@ -12340,8 +12996,8 @@ void emitter::emitDispIns( case IF_RRW_RRD_RRD: case IF_RWR_RWR_RRD: { - assert(IsVexOrEvexEncodableInstruction(ins)); - assert(IsThreeOperandAVXInstruction(ins) || IsKInstruction(ins)); + assert(IsVexOrEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)); + assert(IsThreeOperandAVXInstruction(ins) || IsKInstruction(ins) || IsApxExtendedEvexInstruction(ins)); regNumber reg2 = id->idReg2(); regNumber reg3 = id->idReg3(); @@ -12568,6 +13224,19 @@ void emitter::emitDispIns( break; } + case IF_RWR_RRD_SHF: + { + assert(IsApxExtendedEvexInstruction(id->idIns())); + printf("%s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), attr)); + + emitGetInsCns(id, &cnsVal); + val = cnsVal.cnsVal; + + emitDispShift(ins, (BYTE)val); + + break; + } + case IF_RRD_MRD: case IF_RWR_MRD: case IF_RRW_MRD: @@ -13516,12 +14185,21 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) break; case EA_2BYTE: - - /* Output a size prefix for a 16-bit operand */ - - dst += emitOutputByte(dst, 0x66); - + { + // Output a size prefix for a 16-bit operand + if (TakesApxExtendedEvexPrefix(id)) + { + assert(IsApxExtendedEvexInstruction(ins)); + assert(hasEvexPrefix(code)); + // Evex.pp should already be added when adding the prefix. 
+ assert((code & EXTENDED_EVEX_PP_BITS) != 0); + } + else + { + dst += emitOutputByte(dst, 0x66); + } FALLTHROUGH; + } case EA_4BYTE: #ifdef TARGET_AMD64 @@ -13565,7 +14243,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } else { - if (TakesEvexPrefix(id)) + if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) { dsp = TryEvexCompressDisp8Byte(id, dsp, &dspInByte); } @@ -14103,6 +14781,14 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) emitGCregLiveUpd(id->idGCref(), id->idReg1(), dst); break; + case IF_RWR_RRD_ARD: + assert(((id->idGCref() == GCT_BYREF) && + (ins == INS_add || ins == INS_sub || ins == INS_sub_hide || insIsCMOV(ins))) || + ((id->idGCref() == GCT_GCREF) && insIsCMOV(ins))); + assert(id->idIsEvexNdContextSet()); + emitGCregLiveUpd(id->idGCref(), id->idReg1(), dst); + break; + case IF_ARD_RRD: case IF_AWR_RRD: break; @@ -14349,25 +15035,45 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) switch (size) { case EA_1BYTE: +#ifdef TARGET_AMD64 + assert((ins != INS_lzcnt_evex) && (ins != INS_tzcnt_evex) && (ins != INS_popcnt_evex)); +#endif // TARGET_AMD64 break; case EA_2BYTE: // Output a size prefix for a 16-bit operand - dst += emitOutputByte(dst, 0x66); + { + if (!TakesApxExtendedEvexPrefix(id)) + { + dst += emitOutputByte(dst, 0x66); + } + } FALLTHROUGH; case EA_4BYTE: + code |= 0x01; + break; + #ifdef TARGET_AMD64 case EA_8BYTE: -#endif // TARGET_AMD64 - /* Set the 'w' size bit to indicate 32-bit operation * Note that incrementing "code" for INS_call (0xFF) would * overflow, whereas setting the lower bit to 1 just works out */ - - code |= 0x01; - break; + { + if (TakesApxExtendedEvexPrefix(id)) + { + assert(hasEvexPrefix(code)); + code = AddRexWPrefix(id, code); + } + if ((ins != INS_lzcnt_evex) && (ins != INS_tzcnt_evex) && (ins != INS_popcnt_evex)) + // These instructions do not support 1-byte inputs and the opcode is exact. 
+ { + code |= 0x01; + } + break; + } +#endif // TARGET_AMD64 #ifdef TARGET_X86 case EA_8BYTE: @@ -14401,7 +15107,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // function, to which the remainder of the emitter logic should handle properly. // TODO-XARCH-AVX512 : embedded broadcast might change this int dspAsByte = dsp; - if (TakesEvexPrefix(id)) + if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) { dspAsByte = int(TryEvexCompressDisp8Byte(id, ssize_t(dsp), &dspInByte)); } @@ -14455,7 +15161,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // TODO-XARCH-AVX512 : working to wrap up all adjusted disp8 compression logic into the following // function, to which the remainder of the emitter logic should handle properly. // TODO-XARCH-AVX512 : embedded broadcast might change this - if (TakesEvexPrefix(id)) + if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) { dspAsByte = int(TryEvexCompressDisp8Byte(id, ssize_t(dsp), &dspInByte)); } @@ -14602,6 +15308,15 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) emitGCregLiveUpd(id->idGCref(), id->idReg1(), dst); break; + case IF_RWR_RRD_SRD: // Register Read/Write, Stack Read (So we need to update GC live for register) + + // reg could have been a GCREF as GCREF + int=BYREF + // or BYREF+/-int=BYREF + assert(id->idGCref() == GCT_BYREF && (ins == INS_add || ins == INS_sub || ins == INS_sub_hide)); + assert(id->idIsEvexNdContextSet()); + emitGCregLiveUpd(id->idGCref(), id->idReg1(), dst); + break; + case IF_SRW_CNS: case IF_SRW_RRD: case IF_SRW_RRW: @@ -15187,7 +15902,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) // Can't use the compact form, use the long form ins = (instruction)(ins + 1); - if (size == EA_2BYTE) + if (size == EA_2BYTE && !TakesApxExtendedEvexPrefix(id)) { // Output a size prefix for a 16-bit operand dst += emitOutputByte(dst, 0x66); @@ -15200,10 +15915,7 @@ BYTE* 
emitter::emitOutputR(BYTE* dst, instrDesc* id) code |= 0x1; } - if (TakesRex2Prefix(id)) - { - code = AddRex2Prefix(ins, code); - } + code = AddX86PrefixIfNeeded(id, code, size); if (TakesRexWPrefix(id)) { @@ -15338,23 +16050,22 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) default: assert(id->idGCref() == GCT_NONE); - - code = insEncodeMRreg(id, reg, size, insCodeMR(ins)); + code = insCodeMR(ins); + code = AddX86PrefixIfNeeded(id, code, size); + code = insEncodeMRreg(id, reg, size, code); if (size != EA_1BYTE) { // Set the 'w' bit to get the large version code |= 0x1; - if (size == EA_2BYTE) + if (size == EA_2BYTE && !TakesApxExtendedEvexPrefix(id)) { // Output a size prefix for a 16-bit operand dst += emitOutputByte(dst, 0x66); } } - code = AddX86PrefixIfNeeded(id, code, size); - if (TakesRexWPrefix(id)) { code = AddRexWPrefix(id, code); @@ -15491,7 +16202,11 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) } #ifdef FEATURE_HW_INTRINSICS else if ((ins == INS_bsf) || (ins == INS_bsr) || (ins == INS_crc32) || (ins == INS_lzcnt) || (ins == INS_popcnt) || - (ins == INS_tzcnt)) + (ins == INS_tzcnt) +#ifdef TARGET_AMD64 + || (ins == INS_lzcnt_evex) || (ins == INS_tzcnt_evex) || (ins == INS_popcnt_evex) +#endif // TARGET_AMD64 + ) { assert(hasCodeRM(ins) && !hasCodeMI(ins) && !hasCodeMR(ins)); code = insCodeRM(ins); @@ -15502,7 +16217,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) code |= 0x0100; } - if (size == EA_2BYTE) + if (size == EA_2BYTE && !TakesApxExtendedEvexPrefix(id)) { assert(ins == INS_crc32); dst += emitOutputByte(dst, 0x66); @@ -15515,15 +16230,21 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) #endif // FEATURE_HW_INTRINSICS else { + // TODO-XArch-APX: + // some instructions with NDD form might go into this path with EVEX prefix. 
+ // might consider having a separate path with checks like: TakesApxExtendedEvexPrefix + // essentially, we need to make it clear on the priority and necessity of REX2 and EVEX: + // REX2 is needed iff EGPRs are involved. + // EVEX is needed when NDD, NF or other features are involved. + // So the logic should be: + // checking if those new features are used, then check if EGPRs are involved. + // EGPRs will be supported by EVEX anyway, so don't need to check in the first place. assert(!TakesSimdPrefix(id)); code = insCodeMR(ins); - if (TakesRex2Prefix(id)) - { - code = AddRex2Prefix(ins, code); - } + code = AddX86PrefixIfNeeded(id, code, size); code = insEncodeMRreg(id, code); - if (ins != INS_test) + if (ins != INS_test && !IsShiftInstruction(ins)) { code |= 2; } @@ -15537,7 +16258,17 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) case EA_2BYTE: // Output a size prefix for a 16-bit operand - dst += emitOutputByte(dst, 0x66); + if (TakesApxExtendedEvexPrefix(id)) + { + assert(IsApxExtendedEvexInstruction(ins)); + assert(hasEvexPrefix(code)); + // Evex.pp should already be added when adding the prefix. 
+ assert((code & EXTENDED_EVEX_PP_BITS) != 0); + } + else + { + dst += emitOutputByte(dst, 0x66); + } FALLTHROUGH; case EA_4BYTE: @@ -15588,8 +16319,18 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) } } - unsigned regCode = insEncodeReg345(id, regFor345Bits, size, &code); - regCode |= insEncodeReg012(id, regFor012Bits, size, &code); + unsigned regCode; + if (!id->idIsEvexNdContextSet() || !IsApxNDDEncodableInstruction(ins)) + { + regCode = insEncodeReg345(id, regFor345Bits, size, &code); + regCode |= insEncodeReg012(id, regFor012Bits, size, &code); + } + else + { + // unary ins with NDD form use Evex.vvvvv for dst, and ModRM.rm for src + code = insEncodeReg3456(id, reg1, size, code); + regCode = insEncodeReg012(id, reg2, size, &code); + } if (TakesSimdPrefix(id)) { @@ -15647,6 +16388,11 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) dst += emitOutputByte(dst, (code >> 8) & 0xFF); dst += emitOutputByte(dst, (0xC0 | regCode)); } + else if (IsApxNDDEncodableInstruction(ins) && id->idIsEvexNdContextSet()) + { + dst += emitOutputByte(dst, (code & 0xFF)); + dst += emitOutputByte(dst, (0xC0 | regCode | (code >> 8))); + } else { dst += emitOutputWord(dst, code); @@ -15656,155 +16402,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) // Does this instruction operate on a GC ref value? 
if (id->idGCref()) { - switch (id->idInsFmt()) - { - case IF_RRD_RRD: - break; - - case IF_RWR_RRD: - { - if (emitSyncThisObjReg != REG_NA && emitIGisInProlog(emitCurIG) && reg2 == (int)REG_ARG_0) - { - // We're relocating "this" in the prolog - assert(emitComp->lvaIsOriginalThisArg(0)); - assert(emitComp->lvaTable[0].lvRegister); - assert(emitComp->lvaTable[0].GetRegNum() == reg1); - - if (emitFullGCinfo) - { - emitGCregLiveSet(id->idGCref(), genRegMask(reg1), dst, true); - break; - } - else - { - /* If emitFullGCinfo==false, the we don't use any - regPtrDsc's and so explicitly note the location - of "this" in GCEncode.cpp - */ - } - } - - emitGCregLiveUpd(id->idGCref(), reg1, dst); - break; - } - - case IF_RRW_RRD: - { - switch (id->idIns()) - { - /* - This must be one of the following cases: - - xor reg, reg to assign NULL - - and r1 , r2 if (ptr1 && ptr2) ... - or r1 , r2 if (ptr1 || ptr2) ... - - add r1 , r2 to compute a normal byref - sub r1 , r2 to compute a strange byref (VC only) - - */ - case INS_xor: - assert(reg1 == reg2); - emitGCregLiveUpd(id->idGCref(), reg1, dst); - break; - - case INS_or: - case INS_and: - emitGCregDeadUpd(reg1, dst); - break; - - case INS_add: - case INS_sub: - case INS_sub_hide: - assert(id->idGCref() == GCT_BYREF); - -#if 0 -#ifdef DEBUG - // Due to elided register moves, we can't have the following assert. - // For example, consider: - // t85 = LCL_VAR byref V01 arg1 rdx (last use) REG rdx - // /--* t85 byref - // * STORE_LCL_VAR byref V40 tmp31 rdx REG rdx - // Here, V01 is type `long` on entry, then is stored as a byref. But because - // the register allocator assigned the same register, no instruction was - // generated, and we only (currently) make gcref/byref changes in emitter GC info - // when an instruction is generated. We still generate correct GC info, as this - // instruction, if writing a GC ref even through reading a long, will go live here. 
- // These situations typically occur due to unsafe casting, such as with Span. - - regMaskTP regMask; - regMask = genRegMask(reg1) | genRegMask(reg2); - - // r1/r2 could have been a GCREF as GCREF + int=BYREF - // or BYREF+/-int=BYREF - assert(((regMask & emitThisGCrefRegs) && (ins == INS_add)) || - ((regMask & emitThisByrefRegs) && (ins == INS_add || ins == INS_sub || ins == INS_sub_hide))); -#endif // DEBUG -#endif // 0 - - // Mark r1 as holding a byref - emitGCregLiveUpd(GCT_BYREF, reg1, dst); - break; - - default: -#ifdef DEBUG - emitDispIns(id, false, false, false); -#endif - assert(!"unexpected GC reg update instruction"); - } - - break; - } - - case IF_RRW_RRW: - { - // This must be "xchg reg1, reg2" - assert(id->idIns() == INS_xchg); - - // If we got here, the GC-ness of the registers doesn't match, so we have to "swap" them in the GC - // register pointer mask. - - GCtype gc1, gc2; - - gc1 = emitRegGCtype(reg1); - gc2 = emitRegGCtype(reg2); - - if (gc1 != gc2) - { - // Kill the GC-info about the GC registers - - if (needsGC(gc1)) - { - emitGCregDeadUpd(reg1, dst); - } - - if (needsGC(gc2)) - { - emitGCregDeadUpd(reg2, dst); - } - - // Now, swap the info - - if (needsGC(gc1)) - { - emitGCregLiveUpd(gc1, reg2, dst); - } - - if (needsGC(gc2)) - { - emitGCregLiveUpd(gc2, reg1, dst); - } - } - break; - } - - default: -#ifdef DEBUG - emitDispIns(id, false, false, false); -#endif - assert(!"unexpected GC ref instruction format"); - } + emitHandleGCrefRegs(dst, id); } else { @@ -15849,8 +16447,9 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) code_t code; instruction ins = id->idIns(); - assert(IsVexOrEvexEncodableInstruction(ins)); - assert(IsThreeOperandAVXInstruction(ins) || isAvxBlendv(ins) || isAvx512Blendv(ins) || IsKInstruction(ins)); + assert(IsVexOrEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)); + assert(IsThreeOperandAVXInstruction(ins) || isAvxBlendv(ins) || isAvx512Blendv(ins) || IsKInstruction(ins) || + 
IsApxExtendedEvexInstruction(ins)); regNumber targetReg = id->idReg1(); regNumber src1 = id->idReg2(); regNumber src2 = id->idReg3(); @@ -15859,6 +16458,51 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) code = insCodeRM(ins); code = AddX86PrefixIfNeeded(id, code, size); + if (IsApxExtendedEvexInstruction(ins) && !IsBMIInstruction(ins)) + { + // TODO-XArch-apx: + // For rm-like operand encoding instructions: + // legacy promoted EVEX encoding has introduced different semantic: + // op1 - vvvvv + // op2 - MODRM.REG + // op3 - MODRM.R/M + regNumber tmp = src1; + src1 = targetReg; + targetReg = tmp; + + switch (size) + { + case EA_1BYTE: + // TODO-APX : verify We should never end up here. Atleast for instructions I have looked into, we + // promote to int to do operation + noway_assert(RBM_BYTE_REGS & genRegMask(src1)); + noway_assert(RBM_BYTE_REGS & genRegMask(src2)); + noway_assert(RBM_BYTE_REGS & genRegMask(targetReg)); + break; + + case EA_2BYTE: + case EA_4BYTE: + // Set the 'w' bit to get the large version + code = insIsCMOV(ins) ? code : (code | (0x01)); + break; + +#ifdef TARGET_AMD64 + case EA_8BYTE: + // TODO-AMD64-CQ: Better way to not emit REX.W when we don't need it + // Don't need to zero out the high bits explicitly + code = AddRexWPrefix(id, code); // TODO-APX : Revisit. does xor or other cases need to be handled + // differently? see emitOutputRR + // Set the 'w' bit to get the large version + code = insIsCMOV(ins) ? 
code : (code | (0x01)); + break; + +#endif // TARGET_AMD64 + + default: + assert(!"unexpected size"); + } + } + code = insEncodeRMreg(id, code); if (TakesRexWPrefix(id)) @@ -15906,7 +16550,10 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) dst += emitOutputByte(dst, (0xC0 | regCode)); } - noway_assert(!id->idGCref()); + if (id->idGCref()) + { + emitHandleGCrefRegs(dst, id); + } if (!emitInsCanOnlyWriteSSE2OrAVXReg(id)) { @@ -16089,6 +16736,12 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) useACC = true; } } + + if (TakesApxExtendedEvexPrefix(id)) + { + // ACC form does not have support for promoted EVEX. + useACC = false; + } } else { @@ -16144,7 +16797,10 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) case EA_2BYTE: // Output a size prefix for a 16-bit operand - dst += emitOutputByte(dst, 0x66); + if (!TakesApxExtendedEvexPrefix(id)) + { + dst += emitOutputByte(dst, 0x66); + } FALLTHROUGH; case EA_4BYTE: @@ -16822,7 +17478,23 @@ ssize_t emitter::GetInputSizeInBytes(instrDesc* id) const // ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspInByte) { - assert(TakesEvexPrefix(id)); + assert(TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)); + + if (!hasTupleTypeInfo(id->idIns())) + { + // After APX, some instructions with APX features will be promoted + // to APX-EVEX, we will re-use the existing displacement emitting + // path, but for those instructions with no tuple information, + // APX-EVEX treat the scaling factor to be 1 constantly. + instruction ins = id->idIns(); + // TODO-XArch-APX: + // This assert may need tweak if BMI1 instructions are promoted + // into EVEX for multiple features, currently only EVEX.NF. 
+ assert(IsApxExtendedEvexInstruction(id->idIns())); + *dspInByte = ((signed char)dsp == (ssize_t)dsp); + return dsp; + } + insTupleType tt = insTupleTypeInfo(id->idIns()); assert(hasTupleTypeInfo(id->idIns())); @@ -17477,7 +18149,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } // Output a size prefix for a 16-bit operand - if (size == EA_2BYTE) + if (size == EA_2BYTE && !TakesApxExtendedEvexPrefix(id)) { dst += emitOutputByte(dst, 0x66); } @@ -17493,6 +18165,37 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) break; } + case IF_RWR_RRD_SHF: + { + assert(IsApxExtendedEvexInstruction(ins)); + code = insCodeMR(ins); + code = AddX86PrefixIfNeeded(id, code, size); + code = insEncodeMRreg(id, id->idReg2(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); + + // set the W bit + if (size != EA_1BYTE) + { + code |= 1; + } + + // Emit the REX prefix if it exists + if (TakesRexWPrefix(id)) + { + code = AddRexWPrefix(id, code); + } + + dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); + dst += emitOutputWord(dst, code); + dst += emitOutputByte(dst, emitGetInsSC(id)); + sz = emitSizeOfInsDsc_CNS(id); + + // Update GC info. + assert(!id->idGCref()); + emitGCregDeadUpd(id->idReg1(), dst); + break; + } + case IF_RRD_RRD: case IF_RWR_RRD: case IF_RRW_RRD: @@ -17566,7 +18269,105 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) // Also, determine which operand goes where in the ModRM byte. 
regNumber mReg; regNumber rReg; - if (hasCodeMR(ins)) + if (IsApxExtendedEvexInstruction(ins)) + { + assert(hasCodeMI(ins)); + code = insCodeMI(ins); + code = AddX86PrefixIfNeeded(id, code, size); + code = insEncodeReg3456(id, id->idReg1(), size, code); + mReg = id->idReg2(); + code = insEncodeMIreg(id, mReg, size, code); + rReg = REG_NA; + ssize_t val = emitGetInsSC(id); + bool valInByte = ((signed char)val == (target_ssize_t)val) && (ins != INS_mov) && (ins != INS_test); + + switch (size) + { + case EA_1BYTE: + break; + + case EA_2BYTE: + code |= EXTENDED_EVEX_PP_BITS; + FALLTHROUGH; + + case EA_4BYTE: + code |= 1; + break; + +#ifdef TARGET_AMD64 + case EA_8BYTE: + code = AddRexWPrefix(id, code); + code |= 1; + break; +#endif // TARGET_AMD64 + + default: + assert(!"unexpected size"); + } + + dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); + + if (valInByte && size > EA_1BYTE) + { + code |= 2; + dst += emitOutputWord(dst, code); + dst += emitOutputByte(dst, val); + } + else + { + dst += emitOutputWord(dst, code); + switch (size) + { + case EA_1BYTE: + dst += emitOutputByte(dst, val); + break; + case EA_2BYTE: + dst += emitOutputWord(dst, val); + break; + case EA_4BYTE: + dst += emitOutputLong(dst, val); + break; +#ifdef TARGET_AMD64 + case EA_8BYTE: + dst += emitOutputLong(dst, val); + break; +#endif // TARGET_AMD64 + default: + break; + } + + if (id->idIsCnsReloc()) + { + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)(size_t)val, IMAGE_REL_BASED_HIGHLOW); + assert(size == EA_4BYTE); + } + } + + sz = emitSizeOfInsDsc_CNS(id); + + if (!emitInsCanOnlyWriteSSE2OrAVXReg(id)) + { + emitGCregDeadUpd(id->idReg1(), dst); + } + + switch (id->idInsFmt()) + { + case IF_RWR_RRD_CNS: + assert(!instrIs3opImul(ins)); + + emitGCregDeadUpd(id->idReg1(), dst); + break; + + default: +#ifdef DEBUG + emitDispIns(id, false, false, false); +#endif + assert(!"unexpected GC ref instruction format"); + } + + break; + } + else if (hasCodeMR(ins)) { code = 
insCodeMR(ins); // Emit the VEX prefix if it exists @@ -17801,6 +18602,23 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) { code = insCodeRM(ins); + if (id->idIsEvexNdContextSet() && TakesApxExtendedEvexPrefix(id)) + { + // TODO-XArch-APX: + // I'm not sure why instructions on this path can be with instruction + // format other than IF_RWR_RRD_ARD, fix here for debug purpose only, + // need revisit. + id->idInsFmt(IF_RWR_RRD_ARD); + + code = AddX86PrefixIfNeeded(id, code, size); + code = insEncodeReg3456(id, id->idReg1(), size, code); + regcode = (insEncodeReg345(id, id->idReg2(), size, &code) << 8); + dst = emitOutputAM(dst, id, code | regcode); + + sz = emitSizeOfInsDsc_AMD(id); + break; + } + if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { // Special case 4-byte AVX instructions as the @@ -18068,7 +18886,19 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_RRW_RRD_SRD: case IF_RWR_RWR_SRD: { - assert(IsVexOrEvexEncodableInstruction(ins)); + assert(IsVexOrEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)); + + if (id->idIsEvexNdContextSet() && IsApxNDDEncodableInstruction(ins)) + { + // EVEX.vvvv has different semantic for APX-EVEX NDD instructions. 
+ code = insCodeRM(ins); + code = AddX86PrefixIfNeeded(id, code, size); + code = insEncodeReg3456(id, id->idReg1(), size, code); + regcode = (insEncodeReg345(id, id->idReg2(), size, &code) << 8); + dst = emitOutputSV(dst, id, code | regcode); + sz = sizeof(instrDesc); + break; + } code = insCodeRM(ins); code = AddX86PrefixIfNeeded(id, code, size); @@ -18895,7 +19725,9 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; case INS_inc: + case INS_inc_no_evex: case INS_dec: + case INS_dec_no_evex: case INS_neg: case INS_not: if (memFmt == IF_NONE) @@ -18980,10 +19812,13 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; case INS_add: + case INS_add_no_evex: case INS_sub: case INS_sub_hide: case INS_and: + case INS_and_no_evex: case INS_or: + case INS_or_no_evex: case INS_xor: if (memFmt == IF_NONE) { @@ -19133,6 +19968,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; case IF_RRW: + // TODO-XArch-APX: to be verified if this data is correct for NDD form. + case IF_RWR_RRD: // ins reg, cl result.insThroughput = PERFSCORE_THROUGHPUT_2C; result.insLatency = PERFSCORE_LATENCY_2C; @@ -19160,6 +19997,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins switch (insFmt) { case IF_RRW: + // TODO-XArch-APX: to be verified if this data is correct for NDD form. + case IF_RWR_RRD: // ins reg, 1 result.insThroughput = PERFSCORE_THROUGHPUT_2X; break; @@ -19193,6 +20032,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins switch (insFmt) { case IF_RRW_SHF: + // TODO-XArch-APX: to be verified if this data is correct for NDD form. 
+ case IF_RWR_RRD_SHF: // ins reg, cns result.insThroughput = PERFSCORE_THROUGHPUT_2X; break; @@ -20128,6 +20969,11 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_vshuff64x2: case INS_vshufi32x4: case INS_vshufi64x2: +#ifdef TARGET_AMD64 + case INS_popcnt_evex: + case INS_lzcnt_evex: + case INS_tzcnt_evex: +#endif // TARGET_AMD64 { result.insThroughput = PERFSCORE_THROUGHPUT_1C; result.insLatency += PERFSCORE_LATENCY_3C; diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 5f820c7c022c2..0558be85e5a58 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -134,11 +134,18 @@ static regNumber getSseShiftRegNumber(instruction ins); bool HasVexEncoding(instruction ins) const; bool HasEvexEncoding(instruction ins) const; bool HasRex2Encoding(instruction ins) const; +bool HasApxNdd(instruction ins) const; +bool HasApxNf(instruction ins) const; bool IsVexEncodableInstruction(instruction ins) const; bool IsEvexEncodableInstruction(instruction ins) const; bool IsRex2EncodableInstruction(instruction ins) const; +bool IsApxNDDEncodableInstruction(instruction ins) const; +bool IsApxNFEncodableInstruction(instruction ins) const; +bool IsApxExtendedEvexInstruction(instruction ins) const; +bool IsShiftInstruction(instruction ins) const; bool IsLegacyMap1(code_t code) const; bool IsVexOrEvexEncodableInstruction(instruction ins) const; +bool DoJitUseApxNDD(instruction ins) const; code_t insEncodeMIreg(const instrDesc* id, regNumber reg, emitAttr size, code_t code); @@ -179,6 +186,8 @@ bool AreFlagsSetForSignJumpOpt(regNumber reg, emitAttr opSize, GenCondition cond insOpts GetEmbRoundingMode(uint8_t mode) const; +void emitHandleGCrefRegs(BYTE* dst, instrDesc* id); + bool hasRexPrefix(code_t code) { #ifdef TARGET_AMD64 @@ -332,6 +341,18 @@ void SetUseRex2Encoding(bool value) useRex2Encodings = value; } +// Is Promoted EVEX encoding supported. 
+bool usePromotedEVEXEncodings; +bool UsePromotedEVEXEncoding() const +{ + return usePromotedEVEXEncodings; +} + +void SetUsePromotedEVEXEncoding(bool value) +{ + usePromotedEVEXEncodings = value; +} + //------------------------------------------------------------------------ // UseSimdEncoding: Returns true if either VEX or EVEX encoding is supported // contains Evex prefix. @@ -349,6 +370,7 @@ bool UseSimdEncoding() const #define EVEX_PREFIX_CODE 0x6200000000000000ULL bool TakesEvexPrefix(const instrDesc* id) const; +bool TakesApxExtendedEvexPrefix(const instrDesc* id) const; //------------------------------------------------------------------------ // hasEvexPrefix: Returns true if the instruction encoding already @@ -405,11 +427,7 @@ code_t AddSimdPrefixIfNeeded(const instrDesc* id, code_t code, emitAttr size) // code_t AddX86PrefixIfNeeded(const instrDesc* id, code_t code, emitAttr size) { - // TODO-xarch-apx: - // consider refactor this part with AddSimdPrefixIfNeeded as a lot of functionality - // of these functions are overlapping. - - if (TakesEvexPrefix(id)) + if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) { return AddEvexPrefix(id, code, size); } @@ -445,7 +463,7 @@ code_t AddX86PrefixIfNeededAndNotPresent(const instrDesc* id, code_t code, emitA // consider refactor this part with AddSimdPrefixIfNeeded as a lot of functionality // of these functions are overlapping. - if (TakesEvexPrefix(id)) + if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) { return !hasEvexPrefix(code) ? AddEvexPrefix(id, code, size) : code; } @@ -511,6 +529,48 @@ void SetEvexEmbMaskIfNeeded(instrDesc* id, insOpts instOptions) } } +//------------------------------------------------------------------------ +// SetEvexNdIfNeeded: set NDD form - new data destination if needed. 
+// +// Arguments: +// id - instruction descriptor +// instOptions - emit options +// +void SetEvexNdIfNeeded(instrDesc* id, insOpts instOptions) +{ + if ((instOptions & INS_OPTS_EVEX_nd_MASK) != 0) + { + assert(UsePromotedEVEXEncoding()); + assert(IsApxNDDEncodableInstruction(id->idIns())); + id->idSetEvexNdContext(); + } + else + { + assert((instOptions & INS_OPTS_EVEX_nd_MASK) == 0); + } +} + +//------------------------------------------------------------------------ +// SetEvexNdIfNeeded: set Evex.nf on instrDesc +// +// Arguments: +// id - instruction descriptor +// instOptions - emit options +// +void SetEvexNfIfNeeded(instrDesc* id, insOpts instOptions) +{ + if ((instOptions & INS_OPTS_EVEX_nf_MASK) != 0) + { + assert(UsePromotedEVEXEncoding()); + assert(IsApxNFEncodableInstruction(id->idIns())); + id->idSetEvexNfContext(); + } + else + { + assert((instOptions & INS_OPTS_EVEX_nf_MASK) == 0); + } +} + //------------------------------------------------------------------------ // AddSimdPrefixIfNeeded: Add the correct SIMD prefix. // Check if the prefix already exists befpre adding. 
@@ -753,7 +813,7 @@ void emitIns_Data16(); void emitIns_I(instruction ins, emitAttr attr, cnsval_ssize_t val); -void emitIns_R(instruction ins, emitAttr attr, regNumber reg); +void emitIns_R(instruction ins, emitAttr attr, regNumber reg, insOpts instOptions = INS_OPTS_NONE); void emitIns_C(instruction ins, emitAttr attr, CORINFO_FIELD_HANDLE fdlHnd, int offs); @@ -762,7 +822,9 @@ void emitIns_A(instruction ins, emitAttr attr, GenTreeIndir* indir); void emitIns_R_I(instruction ins, emitAttr attr, regNumber reg, - ssize_t val DEBUGARG(size_t targetHandle = 0) DEBUGARG(GenTreeFlags gtFlags = GTF_EMPTY)); + ssize_t val, + insOpts instOptions = INS_OPTS_NONE DEBUGARG(size_t targetHandle = 0) + DEBUGARG(GenTreeFlags gtFlags = GTF_EMPTY)); void emitIns_Mov(instruction ins, emitAttr attr, regNumber dstReg, regNumber srgReg, bool canSkip); diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h index 5ec40ea333973..d52a60fc8db26 100644 --- a/src/coreclr/jit/instr.h +++ b/src/coreclr/jit/instr.h @@ -221,6 +221,12 @@ enum insFlags : uint64_t // APX: REX2 prefix: Encoding_REX2 = 1ULL << 44, + // APX: EVEX.ND: + INS_Flags_Has_NDD = 1ULL << 45, + + // APX: EVEX.NF: + INS_Flags_Has_NF = 1ULL << 46, + // TODO-Cleanup: Remove this flag and its usage from TARGET_XARCH INS_FLAGS_DONT_CARE = 0x00ULL, }; @@ -259,6 +265,17 @@ enum insOpts: unsigned INS_OPTS_EVEX_z_MASK = 0x20, // mask for EVEX.z related features INS_OPTS_EVEX_em_zero = 1 << 5, // Embedded mask merges with zero + + // One-bit: 0b0100_0000 + INS_OPTS_EVEX_nd_MASK = 0x40, // mask for APX-EVEX.nd related features + + INS_OPTS_EVEX_nd = 1 << 6, // NDD form for legacy instructions + + // One-bit: 0b1000_0000 + INS_OPTS_EVEX_nf_MASK = 0x80, // mask for APX-EVEX.nf related features + + INS_OPTS_EVEX_nf = 1 << 7, // NDD form for legacy instructions + }; #elif defined(TARGET_ARM) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) diff --git a/src/coreclr/jit/instrsxarch.h 
b/src/coreclr/jit/instrsxarch.h index fbc635ab5553b..c48b5f5f1876d 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -58,26 +58,31 @@ INST5(pop, "pop", IUM_WR, 0x00008E, BAD_CODE, INST5(push_hide, "push", IUM_RD, 0x0030FE, 0x000068, BAD_CODE, BAD_CODE, 0x000050, INS_TT_NONE, Encoding_REX2) INST5(pop_hide, "pop", IUM_WR, 0x00008E, BAD_CODE, BAD_CODE, BAD_CODE, 0x000058, INS_TT_NONE, Encoding_REX2) -INST5(inc, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x000040, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit |Encoding_REX2) -INST5(inc_l, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C0FE, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Encoding_REX2) -INST5(dec, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x000048, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST5(dec_l, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C8FE, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Encoding_REX2) +INST5(inc, "inc", IUM_RW, 0x0000FE, BAD_CODE, 0x0000FE, BAD_CODE, 0x000040, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST5(inc_l, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C0FE, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Encoding_REX2 | INS_Flags_Has_NF) +INST5(inc_no_evex, "inc", IUM_RW, 0x0000FE, BAD_CODE, 0x0000FE, BAD_CODE, 0x000040, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST5(dec, "dec", IUM_RW, 0x0008FE, BAD_CODE, 0x0008FE, BAD_CODE, 0x000048, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST5(dec_l, "dec", IUM_RW, 
0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C8FE, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Encoding_REX2 | INS_Flags_Has_NF) +INST5(dec_no_evex, "dec", IUM_RW, 0x0008FE, BAD_CODE, 0x0008FE, BAD_CODE, 0x000048, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit | Encoding_REX2) // Multi-byte opcodes without modrm are represented in mixed endian fashion. // See comment around quarter way through this file for more information. INST5(bswap, "bswap", IUM_RW, 0x0F00C8, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C80F, INS_TT_NONE, Encoding_REX2) // id nm um mr mi rm a4 tt flags -INST4(add, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST4(or, "or", IUM_RW, 0x000008, 0x000880, 0x00000A, 0x00000C, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(add, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST4(add_no_evex, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(or, "or", IUM_RW, 0x000008, 0x000880, 0x00000A, 0x00000C, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST4(or_no_evex, "or", IUM_RW, 0x000008, 0x000880, 0x00000A, 0x00000C, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) 
INST4(adc, "adc", IUM_RW, 0x000010, 0x001080, 0x000012, 0x000014, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | Reads_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) INST4(sbb, "sbb", IUM_RW, 0x000018, 0x001880, 0x00001A, 0x00001C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | Reads_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST4(and, "and", IUM_RW, 0x000020, 0x002080, 0x000022, 0x000024, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST4(sub, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(and, "and", IUM_RW, 0x000020, 0x002080, 0x000022, 0x000024, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST4(and_no_evex, "and", IUM_RW, 0x000020, 0x002080, 0x000022, 0x000024, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(sub, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) // Does not affect the stack tracking in the emitter INST4(sub_hide, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST4(xor, "xor", IUM_RW, 0x000030, 0x003080, 0x000032, 0x000034, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | 
INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(xor, "xor", IUM_RW, 0x000030, 0x003080, 0x000032, 0x000034, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) INST4(cmp, "cmp", IUM_RD, 0x000038, 0x003880, 0x00003A, 0x00003C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) INST4(test, "test", IUM_RD, 0x000084, 0x0000F6, 0x000084, 0x0000A8, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) INST4(mov, "mov", IUM_WR, 0x000088, 0x0000C6, 0x00008A, 0x0000B0, INS_TT_NONE, INS_FLAGS_Has_Wbit | Encoding_REX2) @@ -99,25 +104,25 @@ INST3(movsxd, "movsxd", IUM_WR, BAD_CODE, BAD_CODE, #endif INST3(movzx, "movzx", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00B6, INS_TT_NONE, INS_FLAGS_Has_Wbit | Encoding_REX2) -INST3(cmovo, "cmovo", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0040, INS_TT_NONE, Reads_OF | Encoding_REX2) -INST3(cmovno, "cmovno", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0041, INS_TT_NONE, Reads_OF | Encoding_REX2) -INST3(cmovb, "cmovb", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0042, INS_TT_NONE, Reads_CF | Encoding_REX2) -INST3(cmovae, "cmovae", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0043, INS_TT_NONE, Reads_CF | Encoding_REX2) -INST3(cmove, "cmove", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0044, INS_TT_NONE, Reads_ZF | Encoding_REX2) -INST3(cmovne, "cmovne", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0045, INS_TT_NONE, Reads_ZF | Encoding_REX2) -INST3(cmovbe, "cmovbe", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0046, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2) -INST3(cmova, "cmova", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0047, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2) -INST3(cmovs, "cmovs", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0048, INS_TT_NONE, Reads_SF | Encoding_REX2) -INST3(cmovns, "cmovns", IUM_WR, BAD_CODE, BAD_CODE, 
0x0F0049, INS_TT_NONE, Reads_SF | Encoding_REX2) -INST3(cmovp, "cmovp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004A, INS_TT_NONE, Reads_PF | Encoding_REX2) -INST3(cmovnp, "cmovnp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004B, INS_TT_NONE, Reads_PF | Encoding_REX2) -INST3(cmovl, "cmovl", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004C, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2) -INST3(cmovge, "cmovge", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004D, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2) -INST3(cmovle, "cmovle", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2) -INST3(cmovg, "cmovg", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2) +INST3(cmovo, "cmovo", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0040, INS_TT_NONE, Reads_OF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovno, "cmovno", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0041, INS_TT_NONE, Reads_OF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovb, "cmovb", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0042, INS_TT_NONE, Reads_CF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovae, "cmovae", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0043, INS_TT_NONE, Reads_CF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmove, "cmove", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0044, INS_TT_NONE, Reads_ZF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovne, "cmovne", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0045, INS_TT_NONE, Reads_ZF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovbe, "cmovbe", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0046, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmova, "cmova", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0047, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovs, "cmovs", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0048, INS_TT_NONE, Reads_SF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovns, "cmovns", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0049, INS_TT_NONE, Reads_SF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovp, "cmovp", IUM_WR, BAD_CODE, 
BAD_CODE, 0x0F004A, INS_TT_NONE, Reads_PF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovnp, "cmovnp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004B, INS_TT_NONE, Reads_PF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovl, "cmovl", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004C, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovge, "cmovge", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004D, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovle, "cmovle", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovg, "cmovg", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2 | INS_Flags_Has_NDD) INST3(xchg, "xchg", IUM_RW, 0x000086, BAD_CODE, 0x000086, INS_TT_NONE, INS_FLAGS_Has_Wbit | Encoding_REX2) -INST3(imul, "imul", IUM_RW, 0x0F00AC, BAD_CODE, 0x0F00AF, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) +INST3(imul, "imul", IUM_RW, 0x0F00AC, BAD_CODE, 0x0F00AF, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NDD | INS_Flags_Has_NF) // id nm um mr mi rm tt flags @@ -125,25 +130,25 @@ INST3(imul, "imul", IUM_RW, 0x0F00AC, BAD_CODE, // as 2-operand instructions with the target register being implicit // implicit_reg = op1*op2_icon #define INSTMUL INST3 -INSTMUL(imul_AX, "imul", IUM_RD, BAD_CODE, 0x000068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_CX, "imul", IUM_RD, BAD_CODE, 0x000868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_DX, "imul", IUM_RD, BAD_CODE, 0x001068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | 
INS_FLAGS_Has_Sbit) -INSTMUL(imul_BX, "imul", IUM_RD, BAD_CODE, 0x001868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_SP, "imul", IUM_RD, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_BP, "imul", IUM_RD, BAD_CODE, 0x002868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_SI, "imul", IUM_RD, BAD_CODE, 0x003068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_DI, "imul", IUM_RD, BAD_CODE, 0x003868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) +INSTMUL(imul_AX, "imul", IUM_RD, BAD_CODE, 0x000068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_CX, "imul", IUM_RD, BAD_CODE, 0x000868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_DX, "imul", IUM_RD, BAD_CODE, 0x001068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_BX, "imul", IUM_RD, BAD_CODE, 0x001868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_SP, "imul", IUM_RD, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_BP, "imul", IUM_RD, BAD_CODE, 0x002868, BAD_CODE, 
INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_SI, "imul", IUM_RD, BAD_CODE, 0x003068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_DI, "imul", IUM_RD, BAD_CODE, 0x003868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) #ifdef TARGET_AMD64 -INSTMUL(imul_08, "imul", IUM_RD, BAD_CODE, 0x4400000068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_09, "imul", IUM_RD, BAD_CODE, 0x4400000868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_10, "imul", IUM_RD, BAD_CODE, 0x4400001068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_11, "imul", IUM_RD, BAD_CODE, 0x4400001868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_12, "imul", IUM_RD, BAD_CODE, 0x4400002068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_13, "imul", IUM_RD, BAD_CODE, 0x4400002868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_14, "imul", IUM_RD, BAD_CODE, 0x4400003068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_15, "imul", IUM_RD, BAD_CODE, 0x4400003868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | 
Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) +INSTMUL(imul_08, "imul", IUM_RD, BAD_CODE, 0x4400000068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_09, "imul", IUM_RD, BAD_CODE, 0x4400000868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_10, "imul", IUM_RD, BAD_CODE, 0x4400001068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_11, "imul", IUM_RD, BAD_CODE, 0x4400001868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_12, "imul", IUM_RD, BAD_CODE, 0x4400002068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_13, "imul", IUM_RD, BAD_CODE, 0x4400002868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_14, "imul", IUM_RD, BAD_CODE, 0x4400003068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_15, "imul", IUM_RD, BAD_CODE, 0x4400003868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) #endif // TARGET_AMD64 @@ -588,11 +593,11 @@ INST3(LAST_AVXVNNI_INSTRUCTION, "LAST_AVXVNNI_INSTRUCTION", IUM_WR, BAD_CODE, BA INST3(FIRST_BMI_INSTRUCTION, "FIRST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // BMI1 -INST3(andn, "andn", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF2), 
INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF) // Logical AND NOT -INST3(bextr, "bextr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF) // Bit Field Extract -INST3(blsi, "blsi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Extract Lowest Set Isolated Bit -INST3(blsmsk, "blsmsk", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Resets_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Get Mask Up to Lowest Set Bit -INST3(blsr, "blsr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Reset Lowest Set Bit +INST3(andn, "andn", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF2), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF | INS_Flags_Has_NF) // Logical AND NOT +INST3(bextr, "bextr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF | INS_Flags_Has_NF) // Bit Field Extract +INST3(blsi, "blsi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Extract Lowest Set Isolated Bit +INST3(blsmsk, 
"blsmsk", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Resets_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Get Mask Up to Lowest Set Bit +INST3(blsr, "blsr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Reset Lowest Set Bit // BMI2 INST3(bzhi, "bzhi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Zero High Bits Starting with Specified Bit Position @@ -899,35 +904,43 @@ INST3(movbe, "movbe", IUM_WR, PCKMVB(0xF1), BAD_CODE, // POPCNT INST3(popcnt, "popcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xB8), INS_TT_NONE, Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Resets_CF | Encoding_REX2) +#if defined(TARGET_AMD64) +INST3(tzcnt_evex, "tzcnt", IUM_WR, BAD_CODE, BAD_CODE, 0x0000F4, INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Count the Number of Trailing Zero Bits +INST3(lzcnt_evex, "lzcnt", IUM_WR, BAD_CODE, BAD_CODE, 0x0000F5, INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) +INST3(popcnt_evex, "popcnt", IUM_WR, BAD_CODE, BAD_CODE, 0x000088, INS_TT_NONE, Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Resets_CF | INS_Flags_Has_NF) +#endif // TARGET_AMD64 + +INST3(neg, "neg", IUM_RW, 0x0018F6, BAD_CODE, 0x0018F6, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(not, "not", IUM_RW, 0x0010F6, BAD_CODE, 0x0010F6, INS_TT_NONE, 
INS_FLAGS_None | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD) + +INST3(rol, "rol", IUM_RW, 0x0000D2, BAD_CODE, 0x0000D2, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(rol_1, "rol", IUM_RW, 0x0000D0, 0x0000D0, 0x0000D0, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(rol_N, "rol", IUM_RW, 0x0000C0, 0x0000C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(ror, "ror", IUM_RW, 0x0008D2, BAD_CODE, 0x0008D2, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(ror_1, "ror", IUM_RW, 0x0008D0, 0x0008D0, 0x0008D0, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(ror_N, "ror", IUM_RW, 0x0008C0, 0x0008C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) + +INST3(rcl, "rcl", IUM_RW, 0x0010D2, BAD_CODE, 0x0010D2, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(rcl_1, "rcl", IUM_RW, 0x0010D0, 0x0010D0, 0x0010D0, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(rcl_N, "rcl", IUM_RW, 0x0010C0, 0x0010C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit) +INST3(rcr, "rcr", IUM_RW, 0x0018D2, BAD_CODE, 0x0018D2, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(rcr_1, "rcr", IUM_RW, 0x0018D0, 0x0018D0, 0x0018D0, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(rcr_N, "rcr", IUM_RW, 
0x0018C0, 0x0018C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit) +INST3(shl, "shl", IUM_RW, 0x0020D2, BAD_CODE, 0x0020D2, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(shl_1, "shl", IUM_RW, 0x0020D0, 0x0020D0, 0x0020D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(shl_N, "shl", IUM_RW, 0x0020C0, 0x0020C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(shr, "shr", IUM_RW, 0x0028D2, BAD_CODE, 0x0028D2, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(shr_1, "shr", IUM_RW, 0x0028D0, 0x0028D0, 0x0028D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(shr_N, "shr", IUM_RW, 0x0028C0, 0x0028C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(sar, "sar", IUM_RW, 0x0038D2, BAD_CODE, 0x0038D2, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(sar_1, "sar", IUM_RW, 0x0038D0, 0x0038D0, 0x0038D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(sar_N, "sar", IUM_RW, 0x0038C0, 0x0038C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | 
Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) + // id nm um mr mi flags INST2(ret, "ret", IUM_RD, 0x0000C3, 0x0000C2, INS_TT_NONE, INS_FLAGS_None) INST2(loop, "loop", IUM_RD, BAD_CODE, 0x0000E2, INS_TT_NONE, INS_FLAGS_None) INST2(call, "call", IUM_RD, 0x0010FF, 0x0000E8, INS_TT_NONE, Encoding_REX2) -INST2(rol, "rol", IUM_RW, 0x0000D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(rol_1, "rol", IUM_RW, 0x0000D0, 0x0000D0, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(rol_N, "rol", IUM_RW, 0x0000C0, 0x0000C0, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(ror, "ror", IUM_RW, 0x0008D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(ror_1, "ror", IUM_RW, 0x0008D0, 0x0008D0, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(ror_N, "ror", IUM_RW, 0x0008C0, 0x0008C0, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) - -INST2(rcl, "rcl", IUM_RW, 0x0010D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(rcl_1, "rcl", IUM_RW, 0x0010D0, 0x0010D0, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(rcl_N, "rcl", IUM_RW, 0x0010C0, 0x0010C0, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit) -INST2(rcr, "rcr", IUM_RW, 0x0018D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(rcr_1, "rcr", IUM_RW, 0x0018D0, 0x0018D0, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(rcr_N, "rcr", IUM_RW, 0x0018C0, 0x0018C0, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit) -INST2(shl, "shl", IUM_RW, 0x0020D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | 
Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(shl_1, "shl", IUM_RW, 0x0020D0, 0x0020D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(shl_N, "shl", IUM_RW, 0x0020C0, 0x0020C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(shr, "shr", IUM_RW, 0x0028D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(shr_1, "shr", IUM_RW, 0x0028D0, 0x0028D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(shr_N, "shr", IUM_RW, 0x0028C0, 0x0028C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(sar, "sar", IUM_RW, 0x0038D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(sar_1, "sar", IUM_RW, 0x0038D0, 0x0038D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(sar_N, "sar", IUM_RW, 0x0038C0, 0x0038C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) - - // id nm um mr flags INST1(r_movsb, "rep movsb", IUM_RD, 0x00A4F3, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit) INST1(r_movsd, "rep movsd", IUM_RD, 0x00A5F3, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit) @@ -959,15 +972,12 @@ INST1(leave, "leave", IUM_RD, 0x0000C9, INST1(serialize, "serialize", IUM_RD, 0x0fe801, INS_TT_NONE, INS_FLAGS_None) -INST1(neg, "neg", IUM_RW, 0x0018F6, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST1(not, "not", IUM_RW, 0x0010F6, 
INS_TT_NONE, INS_FLAGS_None | INS_FLAGS_Has_Wbit | Encoding_REX2) - INST1(cwde, "cwde", IUM_RD, 0x000098, INS_TT_NONE, INS_FLAGS_None) INST1(cdq, "cdq", IUM_RD, 0x000099, INS_TT_NONE, INS_FLAGS_None) -INST1(idiv, "idiv", IUM_RD, 0x0038F6, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST1(imulEAX, "imul", IUM_RD, 0x0028F6, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit) -INST1(div, "div", IUM_RD, 0x0030F6, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST1(mulEAX, "mul", IUM_RD, 0x0020F6, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST1(idiv, "idiv", IUM_RD, 0x0038F6, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NF) +INST1(imulEAX, "imul", IUM_RD, 0x0028F6, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit | INS_Flags_Has_NF) +INST1(div, "div", IUM_RD, 0x0030F6, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NF) +INST1(mulEAX, "mul", IUM_RD, 0x0020F6, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NF) INST1(sahf, "sahf", IUM_RD, 0x00009E, INS_TT_NONE, Restore_SF_ZF_AF_PF_CF) diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 4c5fc2e8d5328..0f62b7bf762d6 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -369,8 +369,9 @@ RELEASE_CONFIG_INTEGER(EnableMultiRegLocals, 
"EnableMultiRegLocals", 1) RELEASE_CONFIG_INTEGER(JitNoInline, "JitNoInline", 0) #if defined(DEBUG) -CONFIG_INTEGER(JitStressRex2Encoding, "JitStressRex2Encoding", 0) // Enable rex2 encoding for legacy instructions. -CONFIG_INTEGER(JitBypassAPXCheck, "JitBypassAPXCheck", 0) // Bypass APX CPUID check. +CONFIG_INTEGER(JitStressRex2Encoding, "JitStressRex2Encoding", 0) // Enable rex2 encoding for compatible instructions. +CONFIG_INTEGER(JitStressPromotedEvexEncoding, "JitStressPromotedEvexEncoding", 0) // Enable promoted EVEX encoding for + // compatible instructions. #endif // clang-format off @@ -440,6 +441,7 @@ RELEASE_CONFIG_INTEGER(EnableArm64Sve, "EnableArm64Sve", RELEASE_CONFIG_INTEGER(EnableEmbeddedBroadcast, "EnableEmbeddedBroadcast", 1) // Allows embedded broadcasts to be disabled RELEASE_CONFIG_INTEGER(EnableEmbeddedMasking, "EnableEmbeddedMasking", 1) // Allows embedded masking to be disabled +RELEASE_CONFIG_INTEGER(EnableApxNDD, "EnableApxNDD", 0) // Allows APX NDD feature to be disabled // clang-format on