diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index bd80187eea6a4..e8723723cda4c 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -433,12 +433,13 @@ void CodeGen::instGen_Set_Reg_To_Imm(emitAttr size, else { // For section constant, the immediate will be relocatable - GetEmitter()->emitIns_R_I(INS_mov, size, reg, imm DEBUGARG(targetHandle) DEBUGARG(gtFlags)); + GetEmitter()->emitIns_R_I(INS_mov, size, reg, imm, + INS_OPTS_NONE DEBUGARG(targetHandle) DEBUGARG(gtFlags)); } } else { - GetEmitter()->emitIns_R_I(INS_mov, size, reg, imm DEBUGARG(targetHandle) DEBUGARG(gtFlags)); + GetEmitter()->emitIns_R_I(INS_mov, size, reg, imm, INS_OPTS_NONE DEBUGARG(targetHandle) DEBUGARG(gtFlags)); } } regSet.verifyRegUsed(reg); @@ -769,12 +770,18 @@ void CodeGen::genCodeForNegNot(GenTree* tree) { GenTree* operand = tree->gtGetOp1(); assert(operand->isUsedFromReg()); - regNumber operandReg = genConsumeReg(operand); + regNumber operandReg = genConsumeReg(operand); + instruction ins = genGetInsForOper(tree->OperGet(), targetType); - inst_Mov(targetType, targetReg, operandReg, /* canSkip */ true); - - instruction ins = genGetInsForOper(tree->OperGet(), targetType); - inst_RV(ins, targetReg, targetType); + if (GetEmitter()->DoJitUseApxNDD(ins) && (targetReg != operandReg)) + { + GetEmitter()->emitIns_R_R(ins, emitTypeSize(operand), targetReg, operandReg, INS_OPTS_EVEX_nd); + } + else + { + inst_Mov(targetType, targetReg, operandReg, /* canSkip */ true); + inst_RV(ins, targetReg, targetType); + } } genProduceReg(tree); @@ -1189,12 +1196,49 @@ void CodeGen::genCodeForBinary(GenTreeOp* treeNode) // reg3 = reg3 op reg2 else { - var_types op1Type = op1->TypeGet(); - inst_Mov(op1Type, targetReg, op1reg, /* canSkip */ false); - regSet.verifyRegUsed(targetReg); - gcInfo.gcMarkRegPtrVal(targetReg, op1Type); - dst = treeNode; - src = op2; + if (emit->DoJitUseApxNDD(ins) && !varTypeIsFloating(treeNode)) + { + // 
TODO-xarch-apx: + // APX can provide optimal code gen in this case using NDD feature: + // reg3 = op1 op op2 without extra mov + + // see if it can be optimized by inc/dec + if (oper == GT_ADD && op2->isContainedIntOrIImmed() && !treeNode->gtOverflowEx()) + { + if (op2->IsIntegralConst(1)) + { + emit->emitIns_R_R(INS_inc, emitTypeSize(treeNode), targetReg, op1reg, INS_OPTS_EVEX_nd); + genProduceReg(treeNode); + return; + } + else if (op2->IsIntegralConst(-1)) + { + emit->emitIns_R_R(INS_dec, emitTypeSize(treeNode), targetReg, op1reg, INS_OPTS_EVEX_nd); + genProduceReg(treeNode); + return; + } + } + + assert(op1reg != targetReg); + assert(op2reg != targetReg); + emit->emitInsBinary(ins, emitTypeSize(treeNode), op1, op2, targetReg); + if (treeNode->gtOverflowEx()) + { + assert(oper == GT_ADD || oper == GT_SUB); + genCheckOverflow(treeNode); + } + genProduceReg(treeNode); + return; + } + else + { + var_types op1Type = op1->TypeGet(); + inst_Mov(op1Type, targetReg, op1reg, /* canSkip */ false); + regSet.verifyRegUsed(targetReg); + gcInfo.gcMarkRegPtrVal(targetReg, op1Type); + dst = treeNode; + src = op2; + } } // try to use an inc or dec @@ -1213,6 +1257,7 @@ void CodeGen::genCodeForBinary(GenTreeOp* treeNode) return; } } + regNumber r = emit->emitInsBinary(ins, emitTypeSize(treeNode), dst, src); noway_assert(r == targetReg); @@ -1326,6 +1371,24 @@ void CodeGen::genCodeForMul(GenTreeOp* treeNode) } assert(regOp->isUsedFromReg()); + if (emit->DoJitUseApxNDD(ins) && regOp->GetRegNum() != mulTargetReg) + { + // use NDD form to optimize this form: + // mov targetReg, regOp + // imul targetReg, rmOp + // to imul targetReg, regOp rmOp. 
+ emit->emitInsBinary(ins, size, regOp, rmOp, mulTargetReg); + if (requiresOverflowCheck) + { + // Overflow checking is only used for non-floating point types + noway_assert(!varTypeIsFloating(treeNode)); + + genCheckOverflow(treeNode); + } + genProduceReg(treeNode); + return; + } + // Setup targetReg when neither of the source operands was a matching register inst_Mov(targetType, mulTargetReg, regOp->GetRegNum(), /* canSkip */ true); @@ -4438,23 +4501,23 @@ void CodeGen::genCodeForLockAdd(GenTreeOp* node) if (imm == 1) { // inc [addr] - GetEmitter()->emitIns_AR(INS_inc, size, addr->GetRegNum(), 0); + GetEmitter()->emitIns_AR(INS_inc_no_evex, size, addr->GetRegNum(), 0); } else if (imm == -1) { // dec [addr] - GetEmitter()->emitIns_AR(INS_dec, size, addr->GetRegNum(), 0); + GetEmitter()->emitIns_AR(INS_dec_no_evex, size, addr->GetRegNum(), 0); } else { // add [addr], imm - GetEmitter()->emitIns_I_AR(INS_add, size, imm, addr->GetRegNum(), 0); + GetEmitter()->emitIns_I_AR(INS_add_no_evex, size, imm, addr->GetRegNum(), 0); } } else { // add [addr], data - GetEmitter()->emitIns_AR_R(INS_add, size, data->GetRegNum(), addr->GetRegNum(), 0); + GetEmitter()->emitIns_AR_R(INS_add_no_evex, size, data->GetRegNum(), addr->GetRegNum(), 0); } } @@ -4481,7 +4544,7 @@ void CodeGen::genLockedInstructions(GenTreeOp* node) if (node->OperIs(GT_XORR, GT_XAND)) { - const instruction ins = node->OperIs(GT_XORR) ? INS_or : INS_and; + const instruction ins = node->OperIs(GT_XORR) ? INS_or_no_evex : INS_and_no_evex; if (node->IsUnusedValue()) { @@ -4873,6 +4936,24 @@ void CodeGen::genCodeForShift(GenTree* tree) genProduceReg(tree); return; } + + if (GetEmitter()->DoJitUseApxNDD(ins) && (tree->GetRegNum() != operandReg)) + { + ins = genMapShiftInsToShiftByConstantIns(ins, shiftByValue); + // If APX is available, we can use NDD to optimize the case when LSRA failed to avoid explicit mov. + // this case might be rarely hit. 
+ if (shiftByValue == 1) + { + GetEmitter()->emitIns_R_R(ins, emitTypeSize(tree), tree->GetRegNum(), operandReg, INS_OPTS_EVEX_nd); + } + else + { + GetEmitter()->emitIns_R_R_I(ins, emitTypeSize(tree), tree->GetRegNum(), operandReg, shiftByValue, + INS_OPTS_EVEX_nd); + } + genProduceReg(tree); + return; + } #endif // First, move the operand to the destination register and // later on perform the shift in-place. @@ -4919,6 +5000,15 @@ void CodeGen::genCodeForShift(GenTree* tree) // The operand to be shifted must not be in ECX noway_assert(operandReg != REG_RCX); + if (GetEmitter()->DoJitUseApxNDD(ins) && (tree->GetRegNum() != operandReg)) + { + // If APX is available, we can use NDD to optimize the case when LSRA failed to avoid explicit mov. + // this case might be rarely hit. + GetEmitter()->emitIns_R_R(ins, emitTypeSize(tree), tree->GetRegNum(), operandReg, INS_OPTS_EVEX_nd); + genProduceReg(tree); + return; + } + inst_Mov(targetType, tree->GetRegNum(), operandReg, /* canSkip */ true); inst_RV(ins, tree->GetRegNum(), targetType); } @@ -9270,6 +9360,87 @@ void CodeGen::genAmd64EmitterUnitTestsApx() theEmitter->emitIns_S(INS_neg, EA_2BYTE, 0, 0); theEmitter->emitIns_S(INS_not, EA_2BYTE, 0, 0); + + // APX-EVEX + + theEmitter->emitIns_R_R_R(INS_add, EA_8BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_R(INS_sub, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_R(INS_or, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_R(INS_and, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_R(INS_xor, EA_1BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R_I(INS_or, EA_2BYTE, REG_R10, REG_EAX, 10565, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_I(INS_or, EA_8BYTE, REG_R10, REG_EAX, 10, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_S(INS_or, EA_8BYTE, REG_R10, REG_EAX, 0, 1, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R(INS_neg, 
EA_2BYTE, REG_R10, REG_ECX, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R(INS_shl, EA_2BYTE, REG_R11, REG_EAX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R(INS_shl_1, EA_2BYTE, REG_R11, REG_EAX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_I(INS_shl_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_I(INS_shl_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R(INS_inc, EA_2BYTE, REG_R11, REG_ECX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R(INS_dec, EA_2BYTE, REG_R11, REG_ECX, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R_R(INS_cmovo, EA_4BYTE, REG_R12, REG_R11, REG_EAX, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R_R(INS_imul, EA_4BYTE, REG_R12, REG_R11, REG_ECX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_S(INS_imul, EA_4BYTE, REG_R12, REG_R11, 0, 1, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R(INS_add, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_sub, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_and, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_or, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_xor, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_inc, EA_4BYTE, REG_R12, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_dec, EA_4BYTE, REG_R12, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_I(INS_add, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_I(INS_sub, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_I(INS_and, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_I(INS_or, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_I(INS_xor, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_S(INS_add, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_sub, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_and, EA_4BYTE, REG_R12, 0, 1, 
INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_or, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_xor, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R(INS_neg, EA_2BYTE, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_shl, EA_2BYTE, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_shl_1, EA_2BYTE, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_I(INS_shl_N, EA_2BYTE, REG_R11, 7, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_I(INS_shl_N, EA_2BYTE, REG_R11, 7, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_R(INS_imul, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_imul, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_I(INS_imul_15, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R(INS_imulEAX, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_mulEAX, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_div, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_idiv, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_R(INS_tzcnt_evex, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_lzcnt_evex, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_popcnt_evex, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_S(INS_tzcnt_evex, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_lzcnt_evex, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_popcnt_evex, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_R_R(INS_add, EA_2BYTE, REG_R12, REG_R13, REG_R11, + (insOpts)(INS_OPTS_EVEX_nf | INS_OPTS_EVEX_nd)); + + theEmitter->emitIns_R_R_R(INS_andn, EA_8BYTE, REG_R11, REG_R13, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R_R(INS_bextr, EA_8BYTE, REG_R11, REG_R13, REG_R11, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_R(INS_blsi, EA_8BYTE, REG_R11, 
REG_R13, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_blsmsk, EA_8BYTE, REG_R11, REG_R13, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_blsr, EA_8BYTE, REG_R11, 0, 1); } #endif // defined(DEBUG) && defined(TARGET_AMD64) @@ -11314,7 +11485,7 @@ void CodeGen::instGen_MemoryBarrier(BarrierKind barrierKind) if (barrierKind == BARRIER_FULL) { instGen(INS_lock); - GetEmitter()->emitIns_I_AR(INS_or, EA_4BYTE, 0, REG_SPBASE, 0); + GetEmitter()->emitIns_I_AR(INS_or_no_evex, EA_4BYTE, 0, REG_SPBASE, 0); } } diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index a3d854e17eff0..0de153935f77e 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -2299,6 +2299,7 @@ void Compiler::compSetProcessor() if (canUseApxEncoding()) { codeGen->GetEmitter()->SetUseRex2Encoding(true); + codeGen->GetEmitter()->SetUsePromotedEVEXEncoding(true); } } #endif // TARGET_XARCH diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 5160d287cb311..36bca23166a49 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -3942,7 +3942,7 @@ class Compiler // false: we can add new tracked variables. 
// true: We cannot add new 'tracked' variable - bool lvaTrackedFixed = false; + bool lvaTrackedFixed = false; unsigned lvaCount; // total number of locals, which includes function arguments, // special arguments, IL local variables, and JIT temporary variables @@ -6849,7 +6849,7 @@ class Compiler unsigned acdCount = 0; // Get the index to use as part of the AddCodeDsc key for sharing throw blocks - unsigned bbThrowIndex(BasicBlock* blk, AcdKeyDesignator* dsg); + unsigned bbThrowIndex(BasicBlock* blk, AcdKeyDesignator* dsg); struct AddCodeDscKey { @@ -6857,7 +6857,7 @@ class Compiler AddCodeDscKey(): acdKind(SCK_NONE), acdData(0) {} AddCodeDscKey(SpecialCodeKind kind, BasicBlock* block, Compiler* comp); AddCodeDscKey(AddCodeDsc* add); - + static bool Equals(const AddCodeDscKey& x, const AddCodeDscKey& y) { return (x.acdData == y.acdData) && (x.acdKind == y.acdKind); @@ -9992,13 +9992,30 @@ class Compiler // JitStressEvexEncoding- Answer the question: Is Evex stress knob set // // Returns: - // `true` if user requests REX2 encoding. + // `true` if user requests EVEX encoding. // bool JitStressEvexEncoding() const { #ifdef DEBUG return JitConfig.JitStressEvexEncoding() || JitConfig.JitStressRex2Encoding(); #endif // DEBUG + return false; + } + + //------------------------------------------------------------------------ + // DoJitStressPromotedEvexEncoding- Answer the question: Do we force promoted EVEX encoding. + // + // Returns: + // `true` if user requests promoted EVEX encoding. 
+ // + bool DoJitStressPromotedEvexEncoding() const + { +#ifdef DEBUG + if (JitConfig.JitStressPromotedEvexEncoding() && compOpportunisticallyDependsOn(InstructionSet_APX)) + { + return true; + } +#endif // DEBUG return false; } diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index dc0f977b60862..6c0429cd3a0bf 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -471,6 +471,7 @@ class emitter SetUseVEXEncoding(false); SetUseEvexEncoding(false); SetUseRex2Encoding(false); + SetUsePromotedEVEXEncoding(false); #endif // TARGET_XARCH emitDataSecCur = nullptr; @@ -793,8 +794,15 @@ class emitter // For normal and embedded broadcast intrinsics, EVEX.L'L has the same semantic, vector length. // For embedded rounding, EVEX.L'L semantic changes to indicate the rounding mode. // Multiple bits in _idEvexbContext are used to inform emitter to specially handle the EVEX.L'L bits. - unsigned _idEvexbContext : 2; -#endif // TARGET_XARCH + unsigned _idCustom5 : 1; + unsigned _idCustom6 : 1; + +#define _idEvexbContext \ + (_idCustom6 << 1) | _idCustom5 /* Evex.b: embedded broadcast, embedded rounding, embedded SAE \ + */ +#define _idEvexNdContext _idCustom5 /* bits used for the APX-EVEX.nd context for promoted legacy instructions */ +#define _idEvexNfContext _idCustom6 /* bits used for the APX-EVEX.nf context for promoted legacy/vex instructions */ +#endif // TARGET_XARCH #ifdef TARGET_ARM64 unsigned _idLclVar : 1; // access a local on stack @@ -1657,38 +1665,17 @@ class emitter #ifdef TARGET_XARCH bool idIsEvexbContextSet() const { - return _idEvexbContext != 0; + return idGetEvexbContext() != 0; } void idSetEvexbContext(insOpts instOptions) { assert(!idIsEvexbContextSet()); + assert(idGetEvexbContext() == 0); + unsigned value = static_cast(instOptions & INS_OPTS_EVEX_b_MASK); - switch (instOptions & INS_OPTS_EVEX_b_MASK) - { - case INS_OPTS_EVEX_eb_er_rd: - { - _idEvexbContext = 1; - break; - } - - case INS_OPTS_EVEX_er_ru: - { - _idEvexbContext = 2; - 
break; - } - - case INS_OPTS_EVEX_er_rz: - { - _idEvexbContext = 3; - break; - } - - default: - { - unreached(); - } - } + _idCustom5 = ((value >> 0) & 1); + _idCustom6 = ((value >> 1) & 1); } unsigned idGetEvexbContext() const @@ -1728,6 +1715,28 @@ class emitter assert(!idIsEvexZContextSet()); _idEvexZContext = 1; } + + bool idIsEvexNdContextSet() const + { + return _idEvexNdContext != 0; + } + + void idSetEvexNdContext() + { + assert(!idIsEvexNdContextSet()); + _idEvexNdContext = 1; + } + + bool idIsEvexNfContextSet() const + { + return _idEvexNfContext != 0; + } + + void idSetEvexNfContext() + { + assert(!idIsEvexNfContextSet()); + _idEvexNfContext = 1; + } #endif #ifdef TARGET_ARMARCH @@ -2531,7 +2540,12 @@ class emitter CORINFO_FIELD_HANDLE emitSimdMaskConst(simdmask_t constValue); #endif // FEATURE_MASKED_HW_INTRINSICS #endif // FEATURE_SIMD + +#if defined(TARGET_XARCH) + regNumber emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src, regNumber targetReg = REG_NA); +#else regNumber emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src); +#endif regNumber emitInsTernary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src1, GenTree* src2); void emitInsLoadInd(instruction ins, emitAttr attr, regNumber dstReg, GenTreeIndir* mem); void emitInsStoreInd(instruction ins, emitAttr attr, GenTreeStoreInd* mem); diff --git a/src/coreclr/jit/emitfmtsxarch.h b/src/coreclr/jit/emitfmtsxarch.h index f893fce8d07ee..a94a7c1b3e7d5 100644 --- a/src/coreclr/jit/emitfmtsxarch.h +++ b/src/coreclr/jit/emitfmtsxarch.h @@ -140,6 +140,7 @@ IF_DEF(RRW_RRW, IS_R1_RW|IS_R2_RW, NONE) // r/w IF_DEF(RRD_RRD_CNS, IS_R1_RD|IS_R2_RD, SCNS) // read reg1, read reg2, const IF_DEF(RWR_RRD_CNS, IS_R1_WR|IS_R2_RD, SCNS) // write reg1, read reg2, const IF_DEF(RRW_RRD_CNS, IS_R1_RW|IS_R2_RD, SCNS) // r/w reg1, read reg2, const +IF_DEF(RWR_RRD_SHF, IS_R1_WR|IS_R2_RD, SCNS) // write reg1, read reg2, shift IF_DEF(RRD_RRD_RRD, IS_R1_RD|IS_R2_RD|IS_R3_RD, NONE) 
// read reg1, read reg2, read reg3 IF_DEF(RWR_RRD_RRD, IS_R1_WR|IS_R2_RD|IS_R3_RD, NONE) // write reg1, read reg2, read reg3 diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 70f54f021c937..e1b352b39edc5 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -236,6 +236,18 @@ bool emitter::HasRex2Encoding(instruction ins) const return (flags & Encoding_REX2) != 0; } +bool emitter::HasApxNdd(instruction ins) const +{ + insFlags flags = CodeGenInterface::instInfo[ins]; + return (flags & INS_Flags_Has_NDD) != 0; +} + +bool emitter::HasApxNf(instruction ins) const +{ + insFlags flags = CodeGenInterface::instInfo[ins]; + return (flags & INS_Flags_Has_NF) != 0; +} + bool emitter::IsVexEncodableInstruction(instruction ins) const { if (!UseVEXEncoding()) @@ -286,6 +298,8 @@ bool emitter::IsEvexEncodableInstruction(instruction ins) const // bool emitter::IsRex2EncodableInstruction(instruction ins) const { + // TODO-Xarch-apx: we have special stress mode for REX2 on non-compatible machine, that will + // force UseRex2Encoding return true regardless of the CPUID results. if (!UseRex2Encoding()) { return false; @@ -293,6 +307,106 @@ bool emitter::IsRex2EncodableInstruction(instruction ins) const return HasRex2Encoding(ins); } +//------------------------------------------------------------------------ +// IsApxNDDEncodableInstruction: Answer the question- does this instruction have apx ndd form. +// +// Arguments: +// ins - The instruction to check. +// +// Returns: +// `true` if ins has apx ndd form. +// +bool emitter::IsApxNDDEncodableInstruction(instruction ins) const +{ + if (!UsePromotedEVEXEncoding()) + { + return false; + } + + return HasApxNdd(ins); +} + +//------------------------------------------------------------------------ +// IsApxNFEncodableInstruction: Answer the question - does this instruction have Evex.nf supported +// +// Arguments: +// ins - The instruction to check. 
+// +// Returns: +// `true` if ins is Evex.nf supported. +// +bool emitter::IsApxNFEncodableInstruction(instruction ins) const +{ + if (!UsePromotedEVEXEncoding()) + { + return false; + } + + return HasApxNf(ins); +} + +//------------------------------------------------------------------------ +// IsApxExtendedEvexInstruction: Answer the question - does this instruction have apx extended evex form. +// +// Arguments: +// ins - The instruction to check. +// +// Returns: +// `true` if ins has apx extended evex form. +// +bool emitter::IsApxExtendedEvexInstruction(instruction ins) const +{ + if (!UsePromotedEVEXEncoding()) + { + return false; + } + + return HasApxNdd(ins) || HasApxNf(ins); +} + +//------------------------------------------------------------------------ +// IsShiftInstruction: Answer the question- is this instruction a shift instruction. +// +// Arguments: +// ins - The instruction to check. +// +// Returns: +// `true` if ins is a shift instruction. +// +bool emitter::IsShiftInstruction(instruction ins) const +{ + switch (ins) + { + case INS_rcl_1: + case INS_rcr_1: + case INS_rol_1: + case INS_ror_1: + case INS_shl_1: + case INS_shr_1: + case INS_sar_1: + + case INS_rcl: + case INS_rcr: + case INS_rol: + case INS_ror: + case INS_shl: + case INS_shr: + case INS_sar: + + case INS_rcl_N: + case INS_rcr_N: + case INS_rol_N: + case INS_ror_N: + case INS_shl_N: + case INS_shr_N: + case INS_sar_N: + return true; + + default: + return false; + } +} + //------------------------------------------------------------------------ // IsLegacyMap1: Answer the question- Is this instruction on legacy-map-1 // @@ -324,7 +438,7 @@ bool emitter::IsLegacyMap1(code_t code) const if ((code & 0xFF00FF00) == 0x0F000000) { - // 4-byte, need to check if PP is a prefix. 
+ // 4-byte, need to check if PP is prefixs BYTE prefix = (BYTE)((code & 0xFF0000) >> 16); return ((prefix == 0xF2) || (prefix == 0xF3) || (prefix == 0x66)); } @@ -647,6 +761,24 @@ bool emitter::IsRexW1EvexInstruction(instruction ins) return false; } +//------------------------------------------------------------------------ +// DoJitUseApxNDD: Answer the question: does JIT use APX NDD feature on the given instruction? +// +// Arguments: +// ins - instruction to test +// +// Return Value: +// true if JIT allows APX NDD to be applied on the instructions. +// +bool emitter::DoJitUseApxNDD(instruction ins) const +{ +#if !defined(TARGET_AMD64) + return false; +#else + return JitConfig.EnableApxNDD() && IsApxNDDEncodableInstruction(ins); +#endif +} + #ifdef TARGET_64BIT //------------------------------------------------------------------------ // AreUpperBitsZero: check if some previously emitted @@ -1257,6 +1389,179 @@ insOpts emitter::GetEmbRoundingMode(uint8_t mode) const } } +//------------------------------------------------------------------------ +// emitHandleGCrefRegs: Update GC ref related registers' liveness. +// +// Arguments: +// dst - Destination buffer. +// id - instruction descriptor to the GC ref instruction. 
+// +void emitter::emitHandleGCrefRegs(BYTE* dst, instrDesc* id) +{ + regNumber reg1 = id->idReg1(); // dst and src1 + regNumber reg2 = id->idReg2(); // src2 + switch (id->idInsFmt()) + { + case IF_RRD_RRD: + break; + + case IF_RWR_RRD: + { + if (emitSyncThisObjReg != REG_NA && emitIGisInProlog(emitCurIG) && reg2 == (int)REG_ARG_0) + { + // We're relocating "this" in the prolog + assert(emitComp->lvaIsOriginalThisArg(0)); + assert(emitComp->lvaTable[0].lvRegister); + assert(emitComp->lvaTable[0].GetRegNum() == reg1); + + if (emitFullGCinfo) + { + emitGCregLiveSet(id->idGCref(), genRegMask(reg1), dst, true); + break; + } + else + { + /* If emitFullGCinfo==false, the we don't use any + regPtrDsc's and so explicitly note the location + of "this" in GCEncode.cpp + */ + } + } + + emitGCregLiveUpd(id->idGCref(), reg1, dst); + break; + } + + case IF_RRW_RRD: + case IF_RWR_RRD_RRD: + { + regNumber targetReg = reg1; // dst + + // if the instructions is encoded in NDD form, + // src registers will be the 2nd and 3rd register on id. + if (id->idInsFmt() == IF_RWR_RRD_RRD) + { + reg1 = id->idReg2(); // src1 + reg2 = id->idReg3(); // src2 + } + + switch (id->idIns()) + { + /* + This must be one of the following cases: + + xor reg, reg to assign NULL + + and r1 , r2 if (ptr1 && ptr2) ... + or r1 , r2 if (ptr1 || ptr2) ... + + add r1 , r2 to compute a normal byref + sub r1 , r2 to compute a strange byref (VC only) + + */ + case INS_xor: + assert(reg1 == reg2); + emitGCregLiveUpd(id->idGCref(), targetReg, dst); + break; + + case INS_or: + case INS_and: + emitGCregDeadUpd(targetReg, dst); + break; + + case INS_add: + case INS_sub: + case INS_sub_hide: + assert(id->idGCref() == GCT_BYREF); + +#if 0 +#ifdef DEBUG + // Due to elided register moves, we can't have the following assert. 
+ // For example, consider: + // t85 = LCL_VAR byref V01 arg1 rdx (last use) REG rdx + // /--* t85 byref + // * STORE_LCL_VAR byref V40 tmp31 rdx REG rdx + // Here, V01 is type `long` on entry, then is stored as a byref. But because + // the register allocator assigned the same register, no instruction was + // generated, and we only (currently) make gcref/byref changes in emitter GC info + // when an instruction is generated. We still generate correct GC info, as this + // instruction, if writing a GC ref even through reading a long, will go live here. + // These situations typically occur due to unsafe casting, such as with Span. + + regMaskTP regMask; + regMask = genRegMask(reg1) | genRegMask(reg2); + + // r1/r2 could have been a GCREF as GCREF + int=BYREF + // or BYREF+/-int=BYREF + assert(((regMask & emitThisGCrefRegs) && (ins == INS_add)) || + ((regMask & emitThisByrefRegs) && (ins == INS_add || ins == INS_sub || ins == INS_sub_hide))); +#endif // DEBUG +#endif // 0 + + // Mark r1 as holding a byref + emitGCregLiveUpd(GCT_BYREF, targetReg, dst); + break; + + default: +#ifdef DEBUG + emitDispIns(id, false, false, false); +#endif + assert(!"unexpected GC reg update instruction"); + } + + break; + } + + case IF_RRW_RRW: + { + // This must be "xchg reg1, reg2" + assert(id->idIns() == INS_xchg); + + // If we got here, the GC-ness of the registers doesn't match, so we have to "swap" them in the GC + // register pointer mask. 
+ + GCtype gc1, gc2; + + gc1 = emitRegGCtype(reg1); + gc2 = emitRegGCtype(reg2); + + if (gc1 != gc2) + { + // Kill the GC-info about the GC registers + + if (needsGC(gc1)) + { + emitGCregDeadUpd(reg1, dst); + } + + if (needsGC(gc2)) + { + emitGCregDeadUpd(reg2, dst); + } + + // Now, swap the info + + if (needsGC(gc1)) + { + emitGCregLiveUpd(gc1, reg2, dst); + } + + if (needsGC(gc2)) + { + emitGCregLiveUpd(gc2, reg1, dst); + } + } + break; + } + + default: +#ifdef DEBUG + emitDispIns(id, false, false, false); +#endif + assert(!"unexpected GC ref instruction format"); + } +} + //------------------------------------------------------------------------ // encodeRegAsIval: Encodes a register as an ival for use by a SIMD instruction // @@ -1343,9 +1648,23 @@ bool emitter::TakesEvexPrefix(const instrDesc* id) const return true; } + if (id->idIsEvexNfContextSet() && IsBMIInstruction(ins)) + { + // Only a few BMI instructions shall be promoted to APX-EVEX due to NF feature. + // TODO-XArch-APX: convert the check into forms like Has* as above. + return true; + } + #if defined(DEBUG) if (emitComp->DoJitStressEvexEncoding()) { + if (IsBMIInstruction(ins)) + { + // The Encoding_EVEX on some BMI instructions is tagged due to APX, + // they cannot be stressed with JitStressEvexEncoding. + return false; + } + // Requires the EVEX encoding due to STRESS mode and no change in semantics // // Some instructions, like VCMPEQW return the value in a SIMD register for @@ -1354,6 +1673,12 @@ bool emitter::TakesEvexPrefix(const instrDesc* id) const // check above so we need to still return false here to preserve semantics. return !HasKMaskRegisterDest(ins); } + + if (IsApxExtendedEvexInstruction(ins) && emitComp->DoJitStressPromotedEvexEncoding()) + { + // This path will be hit when we stress APX-EVEX and encode VEX with Extended EVEX. 
+ return (IsBMIInstruction(ins) && HasApxNf(ins)); + } #endif // DEBUG if ((ins == INS_pslldq) || (ins == INS_psrldq)) @@ -1408,6 +1733,52 @@ bool emitter::TakesRex2Prefix(const instrDesc* id) const return false; } +//------------------------------------------------------------------------ +// TakesApxExtendedEvexPrefix: Checks if the instruction should be legacy-promoted-EVEX encoded. +// +// Arguments: +// instruction -- processor instruction to check +// +// Return Value: +// true if this instruction requires a legacy-promoted-EVEX prefix. +// +bool emitter::TakesApxExtendedEvexPrefix(const instrDesc* id) const +{ + // TODO-XArch-APX: + // Isolating legacy-promoted-EVEX case out from VEX/EVEX-promoted-EVEX, + // as the latter ones are relatively simple, providing EGPRs functionality, + instruction ins = id->idIns(); + if (!IsApxExtendedEvexInstruction(ins)) + { + return false; + } + + if (IsAvx512OrPriorInstruction(ins)) + { + // This check should reject any instruction not from legacy map-0 or 1. + return false; + } + + if (id->idIsEvexNdContextSet()) + { + return true; + } + + if (id->idIsEvexNfContextSet()) + { + return true; + } + +#if defined(DEBUG) + if (emitComp->DoJitStressPromotedEvexEncoding()) + { + return true; + } +#endif // DEBUG + + return false; +} + // Intel AVX-512 encoding is defined in "Intel 64 and ia-32 architectures software developer's manual volume 2", Section // 2.6. 
 // Add base EVEX prefix without setting W, R, X, or B bits
@@ -1442,6 +1813,10 @@ bool emitter::TakesRex2Prefix(const instrDesc* id) const
 #define LPRIMEBIT_IN_BYTE_EVEX_PREFIX 0x0000004000000000ULL
 #define ZBIT_IN_BYTE_EVEX_PREFIX 0x0000008000000000ULL
 
+#define MAP4_IN_BYTE_EVEX_PREFIX 0x4000000000000ULL
+#define ND_BIT_IN_BYTE_EVEX_PREFIX 0x1000000000ULL
+#define NF_BIT_IN_BYTE_EVEX_PREFIX 0x400000000ULL
+#define EXTENDED_EVEX_PP_BITS 0x10000000000ULL
 //------------------------------------------------------------------------
 // AddEvexPrefix: Add default EVEX prefix with only LL' bits set.
 //
@@ -1451,12 +1826,23 @@ bool emitter::TakesRex2Prefix(const instrDesc* id) const
 //    attr -- operand size
 //
 // Return Value:
-//    encoded code with Evex prefix.
+//    encoded code with EVEX prefix.
 //
 emitter::code_t emitter::AddEvexPrefix(const instrDesc* id, code_t code, emitAttr attr)
 {
     // Only AVX512 instructions require EVEX prefix
-    assert(IsEvexEncodableInstruction(id->idIns()));
+    // After APX, some instructions in legacy or VEX space will be promoted to EVEX.
+    instruction ins = id->idIns();
+    assert(IsEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins));
+
+    if (instrIsExtendedReg3opImul(ins))
+    {
+        // the only case imul(0x68) will need EVEX prefix is when the EVEX.NF feature is enabled.
+        // the imul(0x68) opcode comes with a ModR/M.REG byte to indicate implicit register use;
+        // when it is using extended registers (>= REG_R8), it comes with a built-in REX prefix,
+        // so remove that first and add the counterpart in EVEX.
+        code &= 0xFFFFFFFF;
+    }
 
     // Shouldn't have already added EVEX prefix
     assert(!hasEvexPrefix(code));
@@ -1465,6 +1851,48 @@ emitter::code_t emitter::AddEvexPrefix(const instrDesc* id, code_t code, emitAtt
 
     code |= DEFAULT_BYTE_EVEX_PREFIX;
 
+    if (IsApxExtendedEvexInstruction(ins))
+    {
+        if (!HasEvexEncoding(ins))
+        {
+            // Legacy-promoted instructions are not labeled with Encoding_EVEX.
+ code |= MAP4_IN_BYTE_EVEX_PREFIX; + } + + // TODO-XArch-APX: + // verify if it is actually safe to reuse the EVEX.ND with EVEX.B on instrDesc. + if (id->idIsEvexNdContextSet()) + { + code |= ND_BIT_IN_BYTE_EVEX_PREFIX; + } + + if (id->idIsEvexNfContextSet()) + { + code |= NF_BIT_IN_BYTE_EVEX_PREFIX; + } + + if (attr == EA_2BYTE) + { + code |= EXTENDED_EVEX_PP_BITS; + } + + if (instrIsExtendedReg3opImul(ins)) + { + // EVEX.R3 + // TODO-XArch-APX: + // A few side notes: based on how JIT defined IMUL, we may need to extend + // the definition to `IMUL_31` to cover EGPRs. And it can be defined in a + // similar way that opcodes comes with built-in REX2 prefix, and convert + // it to EVEX when needed with some helper functions. + code &= 0xFF7FFFFFFFFFFFFFULL; + } + + return code; + } + + // No APX-NDD instructions should reach code below. + assert(!IsApxExtendedEvexInstruction(ins)); + if (attr == EA_32BYTE) { // Set EVEX.L'L bits to 01 in case of instructions that operate on 256-bits. @@ -2008,6 +2436,14 @@ emitter::code_t emitter::AddRexWPrefix(const instrDesc* id, code_t code) } } #ifdef TARGET_AMD64 + else if (TakesApxExtendedEvexPrefix(id)) + { + // If the instruction is not VEX/EVEX encodable, and has EVEX prefix, + // then it is legacy promoted EVEX. + assert(hasEvexPrefix(code)); + assert(IsApxExtendedEvexInstruction(ins)); + return emitter::code_t(code | 0x0000800000000000ULL); + } else if (hasRex2Prefix(code)) { return emitter::code_t(code | 0x000800000000ULL); @@ -2046,13 +2482,18 @@ emitter::code_t emitter::AddRexRPrefix(const instrDesc* id, code_t code) return code & 0xFF7FFFFFFFFFFFULL; } } -#ifdef TARGET_AMD64 + else if (TakesApxExtendedEvexPrefix(id)) + { + assert(hasEvexPrefix(code)); + assert(IsApxExtendedEvexInstruction(ins)); + // R-bit is added in bit-inverted form. 
+ return code & 0xFF7FFFFFFFFFFFFFULL; + } else if (TakesRex2Prefix(id)) { assert(IsRex2EncodableInstruction(ins)); return code |= 0xD50400000000ULL; // REX2.B3 } -#endif // TARGET_AMD64 return code | 0x4400000000ULL; } @@ -2082,13 +2523,18 @@ emitter::code_t emitter::AddRexXPrefix(const instrDesc* id, code_t code) return code & 0xFFBFFFFFFFFFFFULL; } } -#ifdef TARGET_AMD64 + else if (TakesApxExtendedEvexPrefix(id)) + { + assert(hasEvexPrefix(code)); + assert(IsApxExtendedEvexInstruction(ins)); + // X-bit is added in bit-inverted form. + return code & 0xFFBFFFFFFFFFFFFFULL; + } else if (TakesRex2Prefix(id)) { assert(IsRex2EncodableInstruction(ins)); return code |= 0xD50200000000ULL; // REX2.B3 } -#endif // TARGET_AMD64 return code | 0x4200000000ULL; } @@ -2118,13 +2564,17 @@ emitter::code_t emitter::AddRexBPrefix(const instrDesc* id, code_t code) return code & 0xFFDFFFFFFFFFFFULL; } } -#ifdef TARGET_AMD64 + else if (TakesApxExtendedEvexPrefix(id)) + { + assert(IsApxExtendedEvexInstruction(ins)); + // R-bit is added in bit-inverted form. + return code & 0xFFDFFFFFFFFFFFFFULL; + } else if (TakesRex2Prefix(id)) { assert(IsRex2EncodableInstruction(ins)); return code |= 0xD50100000000ULL; // REX2.B3 } -#endif // TARGET_AMD64 return code | 0x4100000000ULL; } @@ -2207,7 +2657,7 @@ bool isPrefix(BYTE b) // emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) const { - assert(IsEvexEncodableInstruction(ins)); + assert(IsEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)); code_t evexPrefix = (code >> 32) & 0xFFFFFFFF; code &= 0x00000000FFFFFFFFLL; @@ -2233,6 +2683,14 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co case 0x66: { // None of the existing BMI instructions should be EVEX encoded. + // After APX, BMI instructions can be EVEX encoded with NF feature. + if (IsBMIInstruction(ins)) + { + // if BMI instructions reaches this part, then it should be APX-EVEX. 
+ // although the opcode of all the BMI instructions are defined with 0x66, + // but it should not, skip this check. + break; + } assert(!IsBMIInstruction(ins)); evexPrefix |= (0x01 << 8); break; @@ -2298,6 +2756,12 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co case 0x0F: { + if (((evexPrefix >> 16) & 0x07) == 0x04) + { + // MAP index equal to 4 indicates this instruction is a promoted legacy instruction. + // the MAP ID has been set when EVEX prefix is added. + break; + } evexPrefix |= (0x01 << 16); break; } @@ -2758,6 +3222,11 @@ unsigned emitter::emitGetRexPrefixSize(instrDesc* id, instruction ins) return 0; } + if (TakesApxExtendedEvexPrefix(id)) + { + return 0; + } + if (TakesRex2Prefix(id)) { return 0; @@ -2868,10 +3337,20 @@ unsigned emitter::emitGetAdjustedSize(instrDesc* id, code_t code) const adjustedSize++; } #ifdef TARGET_AMD64 - else if (IsRex2EncodableInstruction(ins)) + else if (IsRex2EncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)) { unsigned prefixAdjustedSize = 0; - if (TakesRex2Prefix(id)) + if (TakesApxExtendedEvexPrefix(id)) + { + prefixAdjustedSize = 4; + // If the opcode will be prefixed by EVEX, then all the map-1-legacy instructions can remove the escape + // prefix + if (IsLegacyMap1(code)) + { + prefixAdjustedSize -= 1; + } + } + else if (TakesRex2Prefix(id)) { prefixAdjustedSize = 2; // If the opcode will be prefixed by REX2, then all the map-1-legacy instructions can remove the escape @@ -2882,15 +3361,14 @@ unsigned emitter::emitGetAdjustedSize(instrDesc* id, code_t code) const } } - adjustedSize = prefixAdjustedSize; - emitAttr attr = id->idOpSize(); - - if ((attr == EA_2BYTE) && (ins != INS_movzx) && (ins != INS_movsx)) + if ((attr == EA_2BYTE) && (ins != INS_movzx) && (ins != INS_movsx) && !TakesApxExtendedEvexPrefix(id)) { // Most 16-bit operand instructions will need a 0x66 prefix. 
- adjustedSize++; + prefixAdjustedSize++; } + + adjustedSize = prefixAdjustedSize; } #endif // TARGET_AMD64 else @@ -2942,6 +3420,14 @@ unsigned emitter::emitGetPrefixSize(instrDesc* id, code_t code, bool includeRexP if (includeRexPrefixSize && hasRexPrefix(code)) { + if (instrIsExtendedReg3opImul(id->idIns()) && TakesApxExtendedEvexPrefix(id)) + { + // there is a special case when calculating the size of IMUL with APX-EVEX, + // IMUL_08 or beyond will have a built-in REX prefix with its opcode, + // so it will hit this branch, but when IMUL is encoded with APX-EVEX, + // the size of REX is included in the prefix size, where should be calculated outside. + return 0; + } return 1; } @@ -3583,7 +4069,7 @@ inline unsigned emitter::insEncodeReg012(const instrDesc* id, regNumber reg, emi { // We are assuming that we only use/encode SPL, BPL, SIL and DIL // not the corresponding AH, CH, DH, or BH - *code = hasRex2Prefix(*code) ? *code : AddRexPrefix(ins, *code); // REX + *code = (hasRex2Prefix(*code) || hasEvexPrefix(*code)) ? *code : AddRexPrefix(ins, *code); // REX } #endif // TARGET_AMD64 @@ -3623,7 +4109,7 @@ inline unsigned emitter::insEncodeReg345(const instrDesc* id, regNumber reg, emi } if (false /*reg >= REG_R16 && reg <= REG_R31*/) { - // seperate the encoding for REX2.R3/R4, REX2.R3 will be handled in `AddRexRPrefix`. + // Seperate the encoding for REX2.R3/R4, REX2.R3 will be handled in `AddRexRPrefix`. assert(TakesRex2Prefix(id)); *code |= 0x004000000000ULL; // REX2.R4 } @@ -3632,7 +4118,7 @@ inline unsigned emitter::insEncodeReg345(const instrDesc* id, regNumber reg, emi { // We are assuming that we only use/encode SPL, BPL, SIL and DIL // not the corresponding AH, CH, DH, or BH - *code = hasRex2Prefix(*code) ? *code : AddRexPrefix(ins, *code); // REX + *code = (hasRex2Prefix(*code) || hasEvexPrefix(*code)) ? 
*code : AddRexPrefix(ins, *code); // REX } #endif // TARGET_AMD64 @@ -3652,7 +4138,7 @@ inline emitter::code_t emitter::insEncodeReg3456(const instrDesc* id, regNumber instruction ins = id->idIns(); assert(reg < REG_STK); - assert(IsVexOrEvexEncodableInstruction(ins)); + assert(IsVexOrEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)); assert(hasVexOrEvexPrefix(code)); // Get 4-bit register encoding @@ -3699,6 +4185,25 @@ inline emitter::code_t emitter::insEncodeReg3456(const instrDesc* id, regNumber return code ^ regBits; } } + else + { + assert(TakesApxExtendedEvexPrefix(id)); + assert(hasEvexPrefix(code)); +#if defined(TARGET_AMD64) + // TODO-XARCH-AVX512 I don't like that we redefine regBits on the EVEX case. + // Rather see these paths cleaned up. + regBits = HighAwareRegEncoding(reg); + + if (false /*reg >= REG_R16 && reg <= REG_R31*/) + { + // Have to set the EVEX V' bit + code = AddEvexVPrimePrefix(code); + } +#endif + // Shift count = 5-bytes of opcode + 0-2 bits for EVEX + regBits <<= 43; + return code ^ regBits; + } return code ^ regBits; } @@ -3734,7 +4239,7 @@ inline unsigned emitter::insEncodeRegSIB(const instrDesc* id, regNumber reg, cod } if (false /*reg >= REG_R16 && reg <= REG_R31*/) { - // seperate the encoding for REX2.X3/X4, REX2.X3 will be handled in `AddRexXPrefix`. + // Separate the encoding for REX2.X3/X4, REX2.X3 will be handled in `AddRexXPrefix`. assert(TakesRex2Prefix(id)); *code |= 0x002000000000ULL; // REX2.X4 } @@ -4130,7 +4635,9 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id) if ((code & 0xFF00) != 0) { - sz += IsAvx512OrPriorInstruction(ins) ? emitInsSize(id, code, includeRexPrefixSize) : 5; + sz += (IsAvx512OrPriorInstruction(ins) || TakesApxExtendedEvexPrefix(id)) + ? 
emitInsSize(id, code, includeRexPrefixSize) + : 5; } else { @@ -4258,7 +4765,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, assert(emitComp->lvaTempsHaveLargerOffsetThanVars()); // Check whether we can use compressed displacement if EVEX. - if (TakesEvexPrefix(id)) + if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) { bool compressedFitsInByte = false; TryEvexCompressDisp8Byte(id, ssize_t(offs), &compressedFitsInByte); @@ -4302,7 +4809,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, #endif // !FEATURE_FIXED_OUT_ARGS bool useSmallEncoding = false; - if (TakesEvexPrefix(id)) + if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) { TryEvexCompressDisp8Byte(id, ssize_t(offs), &useSmallEncoding); } @@ -4469,7 +4976,7 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code) } else { - if (TakesEvexPrefix(id)) + if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) { dsp = TryEvexCompressDisp8Byte(id, dsp, &dspInByte); } @@ -5414,17 +5921,37 @@ void emitter::emitInsStoreLcl(instruction ins, emitAttr attr, GenTreeLclVarCommo // attr - the instruction operand size // dst - the destination and first source operand // src - the second source operand +// targetReg - target register of this binary node (only used for APX-NDD form) // // Assumptions: // i) caller of this routine needs to call genConsumeReg() // ii) caller of this routine needs to call genProduceReg() -regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src) +regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src, regNumber targetReg) { // We can only have one memory operand and only src can be a constant operand // However, the handling for a given operand type (mem, cns, or other) is fairly // consistent regardless of whether they are src or dst. 
As such, we will find // the type of each operand and only check them against src/dst where relevant. + const bool useNDD = UsePromotedEVEXEncoding() && (targetReg != REG_NA); +#if !defined(TARGET_AMD64) + // APX does not support 32-bit system. + assert(!useNDD); +#else + if (useNDD) + { + assert(IsApxNDDEncodableInstruction(ins)); + // targetReg has to be an actual register if using NDD. + assert(targetReg < REG_STK); + // make sure target register is not either of the src registers. + assert(dst->isUsedFromReg()); + regNumber dstreg = dst->GetRegNum(); + regNumber srcreg = src->isUsedFromReg() ? src->GetRegNum() : REG_NA; + assert(targetReg != dstreg); + assert(targetReg != srcreg); + } +#endif + GenTree* memOp = nullptr; GenTree* cnsOp = nullptr; GenTree* otherOp = nullptr; @@ -5436,6 +5963,9 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G assert(dst->isUsedFromMemory() || (dst->GetRegNum() == REG_NA) || instrIs3opImul(ins)); assert(!src->isUsedFromMemory()); + // APX code cannot hit this path. + assert(!useNDD); + memOp = dst; if (src->isContained()) @@ -5543,6 +6073,9 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G assert(otherOp == nullptr); assert(src->IsCnsIntOrI()); + // APX code cannot hit this path. + assert(!useNDD); + id = emitNewInstrAmdCns(attr, memIndir->Offset(), (int)src->AsIntConCommon()->IconValue()); } else @@ -5560,6 +6093,13 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G assert(id != nullptr); id->idIns(ins); // Set the instruction. + if (useNDD) + { + assert(memOp == src); + id->idReg1(targetReg); + id->idReg2(dst->GetRegNum()); + id->idSetEvexNdContext(); + } // Determine the instruction format insFormat fmt = IF_NONE; @@ -5575,12 +6115,13 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G } else { - fmt = emitInsModeFormat(ins, IF_RRD_ARD); + fmt = useNDD ? 
emitInsModeFormat(ins, IF_RWR_RRD_ARD) : emitInsModeFormat(ins, IF_RRD_ARD); } } else { assert(memOp == dst); + assert(!useNDD); if (cnsOp != nullptr) { @@ -5619,6 +6160,7 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G else { assert(memOp == dst); + assert(!useNDD); if (cnsOp != nullptr) { @@ -5641,7 +6183,7 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G dispIns(id); emitCurIGsize += sz; - return (memOp == src) ? dst->GetRegNum() : REG_NA; + return (memOp == src) ? (useNDD ? targetReg : dst->GetRegNum()) : REG_NA; } } } @@ -5689,15 +6231,24 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G } else { - // src is a stack based local variable - // dst is a register - emitIns_R_S(ins, attr, dst->GetRegNum(), varNum, offset); + if (useNDD) + { + emitIns_R_R_S(ins, attr, targetReg, dst->GetRegNum(), varNum, offset, INS_OPTS_EVEX_nd); + return targetReg; + } + else + { + // src is a stack based local variable + // dst is a register + emitIns_R_S(ins, attr, dst->GetRegNum(), varNum, offset); + } } } else { assert(memOp == dst); assert((dst->GetRegNum() == REG_NA) || dst->IsRegOptional()); + assert(!useNDD); if (cnsOp != nullptr) { @@ -5729,10 +6280,20 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G { assert(!dst->isContained()); GenTreeIntConCommon* intCns = src->AsIntConCommon(); - emitIns_R_I(ins, attr, dst->GetRegNum(), intCns->IconValue()); + + if (useNDD) + { + emitIns_R_R_I(ins, attr, targetReg, dst->GetRegNum(), (int)intCns->IconValue(), INS_OPTS_EVEX_nd); + return targetReg; + } + else + { + emitIns_R_I(ins, attr, dst->GetRegNum(), intCns->IconValue()); + } } else { + assert(!useNDD); assert(src->IsCnsFltOrDbl()); GenTreeDblCon* dblCns = src->AsDblCon(); @@ -5751,7 +6312,15 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G } else { - emitIns_R_R(ins, attr, dst->GetRegNum(), 
src->GetRegNum()); + if (useNDD) + { + emitIns_R_R_R(ins, attr, targetReg, dst->GetRegNum(), src->GetRegNum(), INS_OPTS_EVEX_nd); + return targetReg; + } + else + { + emitIns_R_R(ins, attr, dst->GetRegNum(), src->GetRegNum()); + } } } @@ -5902,7 +6471,7 @@ void emitter::emitInsRMW(instruction ins, emitAttr attr, GenTreeStoreInd* storeI * Add an instruction referencing a single register. */ -void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg) +void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg, insOpts instOptions /* = INS_OPTS_NONE */) { emitAttr size = EA_SIZE(attr); @@ -5978,6 +6547,8 @@ void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg) id->idInsFmt(fmt); id->idReg1(reg); + SetEvexNfIfNeeded(id, instOptions); + // Vex bytes sz += emitGetAdjustedSize(id, insEncodeMRreg(id, reg, attr, insCodeMR(ins))); @@ -6050,10 +6621,11 @@ void emitter::emitStoreSimd12ToLclOffset(unsigned varNum, unsigned offset, regNu * Add an instruction referencing a register and a constant. */ -void emitter::emitIns_R_I(instruction ins, - emitAttr attr, - regNumber reg, - ssize_t val DEBUGARG(size_t targetHandle) DEBUGARG(GenTreeFlags gtFlags)) +void emitter::emitIns_R_I(instruction ins, + emitAttr attr, + regNumber reg, + ssize_t val, + insOpts instOptions DEBUGARG(size_t targetHandle) DEBUGARG(GenTreeFlags gtFlags)) { emitAttr size = EA_SIZE(attr); @@ -6193,6 +6765,8 @@ void emitter::emitIns_R_I(instruction ins, id->idDebugOnlyInfo()->idMemCookie = targetHandle; #endif + SetEvexNfIfNeeded(id, instOptions); + if (isSimdInsAndValInByte) { bool includeRexPrefixSize = true; @@ -6206,8 +6780,14 @@ void emitter::emitIns_R_I(instruction ins, sz += emitInsSize(id, insCodeMI(ins), includeRexPrefixSize); } - sz += emitGetAdjustedSize(id, insCodeMI(ins)); +#ifdef TARGET_AMD64 + if (reg == REG_EAX && !instrIs3opImul(ins) && TakesApxExtendedEvexPrefix(id)) + { + // ACC form is not promoted into EVEX space, need to emit with MI form. 
+
+        sz += 1;
+    }
+#endif // TARGET_AMD64
 
 // Do we need a REX prefix for AMD64? We need one if we are using any extended register (REX.R), or if we have a
 // 64-bit sized operand (REX.W). Note that IMUL in our encoding is special, with a "built-in", implicit, target
@@ -6981,6 +7561,14 @@ void emitter::emitIns_R_R(instruction ins, emitAttr attr, regNumber reg1, regNum
     id->idReg1(reg1);
     id->idReg2(reg2);
 
+    SetEvexNdIfNeeded(id, instOptions);
+    SetEvexNfIfNeeded(id, instOptions);
+
+    if (id->idIsEvexNdContextSet() && IsApxNDDEncodableInstruction(ins))
+    {
+        id->idInsFmt(IF_RWR_RRD);
+    }
+
     if ((instOptions & INS_OPTS_EVEX_b_MASK) != INS_OPTS_NONE)
     {
         // if EVEX.b needs to be set in this path, then it should be embedded rounding.
@@ -7034,6 +7622,30 @@
     assert((instOptions & INS_OPTS_EVEX_b_MASK) == 0);
     SetEvexEmbMaskIfNeeded(id, instOptions);
+    SetEvexNdIfNeeded(id, instOptions);
+
+    if (id->idIsEvexNdContextSet() && IsApxNDDEncodableInstruction(ins))
+    {
+        // need to fix the instruction opcode for legacy instructions, as they have a different opcode for the RI form.
+        code = insCodeMI(ins);
+        // need to fix the instruction format for NDD legacy instructions.
+ insFormat fmt; + switch (ins) + { + case INS_shl_N: + case INS_shr_N: + case INS_sar_N: + case INS_ror_N: + case INS_rol_N: + fmt = IF_RWR_RRD_SHF; + break; + + default: + fmt = IF_RWR_RRD_CNS; + break; + } + id->idInsFmt(fmt); + } UNATIVE_OFFSET sz = emitInsSizeRR(id, code, ival); id->idCodeSize(sz); @@ -7045,7 +7657,7 @@ void emitter::emitIns_R_R_I( void emitter::emitIns_AR(instruction ins, emitAttr attr, regNumber base, int offs) { assert(ins == INS_prefetcht0 || ins == INS_prefetcht1 || ins == INS_prefetcht2 || ins == INS_prefetchnta || - ins == INS_inc || ins == INS_dec); + ins == INS_inc || ins == INS_dec || ins == INS_inc_no_evex || ins == INS_dec_no_evex); instrDesc* id = emitNewInstrAmd(attr, offs); @@ -7398,8 +8010,8 @@ void emitter::emitIns_R_R_C(instruction ins, void emitter::emitIns_R_R_R( instruction ins, emitAttr attr, regNumber targetReg, regNumber reg1, regNumber reg2, insOpts instOptions) { - assert(IsAvx512OrPriorInstruction(ins)); - assert(IsThreeOperandAVXInstruction(ins) || IsKInstruction(ins)); + assert(IsAvx512OrPriorInstruction(ins) || IsApxExtendedEvexInstruction(ins)); + assert(IsThreeOperandAVXInstruction(ins) || IsKInstruction(ins) || IsApxExtendedEvexInstruction(ins)); instrDesc* id = emitNewInstr(attr); id->idIns(ins); @@ -7415,6 +8027,14 @@ void emitter::emitIns_R_R_R( id->idSetEvexbContext(instOptions); } SetEvexEmbMaskIfNeeded(id, instOptions); + SetEvexNdIfNeeded(id, instOptions); + SetEvexNfIfNeeded(id, instOptions); + + if (id->idIsEvexNdContextSet() && IsApxNDDEncodableInstruction(ins)) + { + // need to fix the instructions format for NDD legacy instructions. 
+        id->idInsFmt(IF_RWR_RRD_RRD);
+    }
 
     UNATIVE_OFFSET sz = emitInsSizeRR(id, insCodeRM(ins));
     id->idCodeSize(sz);
@@ -7426,8 +8046,8 @@ void emitter::emitIns_R_R_R(
 void emitter::emitIns_R_R_S(
     instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs, insOpts instOptions)
 {
-    assert(IsAvx512OrPriorInstruction(ins));
-    assert(IsThreeOperandAVXInstruction(ins));
+    assert(IsAvx512OrPriorInstruction(ins) || IsApxExtendedEvexInstruction(ins));
+    assert(IsThreeOperandAVXInstruction(ins) || IsApxExtendedEvexInstruction(ins));
 
     instrDesc* id = emitNewInstr(attr);
 
@@ -7439,6 +8059,12 @@ void emitter::emitIns_R_R_S(
     SetEvexBroadcastIfNeeded(id, instOptions);
     SetEvexEmbMaskIfNeeded(id, instOptions);
+    SetEvexNdIfNeeded(id, instOptions);
+
+    if (id->idIsEvexNdContextSet() && IsApxNDDEncodableInstruction(ins))
+    {
+        id->idInsFmt(IF_RWR_RRD_SRD);
+    }
 
 #ifdef DEBUG
     id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs;
@@ -9689,6 +10315,7 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber ireg, int va
 
     SetEvexBroadcastIfNeeded(id, instOptions);
     SetEvexEmbMaskIfNeeded(id, instOptions);
+    SetEvexNfIfNeeded(id, instOptions);
 
     UNATIVE_OFFSET sz = emitInsSizeSV(id, insCodeRM(ins), varx, offs);
     id->idCodeSize(sz);
@@ -11393,6 +12020,13 @@ void emitter::emitDispEmbRounding(instrDesc* id) const
     {
         return;
     }
+
+    if (IsApxExtendedEvexInstruction(id->idIns()))
+    {
+        // APX-EVEX.nd shares the same bit(s) with EVEX.b;
+        // for the NDD case, we don't need to display anything special.
+        return;
+    }
     assert(!id->idHasMem());
     unsigned roundingMode = id->idGetEvexbContext();
     if (roundingMode == 1)
@@ -11573,6 +12207,14 @@ void emitter::emitDispIns(
 
     /* Display the instruction name */
 
+#ifdef TARGET_AMD64
+    if (IsApxNFEncodableInstruction(id->idIns()) && id->idIsEvexNfContextSet())
+    {
+        // print the EVEX.NF indication in pseudo-prefix style.
+ printf("{nf} "); + } +#endif // TARGET_AMD64 + sstr = codeGen->genInsDisplayName(id); printf(" %-9s", sstr); @@ -12323,6 +12965,20 @@ void emitter::emitDispIns( break; } + case INS_rol: + case INS_ror: + case INS_rcl: + case INS_rcr: + case INS_shl: + case INS_shr: + case INS_sar: + { + printf("%s", emitRegName(id->idReg1(), attr)); + printf(", %s", emitRegName(id->idReg2(), attr)); + emitDispShift(ins, (BYTE)0); + break; + } + default: { printf("%s", emitRegName(id->idReg1(), attr)); @@ -12340,8 +12996,8 @@ void emitter::emitDispIns( case IF_RRW_RRD_RRD: case IF_RWR_RWR_RRD: { - assert(IsVexOrEvexEncodableInstruction(ins)); - assert(IsThreeOperandAVXInstruction(ins) || IsKInstruction(ins)); + assert(IsVexOrEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)); + assert(IsThreeOperandAVXInstruction(ins) || IsKInstruction(ins) || IsApxExtendedEvexInstruction(ins)); regNumber reg2 = id->idReg2(); regNumber reg3 = id->idReg3(); @@ -12568,6 +13224,19 @@ void emitter::emitDispIns( break; } + case IF_RWR_RRD_SHF: + { + assert(IsApxExtendedEvexInstruction(id->idIns())); + printf("%s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), attr)); + + emitGetInsCns(id, &cnsVal); + val = cnsVal.cnsVal; + + emitDispShift(ins, (BYTE)val); + + break; + } + case IF_RRD_MRD: case IF_RWR_MRD: case IF_RRW_MRD: @@ -13516,12 +14185,21 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) break; case EA_2BYTE: - - /* Output a size prefix for a 16-bit operand */ - - dst += emitOutputByte(dst, 0x66); - + { + // Output a size prefix for a 16-bit operand + if (TakesApxExtendedEvexPrefix(id)) + { + assert(IsApxExtendedEvexInstruction(ins)); + assert(hasEvexPrefix(code)); + // Evex.pp should already be added when adding the prefix. 
+ assert((code & EXTENDED_EVEX_PP_BITS) != 0); + } + else + { + dst += emitOutputByte(dst, 0x66); + } FALLTHROUGH; + } case EA_4BYTE: #ifdef TARGET_AMD64 @@ -13565,7 +14243,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } else { - if (TakesEvexPrefix(id)) + if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) { dsp = TryEvexCompressDisp8Byte(id, dsp, &dspInByte); } @@ -14103,6 +14781,14 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) emitGCregLiveUpd(id->idGCref(), id->idReg1(), dst); break; + case IF_RWR_RRD_ARD: + assert(((id->idGCref() == GCT_BYREF) && + (ins == INS_add || ins == INS_sub || ins == INS_sub_hide || insIsCMOV(ins))) || + ((id->idGCref() == GCT_GCREF) && insIsCMOV(ins))); + assert(id->idIsEvexNdContextSet()); + emitGCregLiveUpd(id->idGCref(), id->idReg1(), dst); + break; + case IF_ARD_RRD: case IF_AWR_RRD: break; @@ -14349,25 +15035,45 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) switch (size) { case EA_1BYTE: +#ifdef TARGET_AMD64 + assert((ins != INS_lzcnt_evex) && (ins != INS_tzcnt_evex) && (ins != INS_popcnt_evex)); +#endif // TARGET_AMD64 break; case EA_2BYTE: // Output a size prefix for a 16-bit operand - dst += emitOutputByte(dst, 0x66); + { + if (!TakesApxExtendedEvexPrefix(id)) + { + dst += emitOutputByte(dst, 0x66); + } + } FALLTHROUGH; case EA_4BYTE: + code |= 0x01; + break; + #ifdef TARGET_AMD64 case EA_8BYTE: -#endif // TARGET_AMD64 - /* Set the 'w' size bit to indicate 32-bit operation * Note that incrementing "code" for INS_call (0xFF) would * overflow, whereas setting the lower bit to 1 just works out */ - - code |= 0x01; - break; + { + if (TakesApxExtendedEvexPrefix(id)) + { + assert(hasEvexPrefix(code)); + code = AddRexWPrefix(id, code); + } + if ((ins != INS_lzcnt_evex) && (ins != INS_tzcnt_evex) && (ins != INS_popcnt_evex)) + // These instructions do not support 1-byte inputs and the opcode is exact. 
+ { + code |= 0x01; + } + break; + } +#endif // TARGET_AMD64 #ifdef TARGET_X86 case EA_8BYTE: @@ -14401,7 +15107,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // function, to which the remainder of the emitter logic should handle properly. // TODO-XARCH-AVX512 : embedded broadcast might change this int dspAsByte = dsp; - if (TakesEvexPrefix(id)) + if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) { dspAsByte = int(TryEvexCompressDisp8Byte(id, ssize_t(dsp), &dspInByte)); } @@ -14455,7 +15161,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // TODO-XARCH-AVX512 : working to wrap up all adjusted disp8 compression logic into the following // function, to which the remainder of the emitter logic should handle properly. // TODO-XARCH-AVX512 : embedded broadcast might change this - if (TakesEvexPrefix(id)) + if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) { dspAsByte = int(TryEvexCompressDisp8Byte(id, ssize_t(dsp), &dspInByte)); } @@ -14602,6 +15308,15 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) emitGCregLiveUpd(id->idGCref(), id->idReg1(), dst); break; + case IF_RWR_RRD_SRD: // Register Read/Write, Stack Read (So we need to update GC live for register) + + // reg could have been a GCREF as GCREF + int=BYREF + // or BYREF+/-int=BYREF + assert(id->idGCref() == GCT_BYREF && (ins == INS_add || ins == INS_sub || ins == INS_sub_hide)); + assert(id->idIsEvexNdContextSet()); + emitGCregLiveUpd(id->idGCref(), id->idReg1(), dst); + break; + case IF_SRW_CNS: case IF_SRW_RRD: case IF_SRW_RRW: @@ -15187,7 +15902,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) // Can't use the compact form, use the long form ins = (instruction)(ins + 1); - if (size == EA_2BYTE) + if (size == EA_2BYTE && !TakesApxExtendedEvexPrefix(id)) { // Output a size prefix for a 16-bit operand dst += emitOutputByte(dst, 0x66); @@ -15200,10 +15915,7 @@ BYTE* 
emitter::emitOutputR(BYTE* dst, instrDesc* id) code |= 0x1; } - if (TakesRex2Prefix(id)) - { - code = AddRex2Prefix(ins, code); - } + code = AddX86PrefixIfNeeded(id, code, size); if (TakesRexWPrefix(id)) { @@ -15338,23 +16050,22 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) default: assert(id->idGCref() == GCT_NONE); - - code = insEncodeMRreg(id, reg, size, insCodeMR(ins)); + code = insCodeMR(ins); + code = AddX86PrefixIfNeeded(id, code, size); + code = insEncodeMRreg(id, reg, size, code); if (size != EA_1BYTE) { // Set the 'w' bit to get the large version code |= 0x1; - if (size == EA_2BYTE) + if (size == EA_2BYTE && !TakesApxExtendedEvexPrefix(id)) { // Output a size prefix for a 16-bit operand dst += emitOutputByte(dst, 0x66); } } - code = AddX86PrefixIfNeeded(id, code, size); - if (TakesRexWPrefix(id)) { code = AddRexWPrefix(id, code); @@ -15491,7 +16202,11 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) } #ifdef FEATURE_HW_INTRINSICS else if ((ins == INS_bsf) || (ins == INS_bsr) || (ins == INS_crc32) || (ins == INS_lzcnt) || (ins == INS_popcnt) || - (ins == INS_tzcnt)) + (ins == INS_tzcnt) +#ifdef TARGET_AMD64 + || (ins == INS_lzcnt_evex) || (ins == INS_tzcnt_evex) || (ins == INS_popcnt_evex) +#endif // TARGET_AMD64 + ) { assert(hasCodeRM(ins) && !hasCodeMI(ins) && !hasCodeMR(ins)); code = insCodeRM(ins); @@ -15502,7 +16217,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) code |= 0x0100; } - if (size == EA_2BYTE) + if (size == EA_2BYTE && !TakesApxExtendedEvexPrefix(id)) { assert(ins == INS_crc32); dst += emitOutputByte(dst, 0x66); @@ -15515,15 +16230,21 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) #endif // FEATURE_HW_INTRINSICS else { + // TODO-XArch-APX: + // some instructions with NDD form might go into this path with EVEX prefix. 
+ // might consider having a separate path with checks like: TakesApxExtendedEvexPrefix + // essentially, we need to make it clear on the priority and necessity of REX2 and EVEX: + // REX2 is needed iff EGPRs are involved. + // EVEX is needed when NDD, NF or other features are involved. + // So the logic should be: + // checking if those new features are used, then check if EGPRs are involved. + // EGPRs will be supported by EVEX anyway, so don't need to check in the first place. assert(!TakesSimdPrefix(id)); code = insCodeMR(ins); - if (TakesRex2Prefix(id)) - { - code = AddRex2Prefix(ins, code); - } + code = AddX86PrefixIfNeeded(id, code, size); code = insEncodeMRreg(id, code); - if (ins != INS_test) + if (ins != INS_test && !IsShiftInstruction(ins)) { code |= 2; } @@ -15537,7 +16258,17 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) case EA_2BYTE: // Output a size prefix for a 16-bit operand - dst += emitOutputByte(dst, 0x66); + if (TakesApxExtendedEvexPrefix(id)) + { + assert(IsApxExtendedEvexInstruction(ins)); + assert(hasEvexPrefix(code)); + // Evex.pp should already be added when adding the prefix. 
+ assert((code & EXTENDED_EVEX_PP_BITS) != 0); + } + else + { + dst += emitOutputByte(dst, 0x66); + } FALLTHROUGH; case EA_4BYTE: @@ -15588,8 +16319,18 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) } } - unsigned regCode = insEncodeReg345(id, regFor345Bits, size, &code); - regCode |= insEncodeReg012(id, regFor012Bits, size, &code); + unsigned regCode; + if (!id->idIsEvexNdContextSet() || !IsApxNDDEncodableInstruction(ins)) + { + regCode = insEncodeReg345(id, regFor345Bits, size, &code); + regCode |= insEncodeReg012(id, regFor012Bits, size, &code); + } + else + { + // unary ins with NDD form use Evex.vvvvv for dst, and ModRM.rm for src + code = insEncodeReg3456(id, reg1, size, code); + regCode = insEncodeReg012(id, reg2, size, &code); + } if (TakesSimdPrefix(id)) { @@ -15647,6 +16388,11 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) dst += emitOutputByte(dst, (code >> 8) & 0xFF); dst += emitOutputByte(dst, (0xC0 | regCode)); } + else if (IsApxNDDEncodableInstruction(ins) && id->idIsEvexNdContextSet()) + { + dst += emitOutputByte(dst, (code & 0xFF)); + dst += emitOutputByte(dst, (0xC0 | regCode | (code >> 8))); + } else { dst += emitOutputWord(dst, code); @@ -15656,155 +16402,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) // Does this instruction operate on a GC ref value? 
if (id->idGCref()) { - switch (id->idInsFmt()) - { - case IF_RRD_RRD: - break; - - case IF_RWR_RRD: - { - if (emitSyncThisObjReg != REG_NA && emitIGisInProlog(emitCurIG) && reg2 == (int)REG_ARG_0) - { - // We're relocating "this" in the prolog - assert(emitComp->lvaIsOriginalThisArg(0)); - assert(emitComp->lvaTable[0].lvRegister); - assert(emitComp->lvaTable[0].GetRegNum() == reg1); - - if (emitFullGCinfo) - { - emitGCregLiveSet(id->idGCref(), genRegMask(reg1), dst, true); - break; - } - else - { - /* If emitFullGCinfo==false, the we don't use any - regPtrDsc's and so explicitly note the location - of "this" in GCEncode.cpp - */ - } - } - - emitGCregLiveUpd(id->idGCref(), reg1, dst); - break; - } - - case IF_RRW_RRD: - { - switch (id->idIns()) - { - /* - This must be one of the following cases: - - xor reg, reg to assign NULL - - and r1 , r2 if (ptr1 && ptr2) ... - or r1 , r2 if (ptr1 || ptr2) ... - - add r1 , r2 to compute a normal byref - sub r1 , r2 to compute a strange byref (VC only) - - */ - case INS_xor: - assert(reg1 == reg2); - emitGCregLiveUpd(id->idGCref(), reg1, dst); - break; - - case INS_or: - case INS_and: - emitGCregDeadUpd(reg1, dst); - break; - - case INS_add: - case INS_sub: - case INS_sub_hide: - assert(id->idGCref() == GCT_BYREF); - -#if 0 -#ifdef DEBUG - // Due to elided register moves, we can't have the following assert. - // For example, consider: - // t85 = LCL_VAR byref V01 arg1 rdx (last use) REG rdx - // /--* t85 byref - // * STORE_LCL_VAR byref V40 tmp31 rdx REG rdx - // Here, V01 is type `long` on entry, then is stored as a byref. But because - // the register allocator assigned the same register, no instruction was - // generated, and we only (currently) make gcref/byref changes in emitter GC info - // when an instruction is generated. We still generate correct GC info, as this - // instruction, if writing a GC ref even through reading a long, will go live here. 
- // These situations typically occur due to unsafe casting, such as with Span. - - regMaskTP regMask; - regMask = genRegMask(reg1) | genRegMask(reg2); - - // r1/r2 could have been a GCREF as GCREF + int=BYREF - // or BYREF+/-int=BYREF - assert(((regMask & emitThisGCrefRegs) && (ins == INS_add)) || - ((regMask & emitThisByrefRegs) && (ins == INS_add || ins == INS_sub || ins == INS_sub_hide))); -#endif // DEBUG -#endif // 0 - - // Mark r1 as holding a byref - emitGCregLiveUpd(GCT_BYREF, reg1, dst); - break; - - default: -#ifdef DEBUG - emitDispIns(id, false, false, false); -#endif - assert(!"unexpected GC reg update instruction"); - } - - break; - } - - case IF_RRW_RRW: - { - // This must be "xchg reg1, reg2" - assert(id->idIns() == INS_xchg); - - // If we got here, the GC-ness of the registers doesn't match, so we have to "swap" them in the GC - // register pointer mask. - - GCtype gc1, gc2; - - gc1 = emitRegGCtype(reg1); - gc2 = emitRegGCtype(reg2); - - if (gc1 != gc2) - { - // Kill the GC-info about the GC registers - - if (needsGC(gc1)) - { - emitGCregDeadUpd(reg1, dst); - } - - if (needsGC(gc2)) - { - emitGCregDeadUpd(reg2, dst); - } - - // Now, swap the info - - if (needsGC(gc1)) - { - emitGCregLiveUpd(gc1, reg2, dst); - } - - if (needsGC(gc2)) - { - emitGCregLiveUpd(gc2, reg1, dst); - } - } - break; - } - - default: -#ifdef DEBUG - emitDispIns(id, false, false, false); -#endif - assert(!"unexpected GC ref instruction format"); - } + emitHandleGCrefRegs(dst, id); } else { @@ -15849,8 +16447,9 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) code_t code; instruction ins = id->idIns(); - assert(IsVexOrEvexEncodableInstruction(ins)); - assert(IsThreeOperandAVXInstruction(ins) || isAvxBlendv(ins) || isAvx512Blendv(ins) || IsKInstruction(ins)); + assert(IsVexOrEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)); + assert(IsThreeOperandAVXInstruction(ins) || isAvxBlendv(ins) || isAvx512Blendv(ins) || IsKInstruction(ins) || + 
IsApxExtendedEvexInstruction(ins)); regNumber targetReg = id->idReg1(); regNumber src1 = id->idReg2(); regNumber src2 = id->idReg3(); @@ -15859,6 +16458,51 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) code = insCodeRM(ins); code = AddX86PrefixIfNeeded(id, code, size); + if (IsApxExtendedEvexInstruction(ins) && !IsBMIInstruction(ins)) + { + // TODO-XArch-apx: + // For rm-like operand encoding instructions: + // legacy promoted EVEX encoding has introduced different semantic: + // op1 - vvvvv + // op2 - MODRM.REG + // op3 - MODRM.R/M + regNumber tmp = src1; + src1 = targetReg; + targetReg = tmp; + + switch (size) + { + case EA_1BYTE: + // TODO-APX : verify We should never end up here. Atleast for instructions I have looked into, we + // promote to int to do operation + noway_assert(RBM_BYTE_REGS & genRegMask(src1)); + noway_assert(RBM_BYTE_REGS & genRegMask(src2)); + noway_assert(RBM_BYTE_REGS & genRegMask(targetReg)); + break; + + case EA_2BYTE: + case EA_4BYTE: + // Set the 'w' bit to get the large version + code = insIsCMOV(ins) ? code : (code | (0x01)); + break; + +#ifdef TARGET_AMD64 + case EA_8BYTE: + // TODO-AMD64-CQ: Better way to not emit REX.W when we don't need it + // Don't need to zero out the high bits explicitly + code = AddRexWPrefix(id, code); // TODO-APX : Revisit. does xor or other cases need to be handled + // differently? see emitOutputRR + // Set the 'w' bit to get the large version + code = insIsCMOV(ins) ? 
code : (code | (0x01)); + break; + +#endif // TARGET_AMD64 + + default: + assert(!"unexpected size"); + } + } + code = insEncodeRMreg(id, code); if (TakesRexWPrefix(id)) @@ -15906,7 +16550,10 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) dst += emitOutputByte(dst, (0xC0 | regCode)); } - noway_assert(!id->idGCref()); + if (id->idGCref()) + { + emitHandleGCrefRegs(dst, id); + } if (!emitInsCanOnlyWriteSSE2OrAVXReg(id)) { @@ -16089,6 +16736,12 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) useACC = true; } } + + if (TakesApxExtendedEvexPrefix(id)) + { + // ACC form does not have support for promoted EVEX. + useACC = false; + } } else { @@ -16144,7 +16797,10 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) case EA_2BYTE: // Output a size prefix for a 16-bit operand - dst += emitOutputByte(dst, 0x66); + if (!TakesApxExtendedEvexPrefix(id)) + { + dst += emitOutputByte(dst, 0x66); + } FALLTHROUGH; case EA_4BYTE: @@ -16822,7 +17478,23 @@ ssize_t emitter::GetInputSizeInBytes(instrDesc* id) const // ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspInByte) { - assert(TakesEvexPrefix(id)); + assert(TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)); + + if (!hasTupleTypeInfo(id->idIns())) + { + // After APX, some instructions with APX features will be promoted + // to APX-EVEX, we will re-use the existing displacement emitting + // path, but for those instructions with no tuple information, + // APX-EVEX treat the scaling factor to be 1 constantly. + instruction ins = id->idIns(); + // TODO-XArch-APX: + // This assert may need tweak if BMI1 instructions are promoted + // into EVEX for multiple features, currently only EVEX.NF. 
+ assert(IsApxExtendedEvexInstruction(id->idIns())); + *dspInByte = ((signed char)dsp == (ssize_t)dsp); + return dsp; + } + insTupleType tt = insTupleTypeInfo(id->idIns()); assert(hasTupleTypeInfo(id->idIns())); @@ -17477,7 +18149,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } // Output a size prefix for a 16-bit operand - if (size == EA_2BYTE) + if (size == EA_2BYTE && !TakesApxExtendedEvexPrefix(id)) { dst += emitOutputByte(dst, 0x66); } @@ -17493,6 +18165,37 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) break; } + case IF_RWR_RRD_SHF: + { + assert(IsApxExtendedEvexInstruction(ins)); + code = insCodeMR(ins); + code = AddX86PrefixIfNeeded(id, code, size); + code = insEncodeMRreg(id, id->idReg2(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); + + // set the W bit + if (size != EA_1BYTE) + { + code |= 1; + } + + // Emit the REX prefix if it exists + if (TakesRexWPrefix(id)) + { + code = AddRexWPrefix(id, code); + } + + dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); + dst += emitOutputWord(dst, code); + dst += emitOutputByte(dst, emitGetInsSC(id)); + sz = emitSizeOfInsDsc_CNS(id); + + // Update GC info. + assert(!id->idGCref()); + emitGCregDeadUpd(id->idReg1(), dst); + break; + } + case IF_RRD_RRD: case IF_RWR_RRD: case IF_RRW_RRD: @@ -17566,7 +18269,105 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) // Also, determine which operand goes where in the ModRM byte. 
regNumber mReg; regNumber rReg; - if (hasCodeMR(ins)) + if (IsApxExtendedEvexInstruction(ins)) + { + assert(hasCodeMI(ins)); + code = insCodeMI(ins); + code = AddX86PrefixIfNeeded(id, code, size); + code = insEncodeReg3456(id, id->idReg1(), size, code); + mReg = id->idReg2(); + code = insEncodeMIreg(id, mReg, size, code); + rReg = REG_NA; + ssize_t val = emitGetInsSC(id); + bool valInByte = ((signed char)val == (target_ssize_t)val) && (ins != INS_mov) && (ins != INS_test); + + switch (size) + { + case EA_1BYTE: + break; + + case EA_2BYTE: + code |= EXTENDED_EVEX_PP_BITS; + FALLTHROUGH; + + case EA_4BYTE: + code |= 1; + break; + +#ifdef TARGET_AMD64 + case EA_8BYTE: + code = AddRexWPrefix(id, code); + code |= 1; + break; +#endif // TARGET_AMD64 + + default: + assert(!"unexpected size"); + } + + dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); + + if (valInByte && size > EA_1BYTE) + { + code |= 2; + dst += emitOutputWord(dst, code); + dst += emitOutputByte(dst, val); + } + else + { + dst += emitOutputWord(dst, code); + switch (size) + { + case EA_1BYTE: + dst += emitOutputByte(dst, val); + break; + case EA_2BYTE: + dst += emitOutputWord(dst, val); + break; + case EA_4BYTE: + dst += emitOutputLong(dst, val); + break; +#ifdef TARGET_AMD64 + case EA_8BYTE: + dst += emitOutputLong(dst, val); + break; +#endif // TARGET_AMD64 + default: + break; + } + + if (id->idIsCnsReloc()) + { + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)(size_t)val, IMAGE_REL_BASED_HIGHLOW); + assert(size == EA_4BYTE); + } + } + + sz = emitSizeOfInsDsc_CNS(id); + + if (!emitInsCanOnlyWriteSSE2OrAVXReg(id)) + { + emitGCregDeadUpd(id->idReg1(), dst); + } + + switch (id->idInsFmt()) + { + case IF_RWR_RRD_CNS: + assert(!instrIs3opImul(ins)); + + emitGCregDeadUpd(id->idReg1(), dst); + break; + + default: +#ifdef DEBUG + emitDispIns(id, false, false, false); +#endif + assert(!"unexpected GC ref instruction format"); + } + + break; + } + else if (hasCodeMR(ins)) { code = 
insCodeMR(ins); // Emit the VEX prefix if it exists @@ -17801,6 +18602,23 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) { code = insCodeRM(ins); + if (id->idIsEvexNdContextSet() && TakesApxExtendedEvexPrefix(id)) + { + // TODO-XArch-APX: + // I'm not sure why instructions on this path can be with instruction + // format other than IF_RWR_RRD_ARD, fix here for debug purpose only, + // need revisit. + id->idInsFmt(IF_RWR_RRD_ARD); + + code = AddX86PrefixIfNeeded(id, code, size); + code = insEncodeReg3456(id, id->idReg1(), size, code); + regcode = (insEncodeReg345(id, id->idReg2(), size, &code) << 8); + dst = emitOutputAM(dst, id, code | regcode); + + sz = emitSizeOfInsDsc_AMD(id); + break; + } + if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { // Special case 4-byte AVX instructions as the @@ -18068,7 +18886,19 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_RRW_RRD_SRD: case IF_RWR_RWR_SRD: { - assert(IsVexOrEvexEncodableInstruction(ins)); + assert(IsVexOrEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)); + + if (id->idIsEvexNdContextSet() && IsApxNDDEncodableInstruction(ins)) + { + // EVEX.vvvv has different semantic for APX-EVEX NDD instructions. 
+ code = insCodeRM(ins); + code = AddX86PrefixIfNeeded(id, code, size); + code = insEncodeReg3456(id, id->idReg1(), size, code); + regcode = (insEncodeReg345(id, id->idReg2(), size, &code) << 8); + dst = emitOutputSV(dst, id, code | regcode); + sz = sizeof(instrDesc); + break; + } code = insCodeRM(ins); code = AddX86PrefixIfNeeded(id, code, size); @@ -18895,7 +19725,9 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; case INS_inc: + case INS_inc_no_evex: case INS_dec: + case INS_dec_no_evex: case INS_neg: case INS_not: if (memFmt == IF_NONE) @@ -18980,10 +19812,13 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; case INS_add: + case INS_add_no_evex: case INS_sub: case INS_sub_hide: case INS_and: + case INS_and_no_evex: case INS_or: + case INS_or_no_evex: case INS_xor: if (memFmt == IF_NONE) { @@ -19133,6 +19968,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; case IF_RRW: + // TODO-XArch-APX: to be verified if this data is correct for NDD form. + case IF_RWR_RRD: // ins reg, cl result.insThroughput = PERFSCORE_THROUGHPUT_2C; result.insLatency = PERFSCORE_LATENCY_2C; @@ -19160,6 +19997,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins switch (insFmt) { case IF_RRW: + // TODO-XArch-APX: to be verified if this data is correct for NDD form. + case IF_RWR_RRD: // ins reg, 1 result.insThroughput = PERFSCORE_THROUGHPUT_2X; break; @@ -19193,6 +20032,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins switch (insFmt) { case IF_RRW_SHF: + // TODO-XArch-APX: to be verified if this data is correct for NDD form. 
+ case IF_RWR_RRD_SHF: // ins reg, cns result.insThroughput = PERFSCORE_THROUGHPUT_2X; break; @@ -20128,6 +20969,11 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_vshuff64x2: case INS_vshufi32x4: case INS_vshufi64x2: +#ifdef TARGET_AMD64 + case INS_popcnt_evex: + case INS_lzcnt_evex: + case INS_tzcnt_evex: +#endif // TARGET_AMD64 { result.insThroughput = PERFSCORE_THROUGHPUT_1C; result.insLatency += PERFSCORE_LATENCY_3C; diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 5f820c7c022c2..0558be85e5a58 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -134,11 +134,18 @@ static regNumber getSseShiftRegNumber(instruction ins); bool HasVexEncoding(instruction ins) const; bool HasEvexEncoding(instruction ins) const; bool HasRex2Encoding(instruction ins) const; +bool HasApxNdd(instruction ins) const; +bool HasApxNf(instruction ins) const; bool IsVexEncodableInstruction(instruction ins) const; bool IsEvexEncodableInstruction(instruction ins) const; bool IsRex2EncodableInstruction(instruction ins) const; +bool IsApxNDDEncodableInstruction(instruction ins) const; +bool IsApxNFEncodableInstruction(instruction ins) const; +bool IsApxExtendedEvexInstruction(instruction ins) const; +bool IsShiftInstruction(instruction ins) const; bool IsLegacyMap1(code_t code) const; bool IsVexOrEvexEncodableInstruction(instruction ins) const; +bool DoJitUseApxNDD(instruction ins) const; code_t insEncodeMIreg(const instrDesc* id, regNumber reg, emitAttr size, code_t code); @@ -179,6 +186,8 @@ bool AreFlagsSetForSignJumpOpt(regNumber reg, emitAttr opSize, GenCondition cond insOpts GetEmbRoundingMode(uint8_t mode) const; +void emitHandleGCrefRegs(BYTE* dst, instrDesc* id); + bool hasRexPrefix(code_t code) { #ifdef TARGET_AMD64 @@ -332,6 +341,18 @@ void SetUseRex2Encoding(bool value) useRex2Encodings = value; } +// Is Promoted EVEX encoding supported. 
+bool usePromotedEVEXEncodings; +bool UsePromotedEVEXEncoding() const +{ + return usePromotedEVEXEncodings; +} + +void SetUsePromotedEVEXEncoding(bool value) +{ + usePromotedEVEXEncodings = value; +} + //------------------------------------------------------------------------ // UseSimdEncoding: Returns true if either VEX or EVEX encoding is supported // contains Evex prefix. @@ -349,6 +370,7 @@ bool UseSimdEncoding() const #define EVEX_PREFIX_CODE 0x6200000000000000ULL bool TakesEvexPrefix(const instrDesc* id) const; +bool TakesApxExtendedEvexPrefix(const instrDesc* id) const; //------------------------------------------------------------------------ // hasEvexPrefix: Returns true if the instruction encoding already @@ -405,11 +427,7 @@ code_t AddSimdPrefixIfNeeded(const instrDesc* id, code_t code, emitAttr size) // code_t AddX86PrefixIfNeeded(const instrDesc* id, code_t code, emitAttr size) { - // TODO-xarch-apx: - // consider refactor this part with AddSimdPrefixIfNeeded as a lot of functionality - // of these functions are overlapping. - - if (TakesEvexPrefix(id)) + if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) { return AddEvexPrefix(id, code, size); } @@ -445,7 +463,7 @@ code_t AddX86PrefixIfNeededAndNotPresent(const instrDesc* id, code_t code, emitA // consider refactor this part with AddSimdPrefixIfNeeded as a lot of functionality // of these functions are overlapping. - if (TakesEvexPrefix(id)) + if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) { return !hasEvexPrefix(code) ? AddEvexPrefix(id, code, size) : code; } @@ -511,6 +529,48 @@ void SetEvexEmbMaskIfNeeded(instrDesc* id, insOpts instOptions) } } +//------------------------------------------------------------------------ +// SetEvexNdIfNeeded: set NDD form - new data destination if needed. 
+// +// Arguments: +// id - instruction descriptor +// instOptions - emit options +// +void SetEvexNdIfNeeded(instrDesc* id, insOpts instOptions) +{ + if ((instOptions & INS_OPTS_EVEX_nd_MASK) != 0) + { + assert(UsePromotedEVEXEncoding()); + assert(IsApxNDDEncodableInstruction(id->idIns())); + id->idSetEvexNdContext(); + } + else + { + assert((instOptions & INS_OPTS_EVEX_nd_MASK) == 0); + } +} + +//------------------------------------------------------------------------ +// SetEvexNdIfNeeded: set Evex.nf on instrDesc +// +// Arguments: +// id - instruction descriptor +// instOptions - emit options +// +void SetEvexNfIfNeeded(instrDesc* id, insOpts instOptions) +{ + if ((instOptions & INS_OPTS_EVEX_nf_MASK) != 0) + { + assert(UsePromotedEVEXEncoding()); + assert(IsApxNFEncodableInstruction(id->idIns())); + id->idSetEvexNfContext(); + } + else + { + assert((instOptions & INS_OPTS_EVEX_nf_MASK) == 0); + } +} + //------------------------------------------------------------------------ // AddSimdPrefixIfNeeded: Add the correct SIMD prefix. // Check if the prefix already exists befpre adding. 
@@ -753,7 +813,7 @@ void emitIns_Data16(); void emitIns_I(instruction ins, emitAttr attr, cnsval_ssize_t val); -void emitIns_R(instruction ins, emitAttr attr, regNumber reg); +void emitIns_R(instruction ins, emitAttr attr, regNumber reg, insOpts instOptions = INS_OPTS_NONE); void emitIns_C(instruction ins, emitAttr attr, CORINFO_FIELD_HANDLE fdlHnd, int offs); @@ -762,7 +822,9 @@ void emitIns_A(instruction ins, emitAttr attr, GenTreeIndir* indir); void emitIns_R_I(instruction ins, emitAttr attr, regNumber reg, - ssize_t val DEBUGARG(size_t targetHandle = 0) DEBUGARG(GenTreeFlags gtFlags = GTF_EMPTY)); + ssize_t val, + insOpts instOptions = INS_OPTS_NONE DEBUGARG(size_t targetHandle = 0) + DEBUGARG(GenTreeFlags gtFlags = GTF_EMPTY)); void emitIns_Mov(instruction ins, emitAttr attr, regNumber dstReg, regNumber srgReg, bool canSkip); diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h index 5ec40ea333973..d52a60fc8db26 100644 --- a/src/coreclr/jit/instr.h +++ b/src/coreclr/jit/instr.h @@ -221,6 +221,12 @@ enum insFlags : uint64_t // APX: REX2 prefix: Encoding_REX2 = 1ULL << 44, + // APX: EVEX.ND: + INS_Flags_Has_NDD = 1ULL << 45, + + // APX: EVEX.NF: + INS_Flags_Has_NF = 1ULL << 46, + // TODO-Cleanup: Remove this flag and its usage from TARGET_XARCH INS_FLAGS_DONT_CARE = 0x00ULL, }; @@ -259,6 +265,17 @@ enum insOpts: unsigned INS_OPTS_EVEX_z_MASK = 0x20, // mask for EVEX.z related features INS_OPTS_EVEX_em_zero = 1 << 5, // Embedded mask merges with zero + + // One-bit: 0b0100_0000 + INS_OPTS_EVEX_nd_MASK = 0x40, // mask for APX-EVEX.nd related features + + INS_OPTS_EVEX_nd = 1 << 6, // NDD form for legacy instructions + + // One-bit: 0b1000_0000 + INS_OPTS_EVEX_nf_MASK = 0x80, // mask for APX-EVEX.nf related features + + INS_OPTS_EVEX_nf = 1 << 7, // NDD form for legacy instructions + }; #elif defined(TARGET_ARM) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) diff --git a/src/coreclr/jit/instrsxarch.h 
b/src/coreclr/jit/instrsxarch.h index fbc635ab5553b..c48b5f5f1876d 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -58,26 +58,31 @@ INST5(pop, "pop", IUM_WR, 0x00008E, BAD_CODE, INST5(push_hide, "push", IUM_RD, 0x0030FE, 0x000068, BAD_CODE, BAD_CODE, 0x000050, INS_TT_NONE, Encoding_REX2) INST5(pop_hide, "pop", IUM_WR, 0x00008E, BAD_CODE, BAD_CODE, BAD_CODE, 0x000058, INS_TT_NONE, Encoding_REX2) -INST5(inc, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x000040, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit |Encoding_REX2) -INST5(inc_l, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C0FE, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Encoding_REX2) -INST5(dec, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x000048, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST5(dec_l, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C8FE, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Encoding_REX2) +INST5(inc, "inc", IUM_RW, 0x0000FE, BAD_CODE, 0x0000FE, BAD_CODE, 0x000040, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST5(inc_l, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C0FE, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Encoding_REX2 | INS_Flags_Has_NF) +INST5(inc_no_evex, "inc", IUM_RW, 0x0000FE, BAD_CODE, 0x0000FE, BAD_CODE, 0x000040, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST5(dec, "dec", IUM_RW, 0x0008FE, BAD_CODE, 0x0008FE, BAD_CODE, 0x000048, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST5(dec_l, "dec", IUM_RW, 
0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C8FE, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Encoding_REX2 | INS_Flags_Has_NF) +INST5(dec_no_evex, "dec", IUM_RW, 0x0008FE, BAD_CODE, 0x0008FE, BAD_CODE, 0x000048, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit | Encoding_REX2) // Multi-byte opcodes without modrm are represented in mixed endian fashion. // See comment around quarter way through this file for more information. INST5(bswap, "bswap", IUM_RW, 0x0F00C8, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C80F, INS_TT_NONE, Encoding_REX2) // id nm um mr mi rm a4 tt flags -INST4(add, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST4(or, "or", IUM_RW, 0x000008, 0x000880, 0x00000A, 0x00000C, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(add, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST4(add_no_evex, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(or, "or", IUM_RW, 0x000008, 0x000880, 0x00000A, 0x00000C, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST4(or_no_evex, "or", IUM_RW, 0x000008, 0x000880, 0x00000A, 0x00000C, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) 
INST4(adc, "adc", IUM_RW, 0x000010, 0x001080, 0x000012, 0x000014, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | Reads_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) INST4(sbb, "sbb", IUM_RW, 0x000018, 0x001880, 0x00001A, 0x00001C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | Reads_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST4(and, "and", IUM_RW, 0x000020, 0x002080, 0x000022, 0x000024, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST4(sub, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(and, "and", IUM_RW, 0x000020, 0x002080, 0x000022, 0x000024, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST4(and_no_evex, "and", IUM_RW, 0x000020, 0x002080, 0x000022, 0x000024, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(sub, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) // Does not affect the stack tracking in the emitter INST4(sub_hide, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST4(xor, "xor", IUM_RW, 0x000030, 0x003080, 0x000032, 0x000034, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | 
INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(xor, "xor", IUM_RW, 0x000030, 0x003080, 0x000032, 0x000034, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) INST4(cmp, "cmp", IUM_RD, 0x000038, 0x003880, 0x00003A, 0x00003C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) INST4(test, "test", IUM_RD, 0x000084, 0x0000F6, 0x000084, 0x0000A8, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) INST4(mov, "mov", IUM_WR, 0x000088, 0x0000C6, 0x00008A, 0x0000B0, INS_TT_NONE, INS_FLAGS_Has_Wbit | Encoding_REX2) @@ -99,25 +104,25 @@ INST3(movsxd, "movsxd", IUM_WR, BAD_CODE, BAD_CODE, #endif INST3(movzx, "movzx", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00B6, INS_TT_NONE, INS_FLAGS_Has_Wbit | Encoding_REX2) -INST3(cmovo, "cmovo", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0040, INS_TT_NONE, Reads_OF | Encoding_REX2) -INST3(cmovno, "cmovno", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0041, INS_TT_NONE, Reads_OF | Encoding_REX2) -INST3(cmovb, "cmovb", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0042, INS_TT_NONE, Reads_CF | Encoding_REX2) -INST3(cmovae, "cmovae", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0043, INS_TT_NONE, Reads_CF | Encoding_REX2) -INST3(cmove, "cmove", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0044, INS_TT_NONE, Reads_ZF | Encoding_REX2) -INST3(cmovne, "cmovne", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0045, INS_TT_NONE, Reads_ZF | Encoding_REX2) -INST3(cmovbe, "cmovbe", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0046, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2) -INST3(cmova, "cmova", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0047, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2) -INST3(cmovs, "cmovs", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0048, INS_TT_NONE, Reads_SF | Encoding_REX2) -INST3(cmovns, "cmovns", IUM_WR, BAD_CODE, BAD_CODE, 
0x0F0049, INS_TT_NONE, Reads_SF | Encoding_REX2) -INST3(cmovp, "cmovp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004A, INS_TT_NONE, Reads_PF | Encoding_REX2) -INST3(cmovnp, "cmovnp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004B, INS_TT_NONE, Reads_PF | Encoding_REX2) -INST3(cmovl, "cmovl", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004C, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2) -INST3(cmovge, "cmovge", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004D, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2) -INST3(cmovle, "cmovle", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2) -INST3(cmovg, "cmovg", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2) +INST3(cmovo, "cmovo", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0040, INS_TT_NONE, Reads_OF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovno, "cmovno", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0041, INS_TT_NONE, Reads_OF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovb, "cmovb", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0042, INS_TT_NONE, Reads_CF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovae, "cmovae", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0043, INS_TT_NONE, Reads_CF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmove, "cmove", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0044, INS_TT_NONE, Reads_ZF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovne, "cmovne", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0045, INS_TT_NONE, Reads_ZF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovbe, "cmovbe", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0046, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmova, "cmova", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0047, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovs, "cmovs", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0048, INS_TT_NONE, Reads_SF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovns, "cmovns", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0049, INS_TT_NONE, Reads_SF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovp, "cmovp", IUM_WR, BAD_CODE, 
BAD_CODE, 0x0F004A, INS_TT_NONE, Reads_PF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovnp, "cmovnp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004B, INS_TT_NONE, Reads_PF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovl, "cmovl", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004C, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovge, "cmovge", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004D, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovle, "cmovle", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovg, "cmovg", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2 | INS_Flags_Has_NDD) INST3(xchg, "xchg", IUM_RW, 0x000086, BAD_CODE, 0x000086, INS_TT_NONE, INS_FLAGS_Has_Wbit | Encoding_REX2) -INST3(imul, "imul", IUM_RW, 0x0F00AC, BAD_CODE, 0x0F00AF, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) +INST3(imul, "imul", IUM_RW, 0x0F00AC, BAD_CODE, 0x0F00AF, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NDD | INS_Flags_Has_NF) // id nm um mr mi rm tt flags @@ -125,25 +130,25 @@ INST3(imul, "imul", IUM_RW, 0x0F00AC, BAD_CODE, // as 2-operand instructions with the target register being implicit // implicit_reg = op1*op2_icon #define INSTMUL INST3 -INSTMUL(imul_AX, "imul", IUM_RD, BAD_CODE, 0x000068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_CX, "imul", IUM_RD, BAD_CODE, 0x000868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_DX, "imul", IUM_RD, BAD_CODE, 0x001068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | 
INS_FLAGS_Has_Sbit) -INSTMUL(imul_BX, "imul", IUM_RD, BAD_CODE, 0x001868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_SP, "imul", IUM_RD, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_BP, "imul", IUM_RD, BAD_CODE, 0x002868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_SI, "imul", IUM_RD, BAD_CODE, 0x003068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_DI, "imul", IUM_RD, BAD_CODE, 0x003868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) +INSTMUL(imul_AX, "imul", IUM_RD, BAD_CODE, 0x000068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_CX, "imul", IUM_RD, BAD_CODE, 0x000868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_DX, "imul", IUM_RD, BAD_CODE, 0x001068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_BX, "imul", IUM_RD, BAD_CODE, 0x001868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_SP, "imul", IUM_RD, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_BP, "imul", IUM_RD, BAD_CODE, 0x002868, BAD_CODE, 
INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_SI, "imul", IUM_RD, BAD_CODE, 0x003068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_DI, "imul", IUM_RD, BAD_CODE, 0x003868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) #ifdef TARGET_AMD64 -INSTMUL(imul_08, "imul", IUM_RD, BAD_CODE, 0x4400000068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_09, "imul", IUM_RD, BAD_CODE, 0x4400000868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_10, "imul", IUM_RD, BAD_CODE, 0x4400001068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_11, "imul", IUM_RD, BAD_CODE, 0x4400001868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_12, "imul", IUM_RD, BAD_CODE, 0x4400002068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_13, "imul", IUM_RD, BAD_CODE, 0x4400002868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_14, "imul", IUM_RD, BAD_CODE, 0x4400003068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_15, "imul", IUM_RD, BAD_CODE, 0x4400003868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | 
Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) +INSTMUL(imul_08, "imul", IUM_RD, BAD_CODE, 0x4400000068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_09, "imul", IUM_RD, BAD_CODE, 0x4400000868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_10, "imul", IUM_RD, BAD_CODE, 0x4400001068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_11, "imul", IUM_RD, BAD_CODE, 0x4400001868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_12, "imul", IUM_RD, BAD_CODE, 0x4400002068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_13, "imul", IUM_RD, BAD_CODE, 0x4400002868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_14, "imul", IUM_RD, BAD_CODE, 0x4400003068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_15, "imul", IUM_RD, BAD_CODE, 0x4400003868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) #endif // TARGET_AMD64 @@ -588,11 +593,11 @@ INST3(LAST_AVXVNNI_INSTRUCTION, "LAST_AVXVNNI_INSTRUCTION", IUM_WR, BAD_CODE, BA INST3(FIRST_BMI_INSTRUCTION, "FIRST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // BMI1 -INST3(andn, "andn", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF2), 
INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF) // Logical AND NOT -INST3(bextr, "bextr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF) // Bit Field Extract -INST3(blsi, "blsi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Extract Lowest Set Isolated Bit -INST3(blsmsk, "blsmsk", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Resets_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Get Mask Up to Lowest Set Bit -INST3(blsr, "blsr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Reset Lowest Set Bit +INST3(andn, "andn", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF2), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF | INS_Flags_Has_NF) // Logical AND NOT +INST3(bextr, "bextr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF | INS_Flags_Has_NF) // Bit Field Extract +INST3(blsi, "blsi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Extract Lowest Set Isolated Bit +INST3(blsmsk, 
"blsmsk", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Resets_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Get Mask Up to Lowest Set Bit +INST3(blsr, "blsr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Reset Lowest Set Bit // BMI2 INST3(bzhi, "bzhi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Zero High Bits Starting with Specified Bit Position @@ -899,35 +904,43 @@ INST3(movbe, "movbe", IUM_WR, PCKMVB(0xF1), BAD_CODE, // POPCNT INST3(popcnt, "popcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xB8), INS_TT_NONE, Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Resets_CF | Encoding_REX2) +#if defined(TARGET_AMD64) +INST3(tzcnt_evex, "tzcnt", IUM_WR, BAD_CODE, BAD_CODE, 0x0000F4, INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Count the Number of Trailing Zero Bits +INST3(lzcnt_evex, "lzcnt", IUM_WR, BAD_CODE, BAD_CODE, 0x0000F5, INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) +INST3(popcnt_evex, "popcnt", IUM_WR, BAD_CODE, BAD_CODE, 0x000088, INS_TT_NONE, Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Resets_CF | INS_Flags_Has_NF) +#endif // TARGET_AMD64 + +INST3(neg, "neg", IUM_RW, 0x0018F6, BAD_CODE, 0x0018F6, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(not, "not", IUM_RW, 0x0010F6, BAD_CODE, 0x0010F6, INS_TT_NONE, 
INS_FLAGS_None | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD) + +INST3(rol, "rol", IUM_RW, 0x0000D2, BAD_CODE, 0x0000D2, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(rol_1, "rol", IUM_RW, 0x0000D0, 0x0000D0, 0x0000D0, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(rol_N, "rol", IUM_RW, 0x0000C0, 0x0000C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(ror, "ror", IUM_RW, 0x0008D2, BAD_CODE, 0x0008D2, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(ror_1, "ror", IUM_RW, 0x0008D0, 0x0008D0, 0x0008D0, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(ror_N, "ror", IUM_RW, 0x0008C0, 0x0008C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) + +INST3(rcl, "rcl", IUM_RW, 0x0010D2, BAD_CODE, 0x0010D2, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(rcl_1, "rcl", IUM_RW, 0x0010D0, 0x0010D0, 0x0010D0, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(rcl_N, "rcl", IUM_RW, 0x0010C0, 0x0010C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit) +INST3(rcr, "rcr", IUM_RW, 0x0018D2, BAD_CODE, 0x0018D2, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(rcr_1, "rcr", IUM_RW, 0x0018D0, 0x0018D0, 0x0018D0, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(rcr_N, "rcr", IUM_RW, 
0x0018C0, 0x0018C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit) +INST3(shl, "shl", IUM_RW, 0x0020D2, BAD_CODE, 0x0020D2, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(shl_1, "shl", IUM_RW, 0x0020D0, 0x0020D0, 0x0020D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(shl_N, "shl", IUM_RW, 0x0020C0, 0x0020C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(shr, "shr", IUM_RW, 0x0028D2, BAD_CODE, 0x0028D2, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(shr_1, "shr", IUM_RW, 0x0028D0, 0x0028D0, 0x0028D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(shr_N, "shr", IUM_RW, 0x0028C0, 0x0028C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(sar, "sar", IUM_RW, 0x0038D2, BAD_CODE, 0x0038D2, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(sar_1, "sar", IUM_RW, 0x0038D0, 0x0038D0, 0x0038D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(sar_N, "sar", IUM_RW, 0x0038C0, 0x0038C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | 
Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) + // id nm um mr mi flags INST2(ret, "ret", IUM_RD, 0x0000C3, 0x0000C2, INS_TT_NONE, INS_FLAGS_None) INST2(loop, "loop", IUM_RD, BAD_CODE, 0x0000E2, INS_TT_NONE, INS_FLAGS_None) INST2(call, "call", IUM_RD, 0x0010FF, 0x0000E8, INS_TT_NONE, Encoding_REX2) -INST2(rol, "rol", IUM_RW, 0x0000D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(rol_1, "rol", IUM_RW, 0x0000D0, 0x0000D0, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(rol_N, "rol", IUM_RW, 0x0000C0, 0x0000C0, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(ror, "ror", IUM_RW, 0x0008D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(ror_1, "ror", IUM_RW, 0x0008D0, 0x0008D0, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(ror_N, "ror", IUM_RW, 0x0008C0, 0x0008C0, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) - -INST2(rcl, "rcl", IUM_RW, 0x0010D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(rcl_1, "rcl", IUM_RW, 0x0010D0, 0x0010D0, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(rcl_N, "rcl", IUM_RW, 0x0010C0, 0x0010C0, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit) -INST2(rcr, "rcr", IUM_RW, 0x0018D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(rcr_1, "rcr", IUM_RW, 0x0018D0, 0x0018D0, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(rcr_N, "rcr", IUM_RW, 0x0018C0, 0x0018C0, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit) -INST2(shl, "shl", IUM_RW, 0x0020D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | 
Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(shl_1, "shl", IUM_RW, 0x0020D0, 0x0020D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(shl_N, "shl", IUM_RW, 0x0020C0, 0x0020C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(shr, "shr", IUM_RW, 0x0028D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(shr_1, "shr", IUM_RW, 0x0028D0, 0x0028D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(shr_N, "shr", IUM_RW, 0x0028C0, 0x0028C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(sar, "sar", IUM_RW, 0x0038D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(sar_1, "sar", IUM_RW, 0x0038D0, 0x0038D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(sar_N, "sar", IUM_RW, 0x0038C0, 0x0038C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) - - // id nm um mr flags INST1(r_movsb, "rep movsb", IUM_RD, 0x00A4F3, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit) INST1(r_movsd, "rep movsd", IUM_RD, 0x00A5F3, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit) @@ -959,15 +972,12 @@ INST1(leave, "leave", IUM_RD, 0x0000C9, INST1(serialize, "serialize", IUM_RD, 0x0fe801, INS_TT_NONE, INS_FLAGS_None) -INST1(neg, "neg", IUM_RW, 0x0018F6, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST1(not, "not", IUM_RW, 0x0010F6, 
INS_TT_NONE, INS_FLAGS_None | INS_FLAGS_Has_Wbit | Encoding_REX2) - INST1(cwde, "cwde", IUM_RD, 0x000098, INS_TT_NONE, INS_FLAGS_None) INST1(cdq, "cdq", IUM_RD, 0x000099, INS_TT_NONE, INS_FLAGS_None) -INST1(idiv, "idiv", IUM_RD, 0x0038F6, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST1(imulEAX, "imul", IUM_RD, 0x0028F6, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit) -INST1(div, "div", IUM_RD, 0x0030F6, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST1(mulEAX, "mul", IUM_RD, 0x0020F6, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST1(idiv, "idiv", IUM_RD, 0x0038F6, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NF) +INST1(imulEAX, "imul", IUM_RD, 0x0028F6, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit | INS_Flags_Has_NF) +INST1(div, "div", IUM_RD, 0x0030F6, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NF) +INST1(mulEAX, "mul", IUM_RD, 0x0020F6, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NF) INST1(sahf, "sahf", IUM_RD, 0x00009E, INS_TT_NONE, Restore_SF_ZF_AF_PF_CF) diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 4c5fc2e8d5328..0f62b7bf762d6 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -369,8 +369,9 @@ RELEASE_CONFIG_INTEGER(EnableMultiRegLocals, 
"EnableMultiRegLocals", 1) RELEASE_CONFIG_INTEGER(JitNoInline, "JitNoInline", 0) #if defined(DEBUG) -CONFIG_INTEGER(JitStressRex2Encoding, "JitStressRex2Encoding", 0) // Enable rex2 encoding for legacy instructions. -CONFIG_INTEGER(JitBypassAPXCheck, "JitBypassAPXCheck", 0) // Bypass APX CPUID check. +CONFIG_INTEGER(JitStressRex2Encoding, "JitStressRex2Encoding", 0) // Enable rex2 encoding for compatible instructions. +CONFIG_INTEGER(JitStressPromotedEvexEncoding, "JitStressPromotedEvexEncoding", 0) // Enable promoted EVEX encoding for + // compatible instructions. #endif // clang-format off @@ -440,6 +441,7 @@ RELEASE_CONFIG_INTEGER(EnableArm64Sve, "EnableArm64Sve", RELEASE_CONFIG_INTEGER(EnableEmbeddedBroadcast, "EnableEmbeddedBroadcast", 1) // Allows embedded broadcasts to be disabled RELEASE_CONFIG_INTEGER(EnableEmbeddedMasking, "EnableEmbeddedMasking", 1) // Allows embedded masking to be disabled +RELEASE_CONFIG_INTEGER(EnableApxNDD, "EnableApxNDD", 0) // Allows APX NDD feature to be disabled // clang-format on