diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 7561351315bd4a..350f28f92f44f3 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -9531,7 +9531,7 @@ void CodeGen::genAmd64EmitterUnitTestsAvx10v2() theEmitter->emitIns_R_R(INS_vcvttps2ibs, EA_16BYTE, REG_XMM0, REG_XMM1); theEmitter->emitIns_R_R(INS_vcvttps2ibs, EA_32BYTE, REG_XMM0, REG_XMM1); - theEmitter->emitIns_R_R(INS_vcvttps2ibs, EA_32BYTE, REG_XMM0, REG_XMM1, INS_OPTS_EVEX_eb_er_rd); + theEmitter->emitIns_R_R(INS_vcvttps2ibs, EA_32BYTE, REG_XMM0, REG_XMM1, INS_OPTS_EVEX_er_rd); theEmitter->emitIns_R_R(INS_vcvttps2ibs, EA_64BYTE, REG_XMM0, REG_XMM1); theEmitter->emitIns_R_R(INS_vcvttps2iubs, EA_16BYTE, REG_XMM0, REG_XMM1); diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 9efc99b9210c9e..57190961499b66 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -844,9 +844,7 @@ class emitter unsigned _idCustom5 : 1; unsigned _idCustom6 : 1; -#define _idEvexbContext \ (_idCustom6 << 1) | _idCustom5 /* Evex.b: embedded broadcast, embedded rounding, embedded SAE \ */ +#define _idEvexbContext (_idCustom6 << 1) | _idCustom5 /* Evex.b: embedded broadcast, rounding, SAE */ #define _idEvexNdContext _idCustom5 /* bits used for the APX-EVEX.nd context for promoted legacy instructions */ #define _idEvexNfContext _idCustom6 /* bits used for the APX-EVEX.nf context for promoted legacy/vex instructions */ @@ -1734,10 +1732,21 @@ class emitter return idGetEvexbContext() != 0; } + void idSetEvexBroadcastBit() + { + assert(!idIsEvexbContextSet()); + _idCustom5 = 1; + } + + void idSetEvexCompressedDisplacementBit() + { + assert(_idCustom6 == 0); + _idCustom6 = 1; + } + void idSetEvexbContext(insOpts instOptions) { assert(!idIsEvexbContextSet()); - assert(idGetEvexbContext() == 0); unsigned value = static_cast<unsigned>(instOptions & INS_OPTS_EVEX_b_MASK); _idCustom5 = ((value >> 0) & 1); @@ -2388,7 +2397,7 @@ class emitter ssize_t emitGetInsCIdisp(instrDesc* id) const; unsigned emitGetInsCIargs(instrDesc* id) const; - inline emitAttr emitGetMemOpSize(instrDesc* id, bool ignoreEmbeddedBroadcast = false) const; + inline emitAttr emitGetMemOpSize(instrDesc* id, bool ignoreEmbeddedBroadcast) const; // Return the argument count for a direct call "id". 
int emitGetInsCDinfo(instrDesc* id); @@ -4164,7 +4173,7 @@ emitAttr emitter::emitGetMemOpSize(instrDesc* id, bool ignoreEmbeddedBroadcast) else if (tupleType == INS_TT_FULL) { // Embedded broadcast supported, so either loading scalar or full vector - if (id->idIsEvexbContextSet() && !ignoreEmbeddedBroadcast) + if (!ignoreEmbeddedBroadcast && HasEmbeddedBroadcast(id)) { memSize = GetInputSizeInBytes(id); } @@ -4183,7 +4192,7 @@ emitAttr emitter::emitGetMemOpSize(instrDesc* id, bool ignoreEmbeddedBroadcast) { memSize = 16; } - else if (id->idIsEvexbContextSet() && !ignoreEmbeddedBroadcast) + else if (!ignoreEmbeddedBroadcast && HasEmbeddedBroadcast(id)) { memSize = GetInputSizeInBytes(id); } @@ -4195,7 +4204,7 @@ emitAttr emitter::emitGetMemOpSize(instrDesc* id, bool ignoreEmbeddedBroadcast) else if (tupleType == INS_TT_HALF) { // Embedded broadcast supported, so either loading scalar or half vector - if (id->idIsEvexbContextSet() && !ignoreEmbeddedBroadcast) + if (!ignoreEmbeddedBroadcast && HasEmbeddedBroadcast(id)) { memSize = GetInputSizeInBytes(id); } diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index b0a65624163f0e..bc3a17fc2734eb 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -341,6 +341,11 @@ bool emitter::IsEvexEncodableInstruction(instruction ins) const // some NAOT scenarios and it will already have been recorded // for appropriate usage. + if (IsBMIInstruction(ins) || IsKMOVInstruction(ins)) + { + return UsePromotedEVEXEncoding(); + } + switch (ins) { #if defined(FEATURE_HW_INTRINSICS) @@ -1572,7 +1577,7 @@ insOpts emitter::GetEmbRoundingMode(uint8_t mode) const switch (mode) { case 1: - return INS_OPTS_EVEX_eb_er_rd; + return INS_OPTS_EVEX_er_rd; case 2: return INS_OPTS_EVEX_er_ru; case 3: @@ -1842,7 +1847,7 @@ bool emitter::TakesEvexPrefix(const instrDesc* id) const return true; } - if (HasEmbeddedBroadcast(id) || HasEmbeddedMask(id)) + if (id->idIsEvexbContextSet() || HasEmbeddedMask(id)) { // Requires the EVEX encoding due to embedded functionality return true; @@ -1866,59 +1871,30 @@ bool emitter::TakesEvexPrefix(const instrDesc* id) const #if defined(DEBUG) if (emitComp->DoJitStressEvexEncoding()) { - if (IsBMIInstruction(ins)) - { - // The Encoding_EVEX on some BMI instructions is tagged due to APX, - // they cannot be stressed with JitStressEvexEncoding. - return false; - } - - if (IsKMOVInstruction(ins)) - { - // KMOV should not be encoded in EVEX when stressing EVEX, as they are supposed to encded in EVEX only - // when APX is available, only stressing EVEX is not enough making the encoding valid. - return false; - } - - // Requires the EVEX encoding due to STRESS mode and no change in semantics - // - // Some instructions, like VCMPEQW return the value in a SIMD register for - // VEX but in a MASK register for EVEX. Such instructions will have already - // returned TRUE if they should have used EVEX due to the HasMaskReg(id) - // check above so we need to still return false here to preserve semantics. - return !HasKMaskRegisterDest(ins); + // Requires the EVEX encoding due to STRESS mode + return true; } +#endif // DEBUG - if (IsApxExtendedEvexInstruction(ins) && emitComp->DoJitStressPromotedEvexEncoding()) + if (id->idHasMem()) { - // This path will be hit when we stress APX-EVEX and encode VEX with Extended EVEX. 
- if (IsKMOVInstruction(ins)) + if ((ins == INS_pslldq) || (ins == INS_psrldq)) { + // The memory operand can only be encoded using the EVEX encoding return true; } - if (IsBMIInstruction(ins)) + if ((insTupleTypeInfo(ins) & INS_TT_MEM128) != 0) { - return HasApxNf(ins); - } + assert((ins == INS_pslld) || (ins == INS_psllq) || (ins == INS_psllw) || (ins == INS_psrad) || + (ins == INS_psraw) || (ins == INS_psrld) || (ins == INS_psrlq) || (ins == INS_psrlw)); - return false; - } -#endif // DEBUG - - if ((ins == INS_pslldq) || (ins == INS_psrldq)) - { - // The memory operand can only be encoded using the EVEX encoding - return id->idHasMem(); - } - - if ((insTupleTypeInfo(ins) & INS_TT_MEM128) != 0) - { - assert((ins == INS_pslld) || (ins == INS_psllq) || (ins == INS_psllw) || (ins == INS_psrad) || - (ins == INS_psraw) || (ins == INS_psrld) || (ins == INS_psrlq) || (ins == INS_psrlw)); - - // Memory operand with immediate can only be encoded using EVEX - return id->idHasMemAndCns(); + if (id->idHasMemAndCns()) + { + // Memory operand with immediate can only be encoded using EVEX + return true; + } + } } return false; @@ -2183,11 +2159,13 @@ emitter::code_t emitter::AddEvexPrefix(const instrDesc* id, code_t code, emitAtt if (id->idIsEvexbContextSet()) { - code |= BBIT_IN_BYTE_EVEX_PREFIX; - if (!id->idHasMem()) { + // For non-memory operations, this holds the EVEX.RC bits + // that indicate the rounding mode to use, EVEX.b is implied + unsigned roundingMode = id->idGetEvexbContext(); + if (roundingMode == 1) { // {rd-sae} @@ -2210,10 +2188,16 @@ emitter::code_t emitter::AddEvexPrefix(const instrDesc* id, code_t code, emitAtt { unreached(); } + + code |= BBIT_IN_BYTE_EVEX_PREFIX; } - else + else if (HasEmbeddedBroadcast(id)) { - assert(id->idGetEvexbContext() == 1); + // For memory operations, the low bit being set indicates + // we are using embedded broadcast, while the upper bit + // being set indicates we are using compressed displacement + + code |= BBIT_IN_BYTE_EVEX_PREFIX; } } @@ -5123,6 +5107,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id) // inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, int var, int dsp) { + instruction ins = id->idIns(); UNATIVE_OFFSET size = emitInsSize(id, code, /* includeRexPrefixSize */ true); UNATIVE_OFFSET offs; bool offsIsUpperBound = true; @@ -5226,26 +5211,36 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, assert(emitComp->lvaTempsHaveLargerOffsetThanVars()); - // Check whether we can use compressed displacement if EVEX. - if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) + if (IsEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)) { - bool compressedFitsInByte = false; - TryEvexCompressDisp8Byte(id, ssize_t(offs), &compressedFitsInByte); - return size + (compressedFitsInByte ? sizeof(char) : sizeof(int)); - } + ssize_t compressedDsp; + bool fitsInByte; + + if (TryEvexCompressDisp8Byte(id, int(offs), &compressedDsp, &fitsInByte)) + { + if (!TakesEvexPrefix(id)) + { + // We mispredicted the adjusted size since we didn't know we'd use the EVEX + // encoding due to compressed displacement. So we need an additional adjustment + size += emitGetEvexPrefixSize(id) - emitGetVexPrefixSize(id); + } + SetEvexCompressedDisplacement(id); + } - if ((int)offs < 0) + return size + (fitsInByte ? sizeof(char) : sizeof(int)); + } + else if ((int)offs < 0) { // offset is negative return size + ((int(offs) >= SCHAR_MIN) ? 
sizeof(char) : sizeof(int)); } #ifdef TARGET_AMD64 - // This case arises for localloc frames else { + // This case arises for localloc frames return size + ((offs <= SCHAR_MAX) ? sizeof(char) : sizeof(int)); } -#endif +#endif // TARGET_AMD64 } } } @@ -5271,9 +5266,40 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, #endif // !FEATURE_FIXED_OUT_ARGS bool useSmallEncoding = false; + if (IsEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)) { - TryEvexCompressDisp8Byte(id, ssize_t(offs), &useSmallEncoding); + ssize_t compressedDsp; + +#if !FEATURE_FIXED_OUT_ARGS + if (!emitHasFramePtr) + { + // We cannot use compressed displacement because the stack offset estimator + // can be off and the compression is only usable in very precise scenarios + // + // But we can still predict small encoding for VEX encodable instructions + + if (!TakesEvexPrefix(id)) + { +#ifdef TARGET_AMD64 + useSmallEncoding = (SCHAR_MIN <= (int)offs) && ((int)offs <= SCHAR_MAX); +#else + useSmallEncoding = (offs <= size_t(SCHAR_MAX)); +#endif + } + } + else +#endif // !FEATURE_FIXED_OUT_ARGS + if (TryEvexCompressDisp8Byte(id, int(offs), &compressedDsp, &useSmallEncoding)) + { + if (!TakesEvexPrefix(id)) + { + // We mispredicted the adjusted size since we didn't know we'd use the EVEX + // encoding due to compressed displacement. So we need an additional adjustment + size += emitGetEvexPrefixSize(id) - emitGetVexPrefixSize(id); + } + SetEvexCompressedDisplacement(id); + } } else { @@ -5285,7 +5311,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, } // If it is ESP based, and the offset is zero, we will not encode the disp part. - if (!EBPbased && offs == 0) + if (!EBPbased && (offs == 0)) { return size; } @@ -5436,11 +5462,13 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code) dspInByte = false; // relocs can't be placed in a byte dspIsZero = false; // relocs won't always be zero } - else + else if (IsEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)) { - if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) + ssize_t compressedDsp; + + if (TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte)) { - dsp = TryEvexCompressDisp8Byte(id, dsp, &dspInByte); + SetEvexCompressedDisplacement(id); } } @@ -8070,13 +8098,7 @@ void emitter::emitIns_R_R(instruction ins, emitAttr attr, regNumber reg1, regNum SetEvexNfIfNeeded(id, instOptions); SetEvexDFVIfNeeded(id, instOptions); SetApxPpxIfNeeded(id, instOptions); - - if ((instOptions & INS_OPTS_EVEX_b_MASK) != INS_OPTS_NONE) - { - // if EVEX.b needs to be set in this path, then it should be embedded rounding. - assert(UseEvexEncoding()); - id->idSetEvexbContext(instOptions); - } + SetEvexEmbRoundIfNeeded(id, instOptions); SetEvexEmbMaskIfNeeded(id, instOptions); UNATIVE_OFFSET sz = emitInsSizeRR(id); @@ -8352,11 +8374,11 @@ void emitter::emitIns_R_R_A( id->idReg1(reg1); id->idReg2(reg2); + emitHandleMemOp(indir, id, (ins == INS_mulx) ? IF_RWR_RWR_ARD : emitInsModeFormat(ins, IF_RRD_RRD_ARD), ins); + SetEvexBroadcastIfNeeded(id, instOptions); SetEvexEmbMaskIfNeeded(id, instOptions); - emitHandleMemOp(indir, id, (ins == INS_mulx) ? 
IF_RWR_RWR_ARD : emitInsModeFormat(ins, IF_RRD_RRD_ARD), ins); - UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins)); id->idCodeSize(sz); @@ -8510,12 +8532,7 @@ void emitter::emitIns_R_R_R( id->idReg2(reg1); id->idReg3(reg2); - if ((instOptions & INS_OPTS_EVEX_b_MASK) != 0) - { - // if EVEX.b needs to be set in this path, then it should be embedded rounding. - assert(UseEvexEncoding()); - id->idSetEvexbContext(instOptions); - } + SetEvexEmbRoundIfNeeded(id, instOptions); SetEvexEmbMaskIfNeeded(id, instOptions); SetEvexNdIfNeeded(id, instOptions); SetEvexNfIfNeeded(id, instOptions); @@ -12560,7 +12577,7 @@ void emitter::emitDispInsHex(instrDesc* id, BYTE* code, size_t sz) // void emitter::emitDispEmbBroadcastCount(instrDesc* id) const { - if (!IsEvexEncodableInstruction(id->idIns()) || !id->idIsEvexbContextSet()) + if (!IsEvexEncodableInstruction(id->idIns()) || !HasEmbeddedBroadcast(id)) { return; } @@ -12587,8 +12604,10 @@ void emitter::emitDispEmbRounding(instrDesc* id) const // for ndd case, we don't need to display any thing special. return; } + assert(!id->idHasMem()); unsigned roundingMode = id->idGetEvexbContext(); + if (roundingMode == 1) { printf(" {rd-sae}"); @@ -12934,7 +12953,7 @@ void emitter::emitDispIns( else { attr = id->idOpSize(); - sstr = codeGen->genSizeStr(emitGetMemOpSize(id)); + sstr = codeGen->genSizeStr(emitGetMemOpSize(id, !id->idHasMem())); if (ins == INS_lea) { @@ -14735,22 +14754,36 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) GOT_DSP: - dspIsZero = (dsp == 0); - if (id->idIsDspReloc()) { dspInByte = false; // relocs can't be placed in a byte + dspIsZero = false; // relocs won't always be zero } - else + else if (IsEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)) { - if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) + ssize_t compressedDsp; + + if (HasCompressedDisplacement(id)) { - dsp = TryEvexCompressDisp8Byte(id, dsp, &dspInByte); + bool isCompressed = TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte); + assert(isCompressed && dspInByte); + dsp = compressedDsp; + } + else if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) + { + assert(!TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte)); + dspInByte = false; } else { dspInByte = ((signed char)dsp == (ssize_t)dsp); } + dspIsZero = (dsp == 0); + } + else + { + dspInByte = ((signed char)dsp == (ssize_t)dsp); + dspIsZero = (dsp == 0); } if (isMoffset) @@ -14884,14 +14917,15 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) { if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { - // Does the offset fit in a byte? if (dspInByte) { + // This is "[rbp + dsp8]" dst += emitOutputByte(dst, code | 0x45); dst += emitOutputByte(dst, dsp); } else { + // This is "[rbp + dsp32]" dst += emitOutputByte(dst, code | 0x85); dst += emitOutputLong(dst, dsp); @@ -14901,23 +14935,21 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } } } + else if (dspInByte) + { + // This is "[rbp + dsp8]" + dst += emitOutputWord(dst, code | 0x4500); + dst += emitOutputByte(dst, dsp); + } else { - // Does the offset fit in a byte? 
- if (dspInByte) - { - dst += emitOutputWord(dst, code | 0x4500); - dst += emitOutputByte(dst, dsp); - } - else - { - dst += emitOutputWord(dst, code | 0x8500); - dst += emitOutputLong(dst, dsp); + // This is "[rbp + dsp32]" + dst += emitOutputWord(dst, code | 0x8500); + dst += emitOutputLong(dst, dsp); - if (id->idIsDspReloc()) - { - emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); - } + if (id->idIsDspReloc()) + { + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); } } break; @@ -14927,52 +14959,55 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) { if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { - // Is the offset 0 or does it at least fit in a byte? if (dspIsZero) { + // This is simply "[rsp]" dst += emitOutputByte(dst, code | 0x04); dst += emitOutputByte(dst, 0x24); } else if (dspInByte) { + // This is "[rsp + dsp8]" dst += emitOutputByte(dst, code | 0x44); dst += emitOutputByte(dst, 0x24); dst += emitOutputByte(dst, dsp); } else { + // This is "[rsp + dsp32]" dst += emitOutputByte(dst, code | 0x84); dst += emitOutputByte(dst, 0x24); dst += emitOutputLong(dst, dsp); + if (id->idIsDspReloc()) { emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); } } } + else if (dspIsZero) + { + // This is simply "[rsp]" + dst += emitOutputWord(dst, code | 0x0400); + dst += emitOutputByte(dst, 0x24); + } + else if (dspInByte) + { + // This is "[rsp + dsp8]" + dst += emitOutputWord(dst, code | 0x4400); + dst += emitOutputByte(dst, 0x24); + dst += emitOutputByte(dst, dsp); + } else { - // Is the offset 0 or does it at least fit in a byte? - if (dspIsZero) - { - dst += emitOutputWord(dst, code | 0x0400); - dst += emitOutputByte(dst, 0x24); - } - else if (dspInByte) - { - dst += emitOutputWord(dst, code | 0x4400); - dst += emitOutputByte(dst, 0x24); - dst += emitOutputByte(dst, dsp); - } - else + // This is "[rsp + dsp32]" + dst += emitOutputWord(dst, code | 0x8400); + dst += emitOutputByte(dst, 0x24); + dst += emitOutputLong(dst, dsp); + + if (id->idIsDspReloc()) { - dst += emitOutputWord(dst, code | 0x8400); - dst += emitOutputByte(dst, 0x24); - dst += emitOutputLong(dst, dsp); - if (id->idIsDspReloc()) - { - emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); - } + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); } } break; @@ -14985,28 +15020,26 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // Put the register in the opcode code |= insEncodeReg012(id, reg, EA_PTRSIZE, nullptr); - // Is there a displacement? if (dspIsZero) { // This is simply "[reg]" dst += emitOutputByte(dst, code); } + else if (dspInByte) + { + // This is "[reg + dsp8]" + dst += emitOutputByte(dst, code | 0x40); + dst += emitOutputByte(dst, dsp); + } else { - // This is [reg + dsp]" -- does the offset fit in a byte? 
- if (dspInByte) - { - dst += emitOutputByte(dst, code | 0x40); - dst += emitOutputByte(dst, dsp); - } - else + // This is "[reg + dsp32]" + dst += emitOutputByte(dst, code | 0x80); + dst += emitOutputLong(dst, dsp); + + if (id->idIsDspReloc()) { - dst += emitOutputByte(dst, code | 0x80); - dst += emitOutputLong(dst, dsp); - if (id->idIsDspReloc()) - { - emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); - } + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); } } } @@ -15021,22 +15054,21 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // This is simply "[reg]" dst += emitOutputWord(dst, code); } + else if (dspInByte) + { + // This is "[reg + dsp8]" + dst += emitOutputWord(dst, code | 0x4000); + dst += emitOutputByte(dst, dsp); + } else { - // This is [reg + dsp]" -- does the offset fit in a byte? - if (dspInByte) - { - dst += emitOutputWord(dst, code | 0x4000); - dst += emitOutputByte(dst, dsp); - } - else + // This is "[reg + dsp32]" + dst += emitOutputWord(dst, code | 0x8000); + dst += emitOutputLong(dst, dsp); + + if (id->idIsDspReloc()) { - dst += emitOutputWord(dst, code | 0x8000); - dst += emitOutputLong(dst, dsp); - if (id->idIsDspReloc()) - { - emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); - } + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); } } } @@ -15064,62 +15096,55 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { - // Emit [ebp + {2/4/8} * rgz] as [ebp + {2/4/8} * rgx + 0] - if (dspIsZero && reg != REG_EBP) + if (dspIsZero && (reg != REG_EBP)) { // The address is "[reg + {2/4/8} * rgx]" dst += emitOutputByte(dst, code | 0x04); dst += emitOutputByte(dst, regByte); } + else if (dspInByte) + { + // The address is "[reg + {2/4/8} * rgx + dsp8]" + dst += emitOutputByte(dst, code | 0x44); + dst += emitOutputByte(dst, regByte); + dst += emitOutputByte(dst, dsp); + } else { - // The address is "[reg + {2/4/8} * rgx + disp]" - if (dspInByte) - { - dst += emitOutputByte(dst, code | 0x44); - dst += emitOutputByte(dst, regByte); - dst += emitOutputByte(dst, dsp); - } - else + // The address is "[reg + {2/4/8} * rgx + dsp32]" + dst += emitOutputByte(dst, code | 0x84); + dst += emitOutputByte(dst, regByte); + dst += emitOutputLong(dst, dsp); + + if (id->idIsDspReloc()) { - dst += emitOutputByte(dst, code | 0x84); - dst += emitOutputByte(dst, regByte); - dst += emitOutputLong(dst, dsp); - if (id->idIsDspReloc()) - { - emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); - } + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); } } } + else if (dspIsZero && (reg != REG_EBP)) + { + // The address is "[reg + {2/4/8} * rgx]" + dst += emitOutputWord(dst, code | 0x0400); + dst += emitOutputByte(dst, regByte); + } + else if (dspInByte) + { + // The address is "[reg + {2/4/8} * rgx + dsp8]" + dst += emitOutputWord(dst, code | 0x4400); + dst += emitOutputByte(dst, regByte); + dst += emitOutputByte(dst, dsp); + } else { - // Emit [ebp + {2/4/8} * rgz] as [ebp + {2/4/8} * rgx + 0] - if (dspIsZero && reg != REG_EBP) + // The address is "[reg + {2/4/8} * rgx + dsp32]" + dst += emitOutputWord(dst, code | 0x8400); + dst += emitOutputByte(dst, regByte); + dst += emitOutputLong(dst, dsp); + + if (id->idIsDspReloc()) { - // The address is "[reg + {2/4/8} 
* rgx]" - dst += emitOutputWord(dst, code | 0x0400); - dst += emitOutputByte(dst, regByte); - } - else - { - // The address is "[reg + {2/4/8} * rgx + disp]" - if (dspInByte) - { - dst += emitOutputWord(dst, code | 0x4400); - dst += emitOutputByte(dst, regByte); - dst += emitOutputByte(dst, dsp); - } - else - { - dst += emitOutputWord(dst, code | 0x8400); - dst += emitOutputByte(dst, regByte); - dst += emitOutputLong(dst, dsp); - if (id->idIsDspReloc()) - { - emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); - } - } + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); } } } @@ -15160,60 +15185,55 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { - if (dspIsZero && reg != REG_EBP) + if (dspIsZero && (reg != REG_EBP)) { // This is [reg+rgx]" dst += emitOutputByte(dst, code | 0x04); dst += emitOutputByte(dst, regByte); } + else if (dspInByte) + { + // This is [reg+rgx+dsp8]" + dst += emitOutputByte(dst, code | 0x44); + dst += emitOutputByte(dst, regByte); + dst += emitOutputByte(dst, dsp); + } else { - // This is [reg+rgx+dsp]" -- does the offset fit in a byte? - if (dspInByte) + // This is [reg+rgx+dsp32]" + dst += emitOutputByte(dst, code | 0x84); + dst += emitOutputByte(dst, regByte); + dst += emitOutputLong(dst, dsp); + + if (id->idIsDspReloc()) { - dst += emitOutputByte(dst, code | 0x44); - dst += emitOutputByte(dst, regByte); - dst += emitOutputByte(dst, dsp); - } - else - { - dst += emitOutputByte(dst, code | 0x84); - dst += emitOutputByte(dst, regByte); - dst += emitOutputLong(dst, dsp); - if (id->idIsDspReloc()) - { - emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); - } + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); } } } + else if (dspIsZero && (reg != REG_EBP)) + { + // This is [reg+rgx]" + dst += emitOutputWord(dst, code | 0x0400); + dst += emitOutputByte(dst, regByte); + } + else if (dspInByte) + { + // This is [reg+rgx+dsp8]" + dst += emitOutputWord(dst, code | 0x4400); + dst += emitOutputByte(dst, regByte); + dst += emitOutputByte(dst, dsp); + } else { - if (dspIsZero && reg != REG_EBP) - { - // This is [reg+rgx]" - dst += emitOutputWord(dst, code | 0x0400); - dst += emitOutputByte(dst, regByte); - } - else + // This is [reg+rgx+dsp32]" + dst += emitOutputWord(dst, code | 0x8400); + dst += emitOutputByte(dst, regByte); + dst += emitOutputLong(dst, dsp); + + if (id->idIsDspReloc()) { - // This is [reg+rgx+dsp]" -- does the offset fit in a byte? - if (dspInByte) - { - dst += emitOutputWord(dst, code | 0x4400); - dst += emitOutputByte(dst, regByte); - dst += emitOutputByte(dst, dsp); - } - else - { - dst += emitOutputWord(dst, code | 0x8400); - dst += emitOutputByte(dst, regByte); - dst += emitOutputLong(dst, dsp); - if (id->idIsDspReloc()) - { - emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); - } - } + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); } } } @@ -15623,13 +15643,32 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) adr = emitComp->lvaFrameAddress(varNum, &EBPbased); dsp = adr + id->idAddr()->iiaLclVar.lvaOffset(); - // TODO-XARCH-AVX512 : working to wrap up all adjusted disp8 compression logic into the following - // function, to which the remainder of the emitter logic should handle properly. 
- // TODO-XARCH-AVX512 : embedded broadcast might change this - int dspAsByte = dsp; - if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) + if (IsEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)) { - dspAsByte = int(TryEvexCompressDisp8Byte(id, ssize_t(dsp), &dspInByte)); + ssize_t compressedDsp; + + if (HasCompressedDisplacement(id)) + { + bool isCompressed = TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte); + assert(isCompressed && dspInByte); + dsp = (int)compressedDsp; + } + else if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) + { +#if FEATURE_FIXED_OUT_ARGS + // TODO-AMD64-CQ: We should be able to accurately predict this when FEATURE_FIXED_OUT_ARGS + // is available. However, there's some nuance in how emitInsSizeSVCalcDisp does things + // compared to emitOutputSV here, so we will miss a few cases today. + // + // assert(!TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte)); +#endif + + dspInByte = false; + } + else + { + dspInByte = ((signed char)dsp == (ssize_t)dsp); + } } else { @@ -15648,7 +15687,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) if (dspInByte) { dst += emitOutputByte(dst, code | 0x45); - dst += emitOutputByte(dst, dspAsByte); + dst += emitOutputByte(dst, dsp); } else { @@ -15656,61 +15695,56 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) dst += emitOutputLong(dst, dsp); } } + else if (dspInByte) + { + dst += emitOutputWord(dst, code | 0x4500); + dst += emitOutputByte(dst, dsp); + } else { - if (dspInByte) - { - dst += emitOutputWord(dst, code | 0x4500); - dst += emitOutputByte(dst, dspAsByte); - } - else - { - dst += emitOutputWord(dst, code | 0x8500); - dst += emitOutputLong(dst, dsp); - } + dst += emitOutputWord(dst, code | 0x8500); + dst += emitOutputLong(dst, dsp); } } else { - #if !FEATURE_FIXED_OUT_ARGS // Adjust the offset by the amount currently pushed on the CPU stack dsp += emitCurStackLvl; -#endif - // TODO-XARCH-AVX512 : working to wrap up all adjusted disp8 compression logic into the following - // function, to which the remainder of the emitter logic should handle properly. - // TODO-XARCH-AVX512 : embedded broadcast might change this - if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) + if (IsEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)) { - dspAsByte = int(TryEvexCompressDisp8Byte(id, ssize_t(dsp), &dspInByte)); + // We cannot reliably predict the encoding size up front so we shouldn't + // have encountered a scenario marked with compressed displacement. We + // did predict cases that could use the small encoding for VEX scenarios + + assert(!HasCompressedDisplacement(id)); + + if (!TakesEvexPrefix(id)) + { + dspInByte = ((signed char)dsp == (ssize_t)dsp); + } } else { dspInByte = ((signed char)dsp == (ssize_t)dsp); - if (dspInByte) - { - dspAsByte = dsp; - } } dspIsZero = (dsp == 0); +#endif // !FEATURE_FIXED_OUT_ARGS // Does the offset fit in a byte? 
if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { - if (dspInByte) + if (dspIsZero) { - if (dspIsZero) - { - dst += emitOutputByte(dst, code | 0x04); - dst += emitOutputByte(dst, 0x24); - } - else - { - dst += emitOutputByte(dst, code | 0x44); - dst += emitOutputByte(dst, 0x24); - dst += emitOutputByte(dst, dspAsByte); - } + dst += emitOutputByte(dst, code | 0x04); + dst += emitOutputByte(dst, 0x24); + } + else if (dspInByte) + { + dst += emitOutputByte(dst, code | 0x44); + dst += emitOutputByte(dst, 0x24); + dst += emitOutputByte(dst, dsp); } else { @@ -15719,28 +15753,22 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) dst += emitOutputByte(dst, code | 0x84); dst += emitOutputByte(dst, 0x24); dst += emitOutputLong(dst, dsp); } } + else if (dspIsZero) + { + dst += emitOutputWord(dst, code | 0x0400); + dst += emitOutputByte(dst, 0x24); + } + else if (dspInByte) + { + dst += emitOutputWord(dst, code | 0x4400); + dst += emitOutputByte(dst, 0x24); + dst += emitOutputByte(dst, dsp); + } else { - if (dspInByte) - { - if (dspIsZero) - { - dst += emitOutputWord(dst, code | 0x0400); - dst += emitOutputByte(dst, 0x24); - } - else - { - dst += emitOutputWord(dst, code | 0x4400); - dst += emitOutputByte(dst, 0x24); - dst += emitOutputByte(dst, dspAsByte); - } - } - else - { - dst += emitOutputWord(dst, code | 0x8400); - dst += emitOutputByte(dst, 0x24); - dst += emitOutputLong(dst, dsp); - } + dst += emitOutputWord(dst, code | 0x8400); + dst += emitOutputByte(dst, 0x24); + dst += emitOutputLong(dst, dsp); } } @@ -16152,7 +16180,7 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) addr = emitConsBlock + doff; #ifdef DEBUG - int byteSize = EA_SIZE_IN_BYTES(emitGetMemOpSize(id)); + int byteSize = EA_SIZE_IN_BYTES(emitGetMemOpSize(id, /*ignoreEmbeddedBroadcast*/ false)); // Check that the offset is properly aligned (i.e. the ddd in [ddd]) // When SMALL_CODE is set, we only expect 4-byte alignment, otherwise @@ -18026,7 +18054,7 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) // Return Value: // size in bytes. // -ssize_t emitter::GetInputSizeInBytes(instrDesc* id) const +ssize_t emitter::GetInputSizeInBytes(const instrDesc* id) const { assert((unsigned)id->idIns() < ArrLen(CodeGenInterface::instInfo)); insFlags inputSize = static_cast<insFlags>((CodeGenInterface::instInfo[id->idIns()] & Input_Mask)); @@ -18052,53 +18080,73 @@ ssize_t emitter::GetInputSizeInBytes(instrDesc* id) const // TryEvexCompressDisp8Byte: Do we do compressed displacement encoding for EVEX. // // Arguments: -// id -- Instruction descriptor. -// dsp -- Displacemnt. -// dspInByte[out] - `true` if compressed displacement +// id -- Instruction descriptor. +// dsp -- displacement to try and compress +// compressedDsp -- [out] the compressed displacement on success; otherwise, dsp +// fitsInByte -- [out] true if the displacement fits in a byte; otherwise, false // // Return Value: -// compressed displacement value if dspInByte === TRUE. -// Original dsp otherwise. 
+// True if the displacement was compressed; otherwise, false // -ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspInByte) +bool emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, ssize_t* compressedDsp, bool* fitsInByte) const { - assert(TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)); + instruction ins = id->idIns(); - if (!hasTupleTypeInfo(id->idIns())) + assert(IsEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)); + assert(id->idHasMem() && !id->idHasMemGen()); + assert(!id->idIsDspReloc()); + assert(compressedDsp != nullptr); + assert(fitsInByte != nullptr); + + *compressedDsp = dsp; + *fitsInByte = (static_cast<signed char>(dsp) == dsp); + + if (!hasTupleTypeInfo(ins)) { // After APX, some instructions with APX features will be promoted // to APX-EVEX, we will re-use the existing displacement emitting // path, but for those instructions with no tuple information, // APX-EVEX treat the scaling factor to be 1 constantly. - instruction ins = id->idIns(); - assert(IsApxExtendedEvexInstruction(ins) || IsBMIInstruction(ins)); - *dspInByte = ((signed char)dsp == (ssize_t)dsp); - return dsp; - } - insTupleType tt = insTupleTypeInfo(id->idIns()); - assert(hasTupleTypeInfo(id->idIns())); + assert(IsApxExtendedEvexInstruction(ins) || IsBMIInstruction(ins) || IsKMOVInstruction(ins)); + assert(*compressedDsp == dsp); - // if dsp is 0, no need for all of this - if (dsp == 0) + return *fitsInByte; + } + + if (*fitsInByte) { - *dspInByte = true; - return dsp; + if (!TakesEvexPrefix(id)) + { + // We already fit into a byte and do not otherwise require the EVEX prefix + // which means we can use the VEX encoding instead and be even smaller. + + assert(*compressedDsp == dsp); + return false; + } } + else + { + ssize_t compressedTest = dsp / 64; - // Only handling non-broadcast forms right now - ssize_t vectorLength = EA_SIZE_IN_BYTES(id->idOpSize()); + if (static_cast<signed char>(compressedTest) != compressedTest) + { + // We are larger than the maximum possible compressed displacement + assert(*compressedDsp == dsp); + return false; + } + } - ssize_t inputSize = GetInputSizeInBytes(id); + insTupleType tt = insTupleTypeInfo(ins); + ssize_t vectorLength = EA_SIZE_IN_BYTES(id->idOpSize()); + ssize_t inputSize = GetInputSizeInBytes(id); ssize_t disp8Compression = 1; if ((tt & INS_TT_MEM128) != 0) { // These instructions can be one of two tuple types, so we need to find the right one - - instruction ins = id->idIns(); - insFormat insFmt = id->idInsFmt(); + insFormat insFmt = id->idInsFmt(); if ((tt & INS_TT_FULL) != 0) { @@ -18137,13 +18185,13 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI } } + bool isEmbBroadcast = HasEmbeddedBroadcast(id); + switch (tt) { case INS_TT_FULL: { - instruction ins = id->idIns(); - assert((inputSize == 4 || inputSize == 8) || IsAVXVNNIINTInstruction(ins)); - if (HasEmbeddedBroadcast(id)) + if (isEmbBroadcast) { // N = input size in bytes disp8Compression = inputSize; @@ -18159,7 +18207,7 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI } case INS_TT_HALF: { assert(inputSize == 4); - if (HasEmbeddedBroadcast(id)) + if (isEmbBroadcast) { // N = input size in bytes disp8Compression = inputSize; @@ -18175,12 +18223,14 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI case INS_TT_FULL_MEM: { // N = vector length in bytes + assert(!isEmbBroadcast); disp8Compression = vectorLength; break; } case INS_TT_TUPLE1_SCALAR: { + 
assert(!isEmbBroadcast); disp8Compression = inputSize; break; } @@ -18188,7 +18238,8 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI case INS_TT_TUPLE1_FIXED: { // N = input size in bytes, 32bit and 64bit only - assert(inputSize == 4 || inputSize == 8); + assert(!isEmbBroadcast); + assert((inputSize == 4) || (inputSize == 8)); disp8Compression = inputSize; break; } @@ -18196,7 +18247,8 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI case INS_TT_TUPLE2: { // N = input size in bytes * 2, 32bit and 64bit for 256 bit and 512 bit only - assert((inputSize == 4) || (inputSize == 8 && vectorLength >= 32)); + assert(!isEmbBroadcast); + assert((inputSize == 4) || ((inputSize == 8) && (vectorLength >= 32))); disp8Compression = inputSize * 2; break; } @@ -18204,7 +18256,8 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI case INS_TT_TUPLE4: { // N = input size in bytes * 4, 32bit for 256 bit and 512 bit, 64bit for 512 bit - assert((inputSize == 4 && vectorLength >= 32) || (inputSize == 8 && vectorLength >= 64)); + assert(!isEmbBroadcast); + assert(((inputSize == 4) && (vectorLength >= 32)) || ((inputSize == 8) && (vectorLength >= 64))); disp8Compression = inputSize * 4; break; } @@ -18212,7 +18265,8 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI case INS_TT_TUPLE8: { // N = input size in bytes * 8, 32bit for 512 only - assert((inputSize == 4 && vectorLength >= 64)); + assert(!isEmbBroadcast); + assert((inputSize == 4) && (vectorLength >= 64)); disp8Compression = inputSize * 8; break; } @@ -18220,6 +18274,7 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI case INS_TT_HALF_MEM: { // N = vector length in bytes / 2 + assert(!isEmbBroadcast); disp8Compression = vectorLength / 2; break; } @@ -18227,6 +18282,7 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI case INS_TT_QUARTER_MEM: { // N = vector length in bytes / 4 + assert(!isEmbBroadcast); disp8Compression = vectorLength / 4; break; } @@ -18234,6 +18290,7 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI case INS_TT_EIGHTH_MEM: { // N = vector length in bytes / 8 + assert(!isEmbBroadcast); disp8Compression = vectorLength / 8; break; } @@ -18241,6 +18298,7 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI case INS_TT_MEM128: { // N = 16 + assert(!isEmbBroadcast); disp8Compression = 16; break; } @@ -18248,6 +18306,7 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI case INS_TT_MOVDDUP: { // N = vector length in bytes / 2 + assert(!isEmbBroadcast); disp8Compression = (vectorLength == 16) ? 
(vectorLength / 2) : vectorLength; break; } @@ -18259,23 +18318,26 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI } // If we can evenly divide dsp by the disp8Compression, we can attempt to use it in a disp8 byte form - if (dsp % disp8Compression != 0) + if ((dsp % disp8Compression) != 0) { - *dspInByte = false; - return dsp; + assert(*compressedDsp == dsp); + *fitsInByte = false; + return false; } - ssize_t compressedDsp = dsp / disp8Compression; + ssize_t compressedDisp = dsp / disp8Compression; - *dspInByte = ((signed char)compressedDsp == (ssize_t)compressedDsp); - if (*dspInByte) + if (static_cast<signed char>(compressedDisp) != compressedDisp) { - return compressedDsp; - } - else - { - return dsp; + assert(*compressedDsp == dsp); + *fitsInByte = false; + return false; } + + *compressedDsp = compressedDisp; + *fitsInByte = true; + + return true; } /***************************************************************************** diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 2074cb5da3ff70..43c55b25727075 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -266,7 +266,7 @@ bool IsExtendedGPReg(regNumber reg) const; // ins - The instruction to check. // // Returns: -// `true` if Evex encoding requires KMAsk support. +// `true` if Evex encoding requires KMask support. // bool HasKMaskRegisterDest(instruction ins) const { @@ -497,17 +497,28 @@ code_t AddX86PrefixIfNeededAndNotPresent(const instrDesc* id, code_t code, emitA // instOptions - emit options void SetEvexBroadcastIfNeeded(instrDesc* id, insOpts instOptions) { - if ((instOptions & INS_OPTS_EVEX_b_MASK) == INS_OPTS_EVEX_eb_er_rd) + assert(id->idHasMem()); + + if ((instOptions & INS_OPTS_EVEX_eb) != INS_OPTS_NONE) { assert(UseEvexEncoding()); - id->idSetEvexbContext(instOptions); - } - else - { - assert((instOptions & INS_OPTS_EVEX_b_MASK) == 0); + id->idSetEvexBroadcastBit(); } } +//------------------------------------------------------------------------ +// SetEvexCompressedDisplacement: set compressed displacement +// +// Arguments: +// id - instruction descriptor +void SetEvexCompressedDisplacement(instrDesc* id) +{ + assert(id->idHasMem()); + + assert(UseEvexEncoding()); + id->idSetEvexCompressedDisplacementBit(); +} + //------------------------------------------------------------------------ // SetEvexEmbMaskIfNeeded: set embedded mask if needed. // @@ -530,6 +541,25 @@ void SetEvexEmbMaskIfNeeded(instrDesc* id, insOpts instOptions) } } +//------------------------------------------------------------------------ +// SetEvexEmbRoundIfNeeded: set embedded round if needed. +// +// Arguments: +// id - instruction descriptor +// instOptions - emit options +// +void SetEvexEmbRoundIfNeeded(instrDesc* id, insOpts instOptions) +{ + assert(!id->idHasMem()); + + if ((instOptions & INS_OPTS_EVEX_b_MASK) != INS_OPTS_NONE) + { + // if EVEX.b needs to be set in this path, then it should be embedded rounding. + assert(UseEvexEncoding()); + id->idSetEvexbContext(instOptions); + } +} + //------------------------------------------------------------------------ // SetEvexNdIfNeeded: set NDD form - new data destination if needed. 
// @@ -654,7 +684,7 @@ bool hasVexOrEvexPrefix(code_t code) { return (hasVexPrefix(code) || hasEvexPrefix(code)); } -ssize_t TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspInByte); +bool TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, ssize_t* compressedDsp, bool* fitsInByte) const; //------------------------------------------------------------------------ // codeEvexMigrationCheck: Temporary check to use when adding EVEX codepaths // @@ -672,7 +702,7 @@ bool codeEvexMigrationCheck(code_t code) { return hasEvexPrefix(code); } -ssize_t GetInputSizeInBytes(instrDesc* id) const; +ssize_t GetInputSizeInBytes(const instrDesc* id) const; bool containsAVXInstruction = false; bool ContainsAVX() { @@ -1289,23 +1319,39 @@ inline bool emitIsUncondJump(instrDesc* jmp) // inline bool HasEmbeddedBroadcast(const instrDesc* id) const { - return id->idIsEvexbContextSet(); + assert(id->idHasMem()); + return (id->idGetEvexbContext() & INS_OPTS_EVEX_eb) != 0; } //------------------------------------------------------------------------ -// HasEmbeddedBroadcast: Do we consider embedded broadcast while encoding. +// HasEmbeddedMask: Do we consider embedded masking while encoding. // // Arguments: // id - Instruction descriptor. // // Returns: -// `true` if the instruction does embedded broadcast. +// `true` if the instruction does embedded masking. // inline bool HasEmbeddedMask(const instrDesc* id) const { return id->idIsEvexAaaContextSet() || id->idIsEvexZContextSet(); } +//------------------------------------------------------------------------ +// HasCompressedDisplacement: Do we consider compressed displacement while encoding. +// +// Arguments: +// id - Instruction descriptor. +// +// Returns: +// `true` if the instruction does compressed displacement. +// +inline bool HasCompressedDisplacement(const instrDesc* id) const +{ + assert(id->idHasMem()); + return (id->idGetEvexbContext() & INS_OPTS_EVEX_cd) != 0; +} + inline bool HasHighSIMDReg(const instrDesc* id) const; inline bool HasExtendedGPReg(const instrDesc* id) const; diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index 6d255a1295315c..361e9ae8eee6ed 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -95,7 +95,7 @@ static insOpts AddEmbRoundingMode(insOpts instOptions, int8_t mode) { case 0x01: { - result |= INS_OPTS_EVEX_eb_er_rd; + result |= INS_OPTS_EVEX_er_rd; break; } diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index c336b1c521d03a..5e4a775a49ecd2 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -1465,7 +1465,7 @@ insOpts CodeGen::AddEmbBroadcastMode(insOpts instOptions) { assert((instOptions & INS_OPTS_EVEX_b_MASK) == 0); unsigned result = static_cast<unsigned>(instOptions); - return static_cast<insOpts>(result | INS_OPTS_EVEX_eb_er_rd); + return static_cast<insOpts>(result | INS_OPTS_EVEX_eb); } #endif // TARGET_XARCH && FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h index d3a5b5385a3e9d..eb081a73bee90a 100644 --- a/src/coreclr/jit/instr.h +++ b/src/coreclr/jit/instr.h @@ -248,13 +248,17 @@ enum insOpts: unsigned INS_OPTS_NONE = 0, // Two-bits: 0b0000_0011 - INS_OPTS_EVEX_b_MASK = 0x03, // mask for EVEX.b related features. + INS_OPTS_EVEX_b_MASK = 0x03, // mask for EVEX.b related features. 
- INS_OPTS_EVEX_eb_er_rd = 1, // Embedded Broadcast or Round down + INS_OPTS_EVEX_eb = 1, // Embedded broadcast - INS_OPTS_EVEX_er_ru = 2, // Round up + INS_OPTS_EVEX_cd = 2, // Compressed displacement - INS_OPTS_EVEX_er_rz = 3, // Round towards zero + INS_OPTS_EVEX_er_rd = 1, // Embedded round down + + INS_OPTS_EVEX_er_ru = 2, // Embedded round up + + INS_OPTS_EVEX_er_rz = 3, // Embedded round towards zero // Three-bits: 0b0001_1100 INS_OPTS_EVEX_aaa_MASK = 0x1C, // mask for EVEX.aaa related features diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index d395369ffc1b07..088c8b981968be 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -471,13 +471,13 @@ INST3(roundsd, "vroundsd", IUM_WR, BAD_CODE, BAD_CODE, INST3(roundss, "vroundss", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0A), 8C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Round scalar single precision floating-point values // Instructions for AESNI, PCLMULQDQ -INST3(aesdec, "vaesdec", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDE), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform one round of an AES decryption flow -INST3(aesdeclast, "vaesdeclast", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDF), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform last round of an AES decryption flow -INST3(aesenc, "vaesenc", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDC), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform one round of an AES encryption flow -INST3(aesenclast, "vaesenclast", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDD), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform last round of an AES encryption flow +INST3(aesdec, "vaesdec", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDE), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform one round of an AES decryption flow +INST3(aesdeclast, "vaesdeclast", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDF), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform last round of an AES decryption flow +INST3(aesenc, "vaesenc", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDC), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform one round of an AES encryption flow +INST3(aesenclast, "vaesenclast", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDD), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform last round of an AES encryption flow INST3(aesimc, "vaesimc", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDB), 8C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX) // Perform the AES InvMixColumn Transformation INST3(aeskeygenassist, "vaeskeygenassist", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0xDF), 7C, 13C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX) // AES Round Key Generation Assist -INST3(pclmulqdq, "vpclmulqdq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x44), 7C, 1C, INS_TT_FULL_MEM, KMask_Base1 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform a carry-less multiplication of two quadwords +INST3(pclmulqdq, "vpclmulqdq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x44), 7C, 1C, INS_TT_FULL_MEM, KMask_Base1 | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform a 
carry-less multiplication of two quadwords // Instructions for SHA INST3(sha1msg1, "sha1msg1", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xC9), ILLEGAL, ILLEGAL, INS_TT_FULL, REX_WIG) // Perform an Intermediate Calculation for the Next Four SHA1 Message Dwords @@ -613,27 +613,27 @@ INST3(vfnmsub231ss, "vfnmsub231ss", IUM_RW, BAD_CODE, BAD_CODE, #define FIRST_BMI_INSTRUCTION INS_andn // Instructions for BMI1, BMI2 -INST3(andn, "andn", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF2), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF | INS_Flags_Has_NF) // Logical AND NOT -INST3(bextr, "bextr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), 2C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF | INS_Flags_Has_NF) // Bit Field Extract -INST3(blsi, "blsi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Extract Lowest Set Isolated Bit -INST3(blsmsk, "blsmsk", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Resets_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Get Mask Up to Lowest Set Bit -INST3(blsr, "blsr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Reset Lowest Set Bit -INST3(bzhi, "bzhi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Zero High Bits Starting with Specified Bit Position -INST3(mulx, "mulx", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF6), 4C, 1C, INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Unsigned Multiply Without Affecting Flags -INST3(pdep, "pdep", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), 3C, 1C, INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Deposit -INST3(pext, "pext", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), 3C, 1C, INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Extract -INST3(rorx, "rorx", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0xF0), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX) -INST3(sarx, "sarx", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0xF7), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Arithmetic Right Without Affecting Flags -INST3(shlx, "shlx", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Left Without Affecting Flags -INST3(shrx, "shrx", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF2, 0xF7), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Right Without Affecting Flags +INST3(andn, "andn", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF2), 1C, 2X, INS_TT_NONE, REX_WX | 
Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF | INS_Flags_Has_NF) // Logical AND NOT +INST3(bextr, "bextr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), 2C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF | INS_Flags_Has_NF) // Bit Field Extract +INST3(blsi, "blsi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Extract Lowest Set Isolated Bit +INST3(blsmsk, "blsmsk", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Resets_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Get Mask Up to Lowest Set Bit +INST3(blsr, "blsr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Reset Lowest Set Bit +INST3(bzhi, "bzhi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Zero High Bits Starting with Specified Bit Position +INST3(mulx, "mulx", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF6), 4C, 1C, INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Unsigned Multiply Without Affecting Flags +INST3(pdep, "pdep", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), 3C, 1C, INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Deposit +INST3(pext, "pext", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), 3C, 1C, INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Extract +INST3(rorx, "rorx", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0xF0), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX) +INST3(sarx, "sarx", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0xF7), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Arithmetic Right Without Affecting Flags +INST3(shlx, "shlx", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Left Without Affecting Flags +INST3(shrx, "shrx", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF2, 0xF7), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Right Without Affecting Flags #define LAST_BMI_INSTRUCTION INS_shrx #define FIRST_AVXVNNI_INSTRUCTION INS_vpdpbusd // Instructions for AVXVNNI -INST3(vpdpbusd, "vpdpbusd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x50), 5C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Unsigned and Signed Bytes -INST3(vpdpbusds, "vpdpbusds", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x51), 5C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Unsigned and Signed Bytes with Saturation -INST3(vpdpwssd, "vpdpwssd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x52), 5C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_VEX | Encoding_EVEX | 
INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Signed Word Integers -INST3(vpdpwssds, "vpdpwssds", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x53), 5C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Signed Word Integers with Saturation +INST3(vpdpbusd, "vpdpbusd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x50), 5C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Unsigned and Signed Bytes +INST3(vpdpbusds, "vpdpbusds", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x51), 5C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Unsigned and Signed Bytes with Saturation +INST3(vpdpwssd, "vpdpwssd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x52), 5C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Signed Word Integers +INST3(vpdpwssds, "vpdpwssds", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x53), 5C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Signed Word Integers with Saturation #define LAST_AVXVNNI_INSTRUCTION INS_vpdpwssds #define FIRST_AVXVNNIINT8_INSTRUCTION INS_vpdpwsud @@ -656,8 +656,8 @@ INST3(vpdpbuuds, "vpdpbuuds", IUM_WR, BAD_CODE, BAD_ #define FIRST_AVXIFMA_INSTRUCTION INS_vpmadd52huq // Instructions for AVXIFMA -INST3(vpmadd52huq, "vpmadd52huq", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xB5), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Multiply of Unsigned 52-Bit Unsigned Integers and Add High 52-Bit Products to 64-Bit Accumulators -INST3(vpmadd52luq, "vpmadd52luq", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xB4), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Multiply of Unsigned 52-Bit Integers and Add the Low 52-Bit Products to Qword Accumulators +INST3(vpmadd52huq, "vpmadd52huq", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xB5), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Multiply of Unsigned 52-Bit Unsigned Integers and Add High 52-Bit Products to 64-Bit Accumulators +INST3(vpmadd52luq, "vpmadd52luq", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xB4), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Multiply of Unsigned 52-Bit Integers and Add the Low 52-Bit Products to Qword Accumulators #define LAST_AVXIFMA_INSTRUCTION INS_vpmadd52luq #define LAST_AVX_INSTRUCTION INS_vpmadd52luq
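Reviewer note: the disp8*N rule that the rewritten TryEvexCompressDisp8Byte implements can be hard to follow inside the diff, so here is a minimal, standalone C++ sketch of just the compression check. The names (tryCompressDisp8, scale) are illustrative and not part of the JIT; the real code derives the scale N from the instruction's tuple type, input size, vector length, and embedded-broadcast state.

#include <cstdint>
#include <cstdio>

// EVEX compressed displacement: a memory displacement is encodable in a
// single byte when it is an exact multiple of the tuple-derived scale N
// and the quotient dsp / N fits in a signed 8-bit value.
static bool tryCompressDisp8(int64_t dsp, int64_t scale, int64_t* compressed)
{
    if ((dsp % scale) != 0)
    {
        return false; // not a multiple of N; the full disp32 form is required
    }

    int64_t quotient = dsp / scale;

    if (static_cast<int8_t>(quotient) != quotient)
    {
        return false; // quotient is outside [-128, 127]
    }

    *compressed = quotient;
    return true;
}

int main()
{
    int64_t c = 0;
    printf("%d\n", tryCompressDisp8(0x2000, 64, &c)); // 0: 8192 / 64 == 128 does not fit in a signed byte
    printf("%d\n", tryCompressDisp8(0x0800, 64, &c)); // 1: 2048 / 64 == 32 is the encoded disp8
    return 0;
}

This mirrors the (dsp % disp8Compression) and static_cast<signed char> checks at the end of the rewritten function; the early dsp / 64 test in the diff is a fast reject, since 64 bytes (a full ZMM vector) is the largest scale any tuple type can produce.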