diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 7561351315bd4a..350f28f92f44f3 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -9531,7 +9531,7 @@ void CodeGen::genAmd64EmitterUnitTestsAvx10v2() theEmitter->emitIns_R_R(INS_vcvttps2ibs, EA_16BYTE, REG_XMM0, REG_XMM1); theEmitter->emitIns_R_R(INS_vcvttps2ibs, EA_32BYTE, REG_XMM0, REG_XMM1); - theEmitter->emitIns_R_R(INS_vcvttps2ibs, EA_32BYTE, REG_XMM0, REG_XMM1, INS_OPTS_EVEX_eb_er_rd); + theEmitter->emitIns_R_R(INS_vcvttps2ibs, EA_32BYTE, REG_XMM0, REG_XMM1, INS_OPTS_EVEX_er_rd); theEmitter->emitIns_R_R(INS_vcvttps2ibs, EA_64BYTE, REG_XMM0, REG_XMM1); theEmitter->emitIns_R_R(INS_vcvttps2iubs, EA_16BYTE, REG_XMM0, REG_XMM1); diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 9efc99b9210c9e..57190961499b66 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -844,9 +844,7 @@ class emitter unsigned _idCustom5 : 1; unsigned _idCustom6 : 1; -#define _idEvexbContext \ (_idCustom6 << 1) | _idCustom5 /* Evex.b: embedded broadcast, embedded rounding, embedded SAE \ */ +#define _idEvexbContext (_idCustom6 << 1) | _idCustom5 /* Evex.b: embedded broadcast, rounding, SAE */ #define _idEvexNdContext _idCustom5 /* bits used for the APX-EVEX.nd context for promoted legacy instructions */ #define _idEvexNfContext _idCustom6 /* bits used for the APX-EVEX.nf context for promoted legacy/vex instructions */ @@ -1734,10 +1732,21 @@ class emitter return idGetEvexbContext() != 0; } + void idSetEvexBroadcastBit() + { + assert(!idIsEvexbContextSet()); + _idCustom5 = 1; + } + + void idSetEvexCompressedDisplacementBit() + { + assert(_idCustom6 == 0); + _idCustom6 = 1; + } + void idSetEvexbContext(insOpts instOptions) { assert(!idIsEvexbContextSet()); - assert(idGetEvexbContext() == 0); unsigned value = static_cast<unsigned>(instOptions & INS_OPTS_EVEX_b_MASK); _idCustom5 = ((value >> 0) & 1); @@ -2388,7 +2397,7 @@ class emitter ssize_t emitGetInsCIdisp(instrDesc* id) const; unsigned emitGetInsCIargs(instrDesc* id) const; - inline emitAttr emitGetMemOpSize(instrDesc* id, bool ignoreEmbeddedBroadcast = false) const; + inline emitAttr emitGetMemOpSize(instrDesc* id, bool ignoreEmbeddedBroadcast) const; // Return the argument count for a direct call "id". 
int emitGetInsCDinfo(instrDesc* id); @@ -4164,7 +4173,7 @@ emitAttr emitter::emitGetMemOpSize(instrDesc* id, bool ignoreEmbeddedBroadcast) else if (tupleType == INS_TT_FULL) { // Embedded broadcast supported, so either loading scalar or full vector - if (id->idIsEvexbContextSet() && !ignoreEmbeddedBroadcast) + if (!ignoreEmbeddedBroadcast && HasEmbeddedBroadcast(id)) { memSize = GetInputSizeInBytes(id); } @@ -4183,7 +4192,7 @@ emitAttr emitter::emitGetMemOpSize(instrDesc* id, bool ignoreEmbeddedBroadcast) { memSize = 16; } - else if (id->idIsEvexbContextSet() && !ignoreEmbeddedBroadcast) + else if (!ignoreEmbeddedBroadcast && HasEmbeddedBroadcast(id)) { memSize = GetInputSizeInBytes(id); } @@ -4195,7 +4204,7 @@ emitAttr emitter::emitGetMemOpSize(instrDesc* id, bool ignoreEmbeddedBroadcast) else if (tupleType == INS_TT_HALF) { // Embedded broadcast supported, so either loading scalar or half vector - if (id->idIsEvexbContextSet() && !ignoreEmbeddedBroadcast) + if (!ignoreEmbeddedBroadcast && HasEmbeddedBroadcast(id)) { memSize = GetInputSizeInBytes(id); } diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index b0a65624163f0e..bc3a17fc2734eb 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -341,6 +341,11 @@ bool emitter::IsEvexEncodableInstruction(instruction ins) const // some NAOT scenarios and it will already have been recorded // for appropriate usage. + if (IsBMIInstruction(ins) || IsKMOVInstruction(ins)) + { + return UsePromotedEVEXEncoding(); + } + switch (ins) { #if defined(FEATURE_HW_INTRINSICS) @@ -1572,7 +1577,7 @@ insOpts emitter::GetEmbRoundingMode(uint8_t mode) const switch (mode) { case 1: - return INS_OPTS_EVEX_eb_er_rd; + return INS_OPTS_EVEX_er_rd; case 2: return INS_OPTS_EVEX_er_ru; case 3: @@ -1842,7 +1847,7 @@ bool emitter::TakesEvexPrefix(const instrDesc* id) const return true; } - if (HasEmbeddedBroadcast(id) || HasEmbeddedMask(id)) + if (id->idIsEvexbContextSet() || HasEmbeddedMask(id)) { // Requires the EVEX encoding due to embedded functionality return true; @@ -1866,59 +1871,30 @@ bool emitter::TakesEvexPrefix(const instrDesc* id) const #if defined(DEBUG) if (emitComp->DoJitStressEvexEncoding()) { - if (IsBMIInstruction(ins)) - { - // The Encoding_EVEX on some BMI instructions is tagged due to APX, - // they cannot be stressed with JitStressEvexEncoding. - return false; - } - - if (IsKMOVInstruction(ins)) - { - // KMOV should not be encoded in EVEX when stressing EVEX, as they are supposed to encded in EVEX only - // when APX is available, only stressing EVEX is not enough making the encoding valid. - return false; - } - - // Requires the EVEX encoding due to STRESS mode and no change in semantics - // - // Some instructions, like VCMPEQW return the value in a SIMD register for - // VEX but in a MASK register for EVEX. Such instructions will have already - // returned TRUE if they should have used EVEX due to the HasMaskReg(id) - // check above so we need to still return false here to preserve semantics. - return !HasKMaskRegisterDest(ins); + // Requires the EVEX encoding due to STRESS mode + return true; } +#endif // DEBUG - if (IsApxExtendedEvexInstruction(ins) && emitComp->DoJitStressPromotedEvexEncoding()) + if (id->idHasMem()) { - // This path will be hit when we stress APX-EVEX and encode VEX with Extended EVEX. 
- if (IsKMOVInstruction(ins)) + if ((ins == INS_pslldq) || (ins == INS_psrldq)) { + // The memory operand can only be encoded using the EVEX encoding return true; } - if (IsBMIInstruction(ins)) + if ((insTupleTypeInfo(ins) & INS_TT_MEM128) != 0) { - return HasApxNf(ins); - } + assert((ins == INS_pslld) || (ins == INS_psllq) || (ins == INS_psllw) || (ins == INS_psrad) || + (ins == INS_psraw) || (ins == INS_psrld) || (ins == INS_psrlq) || (ins == INS_psrlw)); - return false; - } -#endif // DEBUG - - if ((ins == INS_pslldq) || (ins == INS_psrldq)) - { - // The memory operand can only be encoded using the EVEX encoding - return id->idHasMem(); - } - - if ((insTupleTypeInfo(ins) & INS_TT_MEM128) != 0) - { - assert((ins == INS_pslld) || (ins == INS_psllq) || (ins == INS_psllw) || (ins == INS_psrad) || - (ins == INS_psraw) || (ins == INS_psrld) || (ins == INS_psrlq) || (ins == INS_psrlw)); - - // Memory operand with immediate can only be encoded using EVEX - return id->idHasMemAndCns(); + if (id->idHasMemAndCns()) + { + // Memory operand with immediate can only be encoded using EVEX + return true; + } + } } return false; @@ -2183,11 +2159,13 @@ emitter::code_t emitter::AddEvexPrefix(const instrDesc* id, code_t code, emitAtt if (id->idIsEvexbContextSet()) { - code |= BBIT_IN_BYTE_EVEX_PREFIX; - if (!id->idHasMem()) { + // For non-memory operations, this holds the EVEX.RC bits + // that indicate the rounding mode to use, EVEX.b is implied + unsigned roundingMode = id->idGetEvexbContext(); + if (roundingMode == 1) { // {rd-sae} @@ -2210,10 +2188,16 @@ emitter::code_t emitter::AddEvexPrefix(const instrDesc* id, code_t code, emitAtt { unreached(); } + + code |= BBIT_IN_BYTE_EVEX_PREFIX; } - else + else if (HasEmbeddedBroadcast(id)) { - assert(id->idGetEvexbContext() == 1); + // For memory operations, the low bit being set indicates + // we are using embedded broadcast, while the upper bit + // being set indicates we are using compressed displacement + + code |= BBIT_IN_BYTE_EVEX_PREFIX; } } @@ -5123,6 +5107,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id) // inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, int var, int dsp) { + instruction ins = id->idIns(); UNATIVE_OFFSET size = emitInsSize(id, code, /* includeRexPrefixSize */ true); UNATIVE_OFFSET offs; bool offsIsUpperBound = true; @@ -5226,26 +5211,36 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, assert(emitComp->lvaTempsHaveLargerOffsetThanVars()); - // Check whether we can use compressed displacement if EVEX. - if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) + if (IsEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)) { - bool compressedFitsInByte = false; - TryEvexCompressDisp8Byte(id, ssize_t(offs), &compressedFitsInByte); - return size + (compressedFitsInByte ? sizeof(char) : sizeof(int)); - } + ssize_t compressedDsp; + bool fitsInByte; + + if (TryEvexCompressDisp8Byte(id, int(offs), &compressedDsp, &fitsInByte)) + { + if (!TakesEvexPrefix(id)) + { + // We mispredicted the adjusted size since we didn't know we'd use the EVEX + // encoding due to compressed displacement. So we need an additional adjustment + size += emitGetEvexPrefixSize(id) - emitGetVexPrefixSize(id); + } + SetEvexCompressedDisplacement(id); + } - if ((int)offs < 0) + return size + (fitsInByte ? sizeof(char) : sizeof(int)); + } + else if ((int)offs < 0) { // offset is negative return size + ((int(offs) >= SCHAR_MIN) ? 
sizeof(char) : sizeof(int)); } #ifdef TARGET_AMD64 - // This case arises for localloc frames else { + // This case arises for localloc frames return size + ((offs <= SCHAR_MAX) ? sizeof(char) : sizeof(int)); } -#endif +#endif // TARGET_AMD64 } } } @@ -5271,9 +5266,40 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, #endif // !FEATURE_FIXED_OUT_ARGS bool useSmallEncoding = false; + if (IsEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)) { - TryEvexCompressDisp8Byte(id, ssize_t(offs), &useSmallEncoding); + ssize_t compressedDsp; + +#if !FEATURE_FIXED_OUT_ARGS + if (!emitHasFramePtr) + { + // We cannot use compressed displacement because the stack offset estimator + // can be off and the compression is only usable in very precise scenarios + // + // But we can still predict small encoding for VEX encodable instructions + + if (!TakesEvexPrefix(id)) + { +#ifdef TARGET_AMD64 + useSmallEncoding = (SCHAR_MIN <= (int)offs) && ((int)offs <= SCHAR_MAX); +#else + useSmallEncoding = (offs <= size_t(SCHAR_MAX)); +#endif + } + } + else +#endif // !FEATURE_FIXED_OUT_ARGS + if (TryEvexCompressDisp8Byte(id, int(offs), &compressedDsp, &useSmallEncoding)) + { + if (!TakesEvexPrefix(id)) + { + // We mispredicted the adjusted size since we didn't know we'd use the EVEX + // encoding due to compressed displacement. So we need an additional adjustment + size += emitGetEvexPrefixSize(id) - emitGetVexPrefixSize(id); + } + SetEvexCompressedDisplacement(id); + } } else { @@ -5285,7 +5311,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, } // If it is ESP based, and the offset is zero, we will not encode the disp part. - if (!EBPbased && offs == 0) + if (!EBPbased && (offs == 0)) { return size; } @@ -5436,11 +5462,13 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code) dspInByte = false; // relocs can't be placed in a byte dspIsZero = false; // relocs won't always be zero } - else + else if (IsEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)) { - if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) + ssize_t compressedDsp; + + if (TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte)) { - dsp = TryEvexCompressDisp8Byte(id, dsp, &dspInByte); + SetEvexCompressedDisplacement(id); } } @@ -8070,13 +8098,7 @@ void emitter::emitIns_R_R(instruction ins, emitAttr attr, regNumber reg1, regNum SetEvexNfIfNeeded(id, instOptions); SetEvexDFVIfNeeded(id, instOptions); SetApxPpxIfNeeded(id, instOptions); - - if ((instOptions & INS_OPTS_EVEX_b_MASK) != INS_OPTS_NONE) - { - // if EVEX.b needs to be set in this path, then it should be embedded rounding. - assert(UseEvexEncoding()); - id->idSetEvexbContext(instOptions); - } + SetEvexEmbRoundIfNeeded(id, instOptions); SetEvexEmbMaskIfNeeded(id, instOptions); UNATIVE_OFFSET sz = emitInsSizeRR(id); @@ -8352,11 +8374,11 @@ void emitter::emitIns_R_R_A( id->idReg1(reg1); id->idReg2(reg2); + emitHandleMemOp(indir, id, (ins == INS_mulx) ? IF_RWR_RWR_ARD : emitInsModeFormat(ins, IF_RRD_RRD_ARD), ins); + SetEvexBroadcastIfNeeded(id, instOptions); SetEvexEmbMaskIfNeeded(id, instOptions); - emitHandleMemOp(indir, id, (ins == INS_mulx) ? 
IF_RWR_RWR_ARD : emitInsModeFormat(ins, IF_RRD_RRD_ARD), ins); - UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins)); id->idCodeSize(sz); @@ -8510,12 +8532,7 @@ void emitter::emitIns_R_R_R( id->idReg2(reg1); id->idReg3(reg2); - if ((instOptions & INS_OPTS_EVEX_b_MASK) != 0) - { - // if EVEX.b needs to be set in this path, then it should be embedded rounding. - assert(UseEvexEncoding()); - id->idSetEvexbContext(instOptions); - } + SetEvexEmbRoundIfNeeded(id, instOptions); SetEvexEmbMaskIfNeeded(id, instOptions); SetEvexNdIfNeeded(id, instOptions); SetEvexNfIfNeeded(id, instOptions); @@ -12560,7 +12577,7 @@ void emitter::emitDispInsHex(instrDesc* id, BYTE* code, size_t sz) // void emitter::emitDispEmbBroadcastCount(instrDesc* id) const { - if (!IsEvexEncodableInstruction(id->idIns()) || !id->idIsEvexbContextSet()) + if (!IsEvexEncodableInstruction(id->idIns()) || !HasEmbeddedBroadcast(id)) { return; } @@ -12587,8 +12604,10 @@ void emitter::emitDispEmbRounding(instrDesc* id) const // for ndd case, we don't need to display any thing special. return; } + assert(!id->idHasMem()); unsigned roundingMode = id->idGetEvexbContext(); + if (roundingMode == 1) { printf(" {rd-sae}"); @@ -12934,7 +12953,7 @@ void emitter::emitDispIns( else { attr = id->idOpSize(); - sstr = codeGen->genSizeStr(emitGetMemOpSize(id)); + sstr = codeGen->genSizeStr(emitGetMemOpSize(id, !id->idHasMem())); if (ins == INS_lea) { @@ -14735,22 +14754,36 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) GOT_DSP: - dspIsZero = (dsp == 0); - if (id->idIsDspReloc()) { dspInByte = false; // relocs can't be placed in a byte + dspIsZero = false; // relocs won't always be zero } - else + else if (IsEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)) { - if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) + ssize_t compressedDsp; + + if (HasCompressedDisplacement(id)) { - dsp = TryEvexCompressDisp8Byte(id, dsp, &dspInByte); + bool isCompressed = TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte); + assert(isCompressed && dspInByte); + dsp = compressedDsp; + } + else if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) + { + assert(!TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte)); + dspInByte = false; } else { dspInByte = ((signed char)dsp == (ssize_t)dsp); } + dspIsZero = (dsp == 0); + } + else + { + dspInByte = ((signed char)dsp == (ssize_t)dsp); + dspIsZero = (dsp == 0); } if (isMoffset) @@ -14884,14 +14917,15 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) { if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { - // Does the offset fit in a byte? if (dspInByte) { + // This is "[rbp + dsp8]" dst += emitOutputByte(dst, code | 0x45); dst += emitOutputByte(dst, dsp); } else { + // This is "[rbp + dsp32]" dst += emitOutputByte(dst, code | 0x85); dst += emitOutputLong(dst, dsp); @@ -14901,23 +14935,21 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } } } + else if (dspInByte) + { + // This is "[rbp + dsp8]" + dst += emitOutputWord(dst, code | 0x4500); + dst += emitOutputByte(dst, dsp); + } else { - // Does the offset fit in a byte? 
- if (dspInByte) - { - dst += emitOutputWord(dst, code | 0x4500); - dst += emitOutputByte(dst, dsp); - } - else - { - dst += emitOutputWord(dst, code | 0x8500); - dst += emitOutputLong(dst, dsp); + // This is "[rbp + dsp32]" + dst += emitOutputWord(dst, code | 0x8500); + dst += emitOutputLong(dst, dsp); - if (id->idIsDspReloc()) - { - emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); - } + if (id->idIsDspReloc()) + { + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); } } break; @@ -14927,52 +14959,55 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) { if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { - // Is the offset 0 or does it at least fit in a byte? if (dspIsZero) { + // This is simply "[rsp]" dst += emitOutputByte(dst, code | 0x04); dst += emitOutputByte(dst, 0x24); } else if (dspInByte) { + // This is "[rsp + dsp8]" dst += emitOutputByte(dst, code | 0x44); dst += emitOutputByte(dst, 0x24); dst += emitOutputByte(dst, dsp); } else { + // This is "[rsp + dsp32]" dst += emitOutputByte(dst, code | 0x84); dst += emitOutputByte(dst, 0x24); dst += emitOutputLong(dst, dsp); + if (id->idIsDspReloc()) { emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); } } } + else if (dspIsZero) + { + // This is simply "[rsp]" + dst += emitOutputWord(dst, code | 0x0400); + dst += emitOutputByte(dst, 0x24); + } + else if (dspInByte) + { + // This is "[rsp + dsp8]" + dst += emitOutputWord(dst, code | 0x4400); + dst += emitOutputByte(dst, 0x24); + dst += emitOutputByte(dst, dsp); + } else { - // Is the offset 0 or does it at least fit in a byte? - if (dspIsZero) - { - dst += emitOutputWord(dst, code | 0x0400); - dst += emitOutputByte(dst, 0x24); - } - else if (dspInByte) - { - dst += emitOutputWord(dst, code | 0x4400); - dst += emitOutputByte(dst, 0x24); - dst += emitOutputByte(dst, dsp); - } - else + // This is "[rsp + dsp32]" + dst += emitOutputWord(dst, code | 0x8400); + dst += emitOutputByte(dst, 0x24); + dst += emitOutputLong(dst, dsp); + + if (id->idIsDspReloc()) { - dst += emitOutputWord(dst, code | 0x8400); - dst += emitOutputByte(dst, 0x24); - dst += emitOutputLong(dst, dsp); - if (id->idIsDspReloc()) - { - emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); - } + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); } } break; @@ -14985,28 +15020,26 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // Put the register in the opcode code |= insEncodeReg012(id, reg, EA_PTRSIZE, nullptr); - // Is there a displacement? if (dspIsZero) { // This is simply "[reg]" dst += emitOutputByte(dst, code); } + else if (dspInByte) + { + // This is "[reg + dsp8]" + dst += emitOutputByte(dst, code | 0x40); + dst += emitOutputByte(dst, dsp); + } else { - // This is [reg + dsp]" -- does the offset fit in a byte? 
- if (dspInByte) - { - dst += emitOutputByte(dst, code | 0x40); - dst += emitOutputByte(dst, dsp); - } - else + // This is "[reg + dsp32]" + dst += emitOutputByte(dst, code | 0x80); + dst += emitOutputLong(dst, dsp); + + if (id->idIsDspReloc()) { - dst += emitOutputByte(dst, code | 0x80); - dst += emitOutputLong(dst, dsp); - if (id->idIsDspReloc()) - { - emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); - } + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); } } } @@ -15021,22 +15054,21 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // This is simply "[reg]" dst += emitOutputWord(dst, code); } + else if (dspInByte) + { + // This is "[reg + dsp8]" + dst += emitOutputWord(dst, code | 0x4000); + dst += emitOutputByte(dst, dsp); + } else { - // This is [reg + dsp]" -- does the offset fit in a byte? - if (dspInByte) - { - dst += emitOutputWord(dst, code | 0x4000); - dst += emitOutputByte(dst, dsp); - } - else + // This is "[reg + dsp32]" + dst += emitOutputWord(dst, code | 0x8000); + dst += emitOutputLong(dst, dsp); + + if (id->idIsDspReloc()) { - dst += emitOutputWord(dst, code | 0x8000); - dst += emitOutputLong(dst, dsp); - if (id->idIsDspReloc()) - { - emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); - } + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); } } } @@ -15064,62 +15096,55 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { - // Emit [ebp + {2/4/8} * rgz] as [ebp + {2/4/8} * rgx + 0] - if (dspIsZero && reg != REG_EBP) + if (dspIsZero && (reg != REG_EBP)) { // The address is "[reg + {2/4/8} * rgx]" dst += emitOutputByte(dst, code | 0x04); dst += emitOutputByte(dst, regByte); } + else if (dspInByte) + { + // The address is "[reg + {2/4/8} * rgx + dsp8]" + dst += emitOutputByte(dst, code | 0x44); + dst += emitOutputByte(dst, regByte); + dst += emitOutputByte(dst, dsp); + } else { - // The address is "[reg + {2/4/8} * rgx + disp]" - if (dspInByte) - { - dst += emitOutputByte(dst, code | 0x44); - dst += emitOutputByte(dst, regByte); - dst += emitOutputByte(dst, dsp); - } - else + // The address is "[reg + {2/4/8} * rgx + dsp32]" + dst += emitOutputByte(dst, code | 0x84); + dst += emitOutputByte(dst, regByte); + dst += emitOutputLong(dst, dsp); + + if (id->idIsDspReloc()) { - dst += emitOutputByte(dst, code | 0x84); - dst += emitOutputByte(dst, regByte); - dst += emitOutputLong(dst, dsp); - if (id->idIsDspReloc()) - { - emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); - } + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); } } } + else if (dspIsZero && (reg != REG_EBP)) + { + // The address is "[reg + {2/4/8} * rgx]" + dst += emitOutputWord(dst, code | 0x0400); + dst += emitOutputByte(dst, regByte); + } + else if (dspInByte) + { + // The address is "[reg + {2/4/8} * rgx + dsp8]" + dst += emitOutputWord(dst, code | 0x4400); + dst += emitOutputByte(dst, regByte); + dst += emitOutputByte(dst, dsp); + } else { - // Emit [ebp + {2/4/8} * rgz] as [ebp + {2/4/8} * rgx + 0] - if (dspIsZero && reg != REG_EBP) + // The address is "[reg + {2/4/8} * rgx + dsp32]" + dst += emitOutputWord(dst, code | 0x8400); + dst += emitOutputByte(dst, regByte); + dst += emitOutputLong(dst, dsp); + + if (id->idIsDspReloc()) { - // The address is "[reg + {2/4/8} 
* rgx]" - dst += emitOutputWord(dst, code | 0x0400); - dst += emitOutputByte(dst, regByte); - } - else - { - // The address is "[reg + {2/4/8} * rgx + disp]" - if (dspInByte) - { - dst += emitOutputWord(dst, code | 0x4400); - dst += emitOutputByte(dst, regByte); - dst += emitOutputByte(dst, dsp); - } - else - { - dst += emitOutputWord(dst, code | 0x8400); - dst += emitOutputByte(dst, regByte); - dst += emitOutputLong(dst, dsp); - if (id->idIsDspReloc()) - { - emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); - } - } + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); } } } @@ -15160,60 +15185,55 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { - if (dspIsZero && reg != REG_EBP) + if (dspIsZero && (reg != REG_EBP)) { // This is [reg+rgx]" dst += emitOutputByte(dst, code | 0x04); dst += emitOutputByte(dst, regByte); } + else if (dspInByte) + { + // This is [reg+rgx+dsp8]" + dst += emitOutputByte(dst, code | 0x44); + dst += emitOutputByte(dst, regByte); + dst += emitOutputByte(dst, dsp); + } else { - // This is [reg+rgx+dsp]" -- does the offset fit in a byte? - if (dspInByte) + // This is [reg+rgx+dsp32]" + dst += emitOutputByte(dst, code | 0x84); + dst += emitOutputByte(dst, regByte); + dst += emitOutputLong(dst, dsp); + + if (id->idIsDspReloc()) { - dst += emitOutputByte(dst, code | 0x44); - dst += emitOutputByte(dst, regByte); - dst += emitOutputByte(dst, dsp); - } - else - { - dst += emitOutputByte(dst, code | 0x84); - dst += emitOutputByte(dst, regByte); - dst += emitOutputLong(dst, dsp); - if (id->idIsDspReloc()) - { - emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); - } + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); } } } + else if (dspIsZero && (reg != REG_EBP)) + { + // This is [reg+rgx]" + dst += emitOutputWord(dst, code | 0x0400); + dst += emitOutputByte(dst, regByte); + } + else if (dspInByte) + { + // This is [reg+rgx+dsp8]" + dst += emitOutputWord(dst, code | 0x4400); + dst += emitOutputByte(dst, regByte); + dst += emitOutputByte(dst, dsp); + } else { - if (dspIsZero && reg != REG_EBP) - { - // This is [reg+rgx]" - dst += emitOutputWord(dst, code | 0x0400); - dst += emitOutputByte(dst, regByte); - } - else + // This is [reg+rgx+dsp32]" + dst += emitOutputWord(dst, code | 0x8400); + dst += emitOutputByte(dst, regByte); + dst += emitOutputLong(dst, dsp); + + if (id->idIsDspReloc()) { - // This is [reg+rgx+dsp]" -- does the offset fit in a byte? - if (dspInByte) - { - dst += emitOutputWord(dst, code | 0x4400); - dst += emitOutputByte(dst, regByte); - dst += emitOutputByte(dst, dsp); - } - else - { - dst += emitOutputWord(dst, code | 0x8400); - dst += emitOutputByte(dst, regByte); - dst += emitOutputLong(dst, dsp); - if (id->idIsDspReloc()) - { - emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); - } - } + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); } } } @@ -15623,13 +15643,32 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) adr = emitComp->lvaFrameAddress(varNum, &EBPbased); dsp = adr + id->idAddr()->iiaLclVar.lvaOffset(); - // TODO-XARCH-AVX512 : working to wrap up all adjusted disp8 compression logic into the following - // function, to which the remainder of the emitter logic should handle properly. 
- // TODO-XARCH-AVX512 : embedded broadcast might change this - int dspAsByte = dsp; - if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) + if (IsEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)) { - dspAsByte = int(TryEvexCompressDisp8Byte(id, ssize_t(dsp), &dspInByte)); + ssize_t compressedDsp; + + if (HasCompressedDisplacement(id)) + { + bool isCompressed = TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte); + assert(isCompressed && dspInByte); + dsp = (int)compressedDsp; + } + else if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) + { +#if FEATURE_FIXED_OUT_ARGS + // TODO-AMD64-CQ: We should be able to accurately predict this when FEATURE_FIXED_OUT_ARGS + // is available. However, there's some nuance in how emitInsSizeSVCalcDisp does things + // compared to emitOutputSV here, so we will miss a few cases today. + // + // assert(!TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte)); +#endif + + dspInByte = false; + } + else + { + dspInByte = ((signed char)dsp == (ssize_t)dsp); + } } else { @@ -15648,7 +15687,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) if (dspInByte) { dst += emitOutputByte(dst, code | 0x45); - dst += emitOutputByte(dst, dspAsByte); + dst += emitOutputByte(dst, dsp); } else { @@ -15656,61 +15695,56 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) dst += emitOutputLong(dst, dsp); } } + else if (dspInByte) + { + dst += emitOutputWord(dst, code | 0x4500); + dst += emitOutputByte(dst, dsp); + } else { - if (dspInByte) - { - dst += emitOutputWord(dst, code | 0x4500); - dst += emitOutputByte(dst, dspAsByte); - } - else - { - dst += emitOutputWord(dst, code | 0x8500); - dst += emitOutputLong(dst, dsp); - } + dst += emitOutputWord(dst, code | 0x8500); + dst += emitOutputLong(dst, dsp); } } else { - #if !FEATURE_FIXED_OUT_ARGS // Adjust the offset by the amount currently pushed on the CPU stack dsp += emitCurStackLvl; -#endif - // TODO-XARCH-AVX512 : working to wrap up all adjusted disp8 compression logic into the following - // function, to which the remainder of the emitter logic should handle properly. - // TODO-XARCH-AVX512 : embedded broadcast might change this - if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) + if (IsEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)) { - dspAsByte = int(TryEvexCompressDisp8Byte(id, ssize_t(dsp), &dspInByte)); + // We cannot reliably predict the encoding size up front so we shouldn't + // have encountered a scenario marked with compressed displacement. We + // did predict cases that could use the small encoding for VEX scenarios + + assert(!HasCompressedDisplacement(id)); + + if (!TakesEvexPrefix(id)) + { + dspInByte = ((signed char)dsp == (ssize_t)dsp); + } } else { dspInByte = ((signed char)dsp == (ssize_t)dsp); - if (dspInByte) - { - dspAsByte = dsp; - } } dspIsZero = (dsp == 0); +#endif // !FEATURE_FIXED_OUT_ARGS // Does the offset fit in a byte? 
if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { - if (dspInByte) + if (dspIsZero) { - if (dspIsZero) - { - dst += emitOutputByte(dst, code | 0x04); - dst += emitOutputByte(dst, 0x24); - } - else - { - dst += emitOutputByte(dst, code | 0x44); - dst += emitOutputByte(dst, 0x24); - dst += emitOutputByte(dst, dspAsByte); - } + dst += emitOutputByte(dst, code | 0x04); + dst += emitOutputByte(dst, 0x24); + } + else if (dspInByte) + { + dst += emitOutputByte(dst, code | 0x44); + dst += emitOutputByte(dst, 0x24); + dst += emitOutputByte(dst, dsp); } else { @@ -15719,28 +15753,22 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) dst += emitOutputByte(dst, code | 0x84); dst += emitOutputByte(dst, 0x24); dst += emitOutputLong(dst, dsp); } } + else if (dspIsZero) + { + dst += emitOutputWord(dst, code | 0x0400); + dst += emitOutputByte(dst, 0x24); + } + else if (dspInByte) + { + dst += emitOutputWord(dst, code | 0x4400); + dst += emitOutputByte(dst, 0x24); + dst += emitOutputByte(dst, dsp); + } else { - if (dspInByte) - { - if (dspIsZero) - { - dst += emitOutputWord(dst, code | 0x0400); - dst += emitOutputByte(dst, 0x24); - } - else - { - dst += emitOutputWord(dst, code | 0x4400); - dst += emitOutputByte(dst, 0x24); - dst += emitOutputByte(dst, dspAsByte); - } - } - else - { - dst += emitOutputWord(dst, code | 0x8400); - dst += emitOutputByte(dst, 0x24); - dst += emitOutputLong(dst, dsp); - } + dst += emitOutputWord(dst, code | 0x8400); + dst += emitOutputByte(dst, 0x24); + dst += emitOutputLong(dst, dsp); } } @@ -16152,7 +16180,7 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) addr = emitConsBlock + doff; #ifdef DEBUG - int byteSize = EA_SIZE_IN_BYTES(emitGetMemOpSize(id)); + int byteSize = EA_SIZE_IN_BYTES(emitGetMemOpSize(id, /*ignoreEmbeddedBroadcast*/ false)); // Check that the offset is properly aligned (i.e. the ddd in [ddd]) // When SMALL_CODE is set, we only expect 4-byte alignment, otherwise @@ -18026,7 +18054,7 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) // Return Value: // size in bytes. // -ssize_t emitter::GetInputSizeInBytes(instrDesc* id) const +ssize_t emitter::GetInputSizeInBytes(const instrDesc* id) const { assert((unsigned)id->idIns() < ArrLen(CodeGenInterface::instInfo)); insFlags inputSize = static_cast<insFlags>((CodeGenInterface::instInfo[id->idIns()] & Input_Mask)); @@ -18052,53 +18080,73 @@ ssize_t emitter::GetInputSizeInBytes(instrDesc* id) const // TryEvexCompressDisp8Byte: Do we do compressed displacement encoding for EVEX. // // Arguments: -// id -- Instruction descriptor. -// dsp -- Displacemnt. -// dspInByte[out] - `true` if compressed displacement +// id -- Instruction descriptor. +// dsp -- displacement to try and compress +// compressedDsp -- [out] the compressed displacement on success; otherwise, dsp +// fitsInByte -- [out] true if the displacement fits in a byte; otherwise, false // // Return Value: -// compressed displacement value if dspInByte === TRUE. -// Original dsp otherwise. 
+// True if the displacement was compressed; otherwise, false // -ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspInByte) +bool emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, ssize_t* compressedDsp, bool* fitsInByte) const { - assert(TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)); + instruction ins = id->idIns(); - if (!hasTupleTypeInfo(id->idIns())) + assert(IsEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)); + assert(id->idHasMem() && !id->idHasMemGen()); + assert(!id->idIsDspReloc()); + assert(compressedDsp != nullptr); + assert(fitsInByte != nullptr); + + *compressedDsp = dsp; + *fitsInByte = (static_cast<signed char>(dsp) == dsp); + + if (!hasTupleTypeInfo(ins)) { // After APX, some instructions with APX features will be promoted // to APX-EVEX, we will re-use the existing displacement emitting // path, but for those instructions with no tuple information, // APX-EVEX treat the scaling factor to be 1 constantly. - instruction ins = id->idIns(); - assert(IsApxExtendedEvexInstruction(ins) || IsBMIInstruction(ins)); - *dspInByte = ((signed char)dsp == (ssize_t)dsp); - return dsp; - } - insTupleType tt = insTupleTypeInfo(id->idIns()); - assert(hasTupleTypeInfo(id->idIns())); + assert(IsApxExtendedEvexInstruction(ins) || IsBMIInstruction(ins) || IsKMOVInstruction(ins)); + assert(*compressedDsp == dsp); - // if dsp is 0, no need for all of this - if (dsp == 0) + return *fitsInByte; + } + + if (*fitsInByte) { - *dspInByte = true; - return dsp; + if (!TakesEvexPrefix(id)) + { + // We already fit into a byte and do not otherwise require the EVEX prefix + // which means we can use the VEX encoding instead and be even smaller. + + assert(*compressedDsp == dsp); + return false; + } } + else + { + ssize_t compressedTest = dsp / 64; - // Only handling non-broadcast forms right now - ssize_t vectorLength = EA_SIZE_IN_BYTES(id->idOpSize()); + if (static_cast<signed char>(compressedTest) != compressedTest) + { + // We are larger than the maximum possible compressed displacement + assert(*compressedDsp == dsp); + return false; + } + } - ssize_t inputSize = GetInputSizeInBytes(id); + insTupleType tt = insTupleTypeInfo(ins); + ssize_t vectorLength = EA_SIZE_IN_BYTES(id->idOpSize()); + ssize_t inputSize = GetInputSizeInBytes(id); ssize_t disp8Compression = 1; if ((tt & INS_TT_MEM128) != 0) { // These instructions can be one of two tuple types, so we need to find the right one - - instruction ins = id->idIns(); - insFormat insFmt = id->idInsFmt(); + insFormat insFmt = id->idInsFmt(); if ((tt & INS_TT_FULL) != 0) { @@ -18137,13 +18185,13 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI } } + bool isEmbBroadcast = HasEmbeddedBroadcast(id); + switch (tt) { case INS_TT_FULL: { - instruction ins = id->idIns(); - assert((inputSize == 4 || inputSize == 8) || IsAVXVNNIINTInstruction(ins)); - if (HasEmbeddedBroadcast(id)) + if (isEmbBroadcast) { // N = input size in bytes disp8Compression = inputSize; @@ -18159,7 +18207,7 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI } case INS_TT_HALF: { assert(inputSize == 4); - if (HasEmbeddedBroadcast(id)) + if (isEmbBroadcast) { // N = input size in bytes disp8Compression = inputSize; @@ -18175,12 +18223,14 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI case INS_TT_FULL_MEM: { // N = vector length in bytes + assert(!isEmbBroadcast); disp8Compression = vectorLength; break; } case INS_TT_TUPLE1_SCALAR: { + 
assert(!isEmbBroadcast); disp8Compression = inputSize; break; } @@ -18188,7 +18238,8 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI case INS_TT_TUPLE1_FIXED: { // N = input size in bytes, 32bit and 64bit only - assert(inputSize == 4 || inputSize == 8); + assert(!isEmbBroadcast); + assert((inputSize == 4) || (inputSize == 8)); disp8Compression = inputSize; break; } @@ -18196,7 +18247,8 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI case INS_TT_TUPLE2: { // N = input size in bytes * 2, 32bit and 64bit for 256 bit and 512 bit only - assert((inputSize == 4) || (inputSize == 8 && vectorLength >= 32)); + assert(!isEmbBroadcast); + assert((inputSize == 4) || ((inputSize == 8) && (vectorLength >= 32))); disp8Compression = inputSize * 2; break; } @@ -18204,7 +18256,8 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI case INS_TT_TUPLE4: { // N = input size in bytes * 4, 32bit for 256 bit and 512 bit, 64bit for 512 bit - assert((inputSize == 4 && vectorLength >= 32) || (inputSize == 8 && vectorLength >= 64)); + assert(!isEmbBroadcast); + assert(((inputSize == 4) && (vectorLength >= 32)) || ((inputSize == 8) && (vectorLength >= 64))); disp8Compression = inputSize * 4; break; } @@ -18212,7 +18265,8 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI case INS_TT_TUPLE8: { // N = input size in bytes * 8, 32bit for 512 only - assert((inputSize == 4 && vectorLength >= 64)); + assert(!isEmbBroadcast); + assert((inputSize == 4) && (vectorLength >= 64)); disp8Compression = inputSize * 8; break; } @@ -18220,6 +18274,7 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI case INS_TT_HALF_MEM: { // N = vector length in bytes / 2 + assert(!isEmbBroadcast); disp8Compression = vectorLength / 2; break; } @@ -18227,6 +18282,7 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI case INS_TT_QUARTER_MEM: { // N = vector length in bytes / 4 + assert(!isEmbBroadcast); disp8Compression = vectorLength / 4; break; } @@ -18234,6 +18290,7 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI case INS_TT_EIGHTH_MEM: { // N = vector length in bytes / 8 + assert(!isEmbBroadcast); disp8Compression = vectorLength / 8; break; } @@ -18241,6 +18298,7 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI case INS_TT_MEM128: { // N = 16 + assert(!isEmbBroadcast); disp8Compression = 16; break; } @@ -18248,6 +18306,7 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI case INS_TT_MOVDDUP: { // N = vector length in bytes / 2 + assert(!isEmbBroadcast); disp8Compression = (vectorLength == 16) ? 
(vectorLength / 2) : vectorLength; break; } @@ -18259,23 +18318,26 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI } // If we can evenly divide dsp by the disp8Compression, we can attempt to use it in a disp8 byte form - if (dsp % disp8Compression != 0) + if ((dsp % disp8Compression) != 0) { - *dspInByte = false; - return dsp; + assert(*compressedDsp == dsp); + *fitsInByte = false; + return false; } - ssize_t compressedDsp = dsp / disp8Compression; + ssize_t compressedDisp = dsp / disp8Compression; - *dspInByte = ((signed char)compressedDsp == (ssize_t)compressedDsp); - if (*dspInByte) + if (static_cast<signed char>(compressedDisp) != compressedDisp) { - return compressedDsp; - } - else - { - return dsp; + assert(*compressedDsp == dsp); + *fitsInByte = false; + return false; } + + *compressedDsp = compressedDisp; + *fitsInByte = true; + + return true; } /***************************************************************************** diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 2074cb5da3ff70..43c55b25727075 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -266,7 +266,7 @@ bool IsExtendedGPReg(regNumber reg) const; // ins - The instruction to check. // // Returns: -// `true` if Evex encoding requires KMAsk support. +// `true` if Evex encoding requires KMask support. // bool HasKMaskRegisterDest(instruction ins) const { @@ -497,17 +497,28 @@ code_t AddX86PrefixIfNeededAndNotPresent(const instrDesc* id, code_t code, emitA // instOptions - emit options void SetEvexBroadcastIfNeeded(instrDesc* id, insOpts instOptions) { - if ((instOptions & INS_OPTS_EVEX_b_MASK) == INS_OPTS_EVEX_eb_er_rd) + assert(id->idHasMem()); + + if ((instOptions & INS_OPTS_EVEX_eb) != INS_OPTS_NONE) { assert(UseEvexEncoding()); - id->idSetEvexbContext(instOptions); - } - else - { - assert((instOptions & INS_OPTS_EVEX_b_MASK) == 0); + id->idSetEvexBroadcastBit(); } } +//------------------------------------------------------------------------ +// SetEvexCompressedDisplacement: set compressed displacement +// +// Arguments: +// id - instruction descriptor +void SetEvexCompressedDisplacement(instrDesc* id) +{ + assert(id->idHasMem()); + + assert(UseEvexEncoding()); + id->idSetEvexCompressedDisplacementBit(); +} + //------------------------------------------------------------------------ // SetEvexEmbMaskIfNeeded: set embedded mask if needed. // @@ -530,6 +541,25 @@ void SetEvexEmbMaskIfNeeded(instrDesc* id, insOpts instOptions) } } +//------------------------------------------------------------------------ +// SetEvexEmbRoundIfNeeded: set embedded round if needed. +// +// Arguments: +// id - instruction descriptor +// instOptions - emit options +// +void SetEvexEmbRoundIfNeeded(instrDesc* id, insOpts instOptions) +{ + assert(!id->idHasMem()); + + if ((instOptions & INS_OPTS_EVEX_b_MASK) != INS_OPTS_NONE) + { + // if EVEX.b needs to be set in this path, then it should be embedded rounding. + assert(UseEvexEncoding()); + id->idSetEvexbContext(instOptions); + } +} + //------------------------------------------------------------------------ // SetEvexNdIfNeeded: set NDD form - new data destination if needed. 
// @@ -654,7 +684,7 @@ bool hasVexOrEvexPrefix(code_t code) { return (hasVexPrefix(code) || hasEvexPrefix(code)); } -ssize_t TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspInByte); +bool TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, ssize_t* compressedDsp, bool* fitsInByte) const; //------------------------------------------------------------------------ // codeEvexMigrationCheck: Temporary check to use when adding EVEX codepaths // @@ -672,7 +702,7 @@ bool codeEvexMigrationCheck(code_t code) { return hasEvexPrefix(code); } -ssize_t GetInputSizeInBytes(instrDesc* id) const; +ssize_t GetInputSizeInBytes(const instrDesc* id) const; bool containsAVXInstruction = false; bool ContainsAVX() { @@ -1289,23 +1319,39 @@ inline bool emitIsUncondJump(instrDesc* jmp) // inline bool HasEmbeddedBroadcast(const instrDesc* id) const { - return id->idIsEvexbContextSet(); + assert(id->idHasMem()); + return (id->idGetEvexbContext() & INS_OPTS_EVEX_eb) != 0; } //------------------------------------------------------------------------ -// HasEmbeddedBroadcast: Do we consider embedded broadcast while encoding. +// HasEmbeddedMask: Do we consider embedded masking while encoding. // // Arguments: // id - Instruction descriptor. // // Returns: -// `true` if the instruction does embedded broadcast. +// `true` if the instruction does embedded masking. // inline bool HasEmbeddedMask(const instrDesc* id) const { return id->idIsEvexAaaContextSet() || id->idIsEvexZContextSet(); } +//------------------------------------------------------------------------ +// HasCompressedDisplacement: Do we consider compressed displacement while encoding. +// +// Arguments: +// id - Instruction descriptor. +// +// Returns: +// `true` if the instruction does compressed displacement. +// +inline bool HasCompressedDisplacement(const instrDesc* id) const +{ + assert(id->idHasMem()); + return (id->idGetEvexbContext() & INS_OPTS_EVEX_cd) != 0; +} + inline bool HasHighSIMDReg(const instrDesc* id) const; inline bool HasExtendedGPReg(const instrDesc* id) const; diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index 6d255a1295315c..361e9ae8eee6ed 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -95,7 +95,7 @@ static insOpts AddEmbRoundingMode(insOpts instOptions, int8_t mode) { case 0x01: { - result |= INS_OPTS_EVEX_eb_er_rd; + result |= INS_OPTS_EVEX_er_rd; break; } diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index c336b1c521d03a..5e4a775a49ecd2 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -1465,7 +1465,7 @@ insOpts CodeGen::AddEmbBroadcastMode(insOpts instOptions) { assert((instOptions & INS_OPTS_EVEX_b_MASK) == 0); unsigned result = static_cast<unsigned>(instOptions); - return static_cast<insOpts>(result | INS_OPTS_EVEX_eb_er_rd); + return static_cast<insOpts>(result | INS_OPTS_EVEX_eb); } #endif // TARGET_XARCH && FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h index d3a5b5385a3e9d..eb081a73bee90a 100644 --- a/src/coreclr/jit/instr.h +++ b/src/coreclr/jit/instr.h @@ -248,13 +248,17 @@ enum insOpts: unsigned INS_OPTS_NONE = 0, // Two-bits: 0b0000_0011 - INS_OPTS_EVEX_b_MASK = 0x03, // mask for EVEX.b related features. + INS_OPTS_EVEX_b_MASK = 0x03, // mask for EVEX.b related features. 
- INS_OPTS_EVEX_eb_er_rd = 1, // Embedded Broadcast or Round down + INS_OPTS_EVEX_eb = 1, // Embedded broadcast - INS_OPTS_EVEX_er_ru = 2, // Round up + INS_OPTS_EVEX_cd = 2, // Compressed displacement - INS_OPTS_EVEX_er_rz = 3, // Round towards zero + INS_OPTS_EVEX_er_rd = 1, // Embedded round down + + INS_OPTS_EVEX_er_ru = 2, // Embedded round up + + INS_OPTS_EVEX_er_rz = 3, // Embedded round towards zero // Three-bits: 0b0001_1100 INS_OPTS_EVEX_aaa_MASK = 0x1C, // mask for EVEX.aaa related features diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index d395369ffc1b07..088c8b981968be 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -471,13 +471,13 @@ INST3(roundsd, "vroundsd", IUM_WR, BAD_CODE, BAD_CODE, INST3(roundss, "vroundss", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0A), 8C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Round scalar single precision floating-point values // Instructions for AESNI, PCLMULQDQ -INST3(aesdec, "vaesdec", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDE), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform one round of an AES decryption flow -INST3(aesdeclast, "vaesdeclast", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDF), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform last round of an AES decryption flow -INST3(aesenc, "vaesenc", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDC), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform one round of an AES encryption flow -INST3(aesenclast, "vaesenclast", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDD), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform last round of an AES encryption flow +INST3(aesdec, "vaesdec", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDE), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform one round of an AES decryption flow +INST3(aesdeclast, "vaesdeclast", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDF), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform last round of an AES decryption flow +INST3(aesenc, "vaesenc", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDC), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform one round of an AES encryption flow +INST3(aesenclast, "vaesenclast", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDD), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform last round of an AES encryption flow INST3(aesimc, "vaesimc", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDB), 8C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX) // Perform the AES InvMixColumn Transformation INST3(aeskeygenassist, "vaeskeygenassist", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0xDF), 7C, 13C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX) // AES Round Key Generation Assist -INST3(pclmulqdq, "vpclmulqdq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x44), 7C, 1C, INS_TT_FULL_MEM, KMask_Base1 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform a carry-less multiplication of two quadwords +INST3(pclmulqdq, "vpclmulqdq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x44), 7C, 1C, INS_TT_FULL_MEM, KMask_Base1 | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform a 
carry-less multiplication of two quadwords // Instructions for SHA INST3(sha1msg1, "sha1msg1", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xC9), ILLEGAL, ILLEGAL, INS_TT_FULL, REX_WIG) // Perform an Intermediate Calculation for the Next Four SHA1 Message Dwords @@ -613,27 +613,27 @@ INST3(vfnmsub231ss, "vfnmsub231ss", IUM_RW, BAD_CODE, BAD_CODE, #define FIRST_BMI_INSTRUCTION INS_andn // Instructions for BMI1, BMI2 -INST3(andn, "andn", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF2), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF | INS_Flags_Has_NF) // Logical AND NOT -INST3(bextr, "bextr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), 2C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF | INS_Flags_Has_NF) // Bit Field Extract -INST3(blsi, "blsi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Extract Lowest Set Isolated Bit -INST3(blsmsk, "blsmsk", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Resets_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Get Mask Up to Lowest Set Bit -INST3(blsr, "blsr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Reset Lowest Set Bit -INST3(bzhi, "bzhi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Zero High Bits Starting with Specified Bit Position -INST3(mulx, "mulx", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF6), 4C, 1C, INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Unsigned Multiply Without Affecting Flags -INST3(pdep, "pdep", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), 3C, 1C, INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Deposit -INST3(pext, "pext", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), 3C, 1C, INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Extract -INST3(rorx, "rorx", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0xF0), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX) -INST3(sarx, "sarx", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0xF7), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Arithmetic Right Without Affecting Flags -INST3(shlx, "shlx", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Left Without Affecting Flags -INST3(shrx, "shrx", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF2, 0xF7), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Right Without Affecting Flags +INST3(andn, "andn", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF2), 1C, 2X, INS_TT_NONE, REX_WX | 
Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF | INS_Flags_Has_NF) // Logical AND NOT +INST3(bextr, "bextr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), 2C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF | INS_Flags_Has_NF) // Bit Field Extract +INST3(blsi, "blsi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Extract Lowest Set Isolated Bit +INST3(blsmsk, "blsmsk", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Resets_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Get Mask Up to Lowest Set Bit +INST3(blsr, "blsr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Reset Lowest Set Bit +INST3(bzhi, "bzhi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Zero High Bits Starting with Specified Bit Position +INST3(mulx, "mulx", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF6), 4C, 1C, INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Unsigned Multiply Without Affecting Flags +INST3(pdep, "pdep", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), 3C, 1C, INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Deposit +INST3(pext, "pext", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), 3C, 1C, INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Extract +INST3(rorx, "rorx", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0xF0), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX) +INST3(sarx, "sarx", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0xF7), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Arithmetic Right Without Affecting Flags +INST3(shlx, "shlx", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Left Without Affecting Flags +INST3(shrx, "shrx", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF2, 0xF7), 1C, 2X, INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Right Without Affecting Flags #define LAST_BMI_INSTRUCTION INS_shrx #define FIRST_AVXVNNI_INSTRUCTION INS_vpdpbusd // Instructions for AVXVNNI -INST3(vpdpbusd, "vpdpbusd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x50), 5C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Unsigned and Signed Bytes -INST3(vpdpbusds, "vpdpbusds", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x51), 5C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Unsigned and Signed Bytes with Saturation -INST3(vpdpwssd, "vpdpwssd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x52), 5C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_VEX | Encoding_EVEX | 
INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Signed Word Integers -INST3(vpdpwssds, "vpdpwssds", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x53), 5C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Signed Word Integers with Saturation +INST3(vpdpbusd, "vpdpbusd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x50), 5C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Unsigned and Signed Bytes +INST3(vpdpbusds, "vpdpbusds", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x51), 5C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Unsigned and Signed Bytes with Saturation +INST3(vpdpwssd, "vpdpwssd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x52), 5C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Signed Word Integers +INST3(vpdpwssds, "vpdpwssds", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x53), 5C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Signed Word Integers with Saturation #define LAST_AVXVNNI_INSTRUCTION INS_vpdpwssds #define FIRST_AVXVNNIINT8_INSTRUCTION INS_vpdpwsud @@ -656,8 +656,8 @@ INST3(vpdpbuuds, "vpdpbuuds", IUM_WR, BAD_CODE, BAD_ #define FIRST_AVXIFMA_INSTRUCTION INS_vpmadd52huq // Instructions for AVXIFMA -INST3(vpmadd52huq, "vpmadd52huq", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xB5), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Multiply of Unsigned 52-Bit Unsigned Integers and Add High 52-Bit Products to 64-Bit Accumulators -INST3(vpmadd52luq, "vpmadd52luq", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xB4), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Multiply of Unsigned 52-Bit Integers and Add the Low 52-Bit Products to Qword Accumulators +INST3(vpmadd52huq, "vpmadd52huq", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xB5), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Multiply of Unsigned 52-Bit Unsigned Integers and Add High 52-Bit Products to 64-Bit Accumulators +INST3(vpmadd52luq, "vpmadd52luq", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xB4), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Multiply of Unsigned 52-Bit Integers and Add the Low 52-Bit Products to Qword Accumulators #define LAST_AVXIFMA_INSTRUCTION INS_vpmadd52luq #define LAST_AVX_INSTRUCTION INS_vpmadd52luq
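Reviewer note: the disp8*N rule that the rewritten TryEvexCompressDisp8Byte implements can be hard to follow inside the diff, so here is a minimal, standalone C++ sketch of just the compression check. The names (tryCompressDisp8, scale) are illustrative and not part of the JIT; the real code derives the scale N from the instruction's tuple type, input size, vector length, and embedded-broadcast state.

#include <cstdint>
#include <cstdio>

// EVEX compressed displacement: a memory displacement is encodable in a
// single byte when it is an exact multiple of the tuple-derived scale N
// and the quotient dsp / N fits in a signed 8-bit value.
static bool tryCompressDisp8(int64_t dsp, int64_t scale, int64_t* compressed)
{
    if ((dsp % scale) != 0)
    {
        return false; // not a multiple of N; the full disp32 form is required
    }

    int64_t quotient = dsp / scale;

    if (static_cast<int8_t>(quotient) != quotient)
    {
        return false; // quotient is outside [-128, 127]
    }

    *compressed = quotient;
    return true;
}

int main()
{
    int64_t c = 0;
    printf("%d\n", tryCompressDisp8(0x2000, 64, &c)); // 0: 8192 / 64 == 128 does not fit in a signed byte
    printf("%d\n", tryCompressDisp8(0x0800, 64, &c)); // 1: 2048 / 64 == 32 is the encoded disp8
    return 0;
}

This mirrors the (dsp % disp8Compression) and static_cast<signed char> checks at the end of the rewritten function; the early dsp / 64 test in the diff is a fast reject, since 64 bytes (a full ZMM vector) is the largest scale any tuple type can produce.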