diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 28450f313219b..ea50cb24b231b 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -23322,7 +23322,7 @@ GenTree* Compiler::gtNewSimdShuffleNode( #if defined(TARGET_XARCH) uint8_t control = 0; bool crossLane = false; - bool needsZero = varTypeIsSmallInt(simdBaseType) && (simdSize != 64); + bool needsZero = varTypeIsSmallInt(simdBaseType) && (simdSize <= 16); uint64_t value = 0; simd_t vecCns = {}; simd_t mskCns = {}; @@ -23395,7 +23395,8 @@ GenTree* Compiler::gtNewSimdShuffleNode( { assert(compIsaSupportedDebugOnly(InstructionSet_AVX2)); - if (varTypeIsSmallInt(simdBaseType)) + if ((varTypeIsByte(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_AVX512VBMI_VL)) || + (varTypeIsShort(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_AVX512BW_VL))) { if (crossLane) { @@ -23448,6 +23449,31 @@ GenTree* Compiler::gtNewSimdShuffleNode( // swap the operands to match the encoding requirements retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, NI_AVX2_PermuteVar8x32, simdBaseJitType, simdSize); } + else if (elementSize == 2) + { + assert(compIsaSupportedDebugOnly(InstructionSet_AVX512BW_VL)); + for (uint32_t i = 0; i < elementCount; i++) + { + vecCns.u16[i] = (uint8_t)(vecCns.u8[i * elementSize] / elementSize); + } + + op2 = gtNewVconNode(type); + op2->AsVecCon()->gtSimdVal = vecCns; + + // swap the operands to match the encoding requirements + retNode = + gtNewSimdHWIntrinsicNode(type, op2, op1, NI_AVX512BW_VL_PermuteVar16x16, simdBaseJitType, simdSize); + } + else if (elementSize == 1) + { + assert(compIsaSupportedDebugOnly(InstructionSet_AVX512VBMI_VL)); + op2 = gtNewVconNode(type); + op2->AsVecCon()->gtSimdVal = vecCns; + + // swap the operands to match the encoding requirements + retNode = + gtNewSimdHWIntrinsicNode(type, op2, op1, NI_AVX512VBMI_VL_PermuteVar32x8, simdBaseJitType, simdSize); + } else { assert(elementSize == 8); @@ -23458,6 +23484,7 @@ GenTree* Compiler::gtNewSimdShuffleNode( } else if (simdSize == 64) { + assert(IsBaselineVector512IsaSupportedDebugOnly()); if (elementSize == 4) { for (uint32_t i = 0; i < elementCount; i++) @@ -23484,6 +23511,15 @@ GenTree* Compiler::gtNewSimdShuffleNode( // swap the operands to match the encoding requirements retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, NI_AVX512BW_PermuteVar32x16, simdBaseJitType, simdSize); } + else if (elementSize == 1) + { + assert(compIsaSupportedDebugOnly(InstructionSet_AVX512VBMI)); + op2 = gtNewVconNode(type); + op2->AsVecCon()->gtSimdVal = vecCns; + + // swap the operands to match the encoding requirements + retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, NI_AVX512VBMI_PermuteVar64x8, simdBaseJitType, simdSize); + } else { assert(elementSize == 8); diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index dadabaddbb949..1eeacdfc2bbbf 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -2620,13 +2620,15 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, if (simdSize == 32) { - if (!compExactlyDependsOn(InstructionSet_AVX2)) + if (!compOpportunisticallyDependsOn(InstructionSet_AVX2)) { // While we could accelerate some functions on hardware with only AVX support // it's likely not worth it overall given that IsHardwareAccelerated reports false break; } - else if (varTypeIsSmallInt(simdBaseType)) + else if ((varTypeIsByte(simdBaseType) && + !compOpportunisticallyDependsOn(InstructionSet_AVX512VBMI_VL)) || + (varTypeIsShort(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_AVX512BW_VL))) { bool crossLane = false; @@ -2663,7 +2665,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, } else if (simdSize == 64) { - if (varTypeIsByte(simdBaseType)) + if (varTypeIsByte(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_AVX512VBMI)) { // TYP_BYTE, TYP_UBYTE need AVX512VBMI. break; @@ -2673,7 +2675,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { assert(simdSize == 16); - if (varTypeIsSmallInt(simdBaseType) && !compExactlyDependsOn(InstructionSet_SSSE3)) + if (varTypeIsSmallInt(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_SSSE3)) { // TYP_BYTE, TYP_UBYTE, TYP_SHORT, and TYP_USHORT need SSSE3 to be able to shuffle any operation break;