From a1e87231a109577d1f59b9a2b9ba042d064a8202 Mon Sep 17 00:00:00 2001
From: Tanner Gooding
Date: Tue, 15 Jul 2025 10:36:55 -0700
Subject: [PATCH 1/4] Carry ExtractMostSignificantBits through to LIR and add
 constant folding support

---
 src/coreclr/jit/gentree.cpp            |  70 ++++++
 src/coreclr/jit/hwintrinsicarm64.cpp   | 162 +-----------
 src/coreclr/jit/hwintrinsiclistarm64.h |   4 +-
 src/coreclr/jit/hwintrinsiclistxarch.h |   4 +-
 src/coreclr/jit/hwintrinsicxarch.cpp   |  68 +----
 src/coreclr/jit/rationalize.cpp        | 335 +++++++++++++++++++++++++
 src/coreclr/jit/rationalize.h          |   2 +
 src/coreclr/jit/simd.h                 |  62 +++++
 src/coreclr/jit/valuenum.cpp           |  74 ++++++
 9 files changed, 559 insertions(+), 222 deletions(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index bd79d25b39cc1d..9324401acd4149 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -32593,6 +32593,76 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
 {
     switch (ni)
     {
+#if defined(TARGET_ARM64)
+        case NI_Vector64_ExtractMostSignificantBits:
+#elif defined(TARGET_XARCH)
+        case NI_Vector256_ExtractMostSignificantBits:
+        case NI_X86Base_MoveMask:
+        case NI_AVX_MoveMask:
+        case NI_AVX2_MoveMask:
+#endif
+        case NI_Vector128_ExtractMostSignificantBits:
+        {
+            simdmask_t simdMaskVal;
+
+            switch (simdSize)
+            {
+                case 8:
+                {
+                    EvaluateExtractMSB(simdBaseType, &simdMaskVal, cnsNode->AsVecCon()->gtSimd8Val);
+                    break;
+                }
+
+                case 16:
+                {
+                    EvaluateExtractMSB(simdBaseType, &simdMaskVal, cnsNode->AsVecCon()->gtSimd16Val);
+                    break;
+                }
+
+#if defined(TARGET_XARCH)
+                case 32:
+                {
+                    EvaluateExtractMSB(simdBaseType, &simdMaskVal, cnsNode->AsVecCon()->gtSimd32Val);
+                    break;
+                }
+
+                case 64:
+                {
+                    EvaluateExtractMSB(simdBaseType, &simdMaskVal, cnsNode->AsVecCon()->gtSimd64Val);
+                    break;
+                }
+#endif // TARGET_XARCH
+
+                default:
+                {
+                    unreached();
+                }
+            }
+
+            uint64_t mask;
+            memcpy(&mask, &simdMaskVal.u64[0], sizeof(uint64_t));
+
+            uint32_t elemCount = simdSize / genTypeSize(simdBaseType);
+            uint64_t bitMask   = static_cast<uint64_t>((static_cast<uint64_t>(1) << elemCount) - 1);
+
+            resultNode = gtNewIconNode(static_cast<int32_t>(mask & bitMask), retType);
+            break;
+        }
+
+#ifdef TARGET_XARCH
+        case NI_AVX512_MoveMask:
+        {
+            uint64_t mask;
+            memcpy(&mask, &cnsNode->AsMskCon()->gtSimdMaskVal.u64[0], sizeof(uint64_t));
+
+            uint32_t elemCount = simdSize / genTypeSize(simdBaseType);
+            uint64_t bitMask   = static_cast<uint64_t>((static_cast<uint64_t>(1) << elemCount) - 1);
+
+            resultNode = gtNewIconNode(static_cast<int32_t>(mask & bitMask), retType);
+            break;
+        }
+#endif // TARGET_XARCH
+
 #ifdef TARGET_ARM64
         case NI_ArmBase_LeadingZeroCount:
 #else
diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp
index f636ca7c0b0c57..8f7d979fdc1dd3 100644
--- a/src/coreclr/jit/hwintrinsicarm64.cpp
+++ b/src/coreclr/jit/hwintrinsicarm64.cpp
@@ -1346,166 +1346,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
         case NI_Vector128_ExtractMostSignificantBits:
         {
             assert(sig->numArgs == 1);
-
-            // ARM64 doesn't have a single instruction that performs the behavior so we'll emulate it instead.
-            // To do this, we effectively perform the following steps:
-            // 1. tmp = input & 0x80         ; and the input to clear all but the most significant bit
-            // 2. tmp = tmp >> index         ; right shift each element by its index
-            // 3. tmp = sum(tmp)             ; sum the elements together
-
-            // For byte/sbyte, we also need to handle the fact that we can only shift by up to 8
-            // but for Vector128, we have 16 elements to handle. 
In that scenario, we will simply - // extract both scalars, and combine them via: (upper << 8) | lower - - var_types simdType = getSIMDTypeForSize(simdSize); - - op1 = impSIMDPopStack(); - - GenTreeVecCon* vecCon2 = gtNewVconNode(simdType); - GenTreeVecCon* vecCon3 = gtNewVconNode(simdType); - - switch (simdBaseType) - { - case TYP_BYTE: - case TYP_UBYTE: - { - simdBaseType = TYP_UBYTE; - simdBaseJitType = CORINFO_TYPE_UBYTE; - - vecCon2->gtSimdVal.u64[0] = 0x8080808080808080; - vecCon3->gtSimdVal.u64[0] = 0x00FFFEFDFCFBFAF9; - - if (simdSize == 16) - { - vecCon2->gtSimdVal.u64[1] = 0x8080808080808080; - vecCon3->gtSimdVal.u64[1] = 0x00FFFEFDFCFBFAF9; - } - break; - } - - case TYP_SHORT: - case TYP_USHORT: - { - simdBaseType = TYP_USHORT; - simdBaseJitType = CORINFO_TYPE_USHORT; - - vecCon2->gtSimdVal.u64[0] = 0x8000800080008000; - vecCon3->gtSimdVal.u64[0] = 0xFFF4FFF3FFF2FFF1; - - if (simdSize == 16) - { - vecCon2->gtSimdVal.u64[1] = 0x8000800080008000; - vecCon3->gtSimdVal.u64[1] = 0xFFF8FFF7FFF6FFF5; - } - break; - } - - case TYP_INT: - case TYP_UINT: - case TYP_FLOAT: - { - simdBaseType = TYP_INT; - simdBaseJitType = CORINFO_TYPE_INT; - - vecCon2->gtSimdVal.u64[0] = 0x8000000080000000; - vecCon3->gtSimdVal.u64[0] = 0xFFFFFFE2FFFFFFE1; - - if (simdSize == 16) - { - vecCon2->gtSimdVal.u64[1] = 0x8000000080000000; - vecCon3->gtSimdVal.u64[1] = 0xFFFFFFE4FFFFFFE3; - } - break; - } - - case TYP_LONG: - case TYP_ULONG: - case TYP_DOUBLE: - { - simdBaseType = TYP_LONG; - simdBaseJitType = CORINFO_TYPE_LONG; - - vecCon2->gtSimdVal.u64[0] = 0x8000000000000000; - vecCon3->gtSimdVal.u64[0] = 0xFFFFFFFFFFFFFFC1; - - if (simdSize == 16) - { - vecCon2->gtSimdVal.u64[1] = 0x8000000000000000; - vecCon3->gtSimdVal.u64[1] = 0xFFFFFFFFFFFFFFC2; - } - break; - } - - default: - { - unreached(); - } - } - - op3 = vecCon3; - op2 = vecCon2; - op1 = gtNewSimdHWIntrinsicNode(simdType, op1, op2, NI_AdvSimd_And, simdBaseJitType, simdSize); - - NamedIntrinsic shiftIntrinsic = NI_AdvSimd_ShiftLogical; - - if ((simdSize == 8) && varTypeIsLong(simdBaseType)) - { - shiftIntrinsic = NI_AdvSimd_ShiftLogicalScalar; - } - - op1 = gtNewSimdHWIntrinsicNode(simdType, op1, op3, shiftIntrinsic, simdBaseJitType, simdSize); - - if (varTypeIsByte(simdBaseType) && (simdSize == 16)) - { - op1 = impCloneExpr(op1, &op2, CHECK_SPILL_ALL, - nullptr DEBUGARG("Clone op1 for vector extractmostsignificantbits")); - - op1 = gtNewSimdGetLowerNode(TYP_SIMD8, op1, simdBaseJitType, simdSize); - op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_AddAcross, simdBaseJitType, 8); - op1 = gtNewSimdToScalarNode(genActualType(simdBaseType), op1, simdBaseJitType, 8); - op1 = gtNewCastNode(TYP_INT, op1, /* isUnsigned */ true, TYP_INT); - - GenTree* zero = gtNewZeroConNode(TYP_SIMD16); - ssize_t index = 8 / genTypeSize(simdBaseType); - - op2 = gtNewSimdGetUpperNode(TYP_SIMD8, op2, simdBaseJitType, simdSize); - op2 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op2, NI_AdvSimd_Arm64_AddAcross, simdBaseJitType, 8); - op2 = gtNewSimdToScalarNode(genActualType(simdBaseType), op2, simdBaseJitType, 8); - op2 = gtNewCastNode(TYP_INT, op2, /* isUnsigned */ true, TYP_INT); - - op2 = gtNewOperNode(GT_LSH, TYP_INT, op2, gtNewIconNode(8)); - retNode = gtNewOperNode(GT_OR, TYP_INT, op1, op2); - } - else - { - if (!varTypeIsLong(simdBaseType)) - { - if ((simdSize == 8) && ((simdBaseType == TYP_INT) || (simdBaseType == TYP_UINT))) - { - op1 = impCloneExpr(op1, &op2, CHECK_SPILL_ALL, - nullptr DEBUGARG("Clone op1 for vector extractmostsignificantbits")); - op1 = 
gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, op2, NI_AdvSimd_AddPairwise, simdBaseJitType, - simdSize); - } - else - { - op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_AddAcross, simdBaseJitType, - simdSize); - } - } - else if (simdSize == 16) - { - op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_AddPairwiseScalar, simdBaseJitType, - simdSize); - } - - retNode = gtNewSimdToScalarNode(genActualType(simdBaseType), op1, simdBaseJitType, 8); - - if ((simdBaseType != TYP_INT) && (simdBaseType != TYP_UINT)) - { - retNode = gtNewCastNode(TYP_INT, retNode, /* isUnsigned */ true, TYP_INT); - } - } + op1 = impSIMDPopStack(); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize); break; } diff --git a/src/coreclr/jit/hwintrinsiclistarm64.h b/src/coreclr/jit/hwintrinsiclistarm64.h index 25433249351660..ec721f567c50f3 100644 --- a/src/coreclr/jit/hwintrinsiclistarm64.h +++ b/src/coreclr/jit/hwintrinsiclistarm64.h @@ -51,7 +51,7 @@ HARDWARE_INTRINSIC(Vector64, CreateSequence, HARDWARE_INTRINSIC(Vector64, Dot, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector64, Equals, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(Vector64, EqualsAny, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(Vector64, ExtractMostSignificantBits, 8, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector64, ExtractMostSignificantBits, 8, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector64, Floor, 8, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(Vector64, FusedMultiplyAdd, 8, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(Vector64, GetElement, 8, 2, {INS_smov, INS_umov, INS_smov, INS_umov, INS_smov, INS_umov, INS_umov, INS_umov, INS_dup, INS_dup}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SupportsContainment) @@ -182,7 +182,7 @@ HARDWARE_INTRINSIC(Vector128, CreateSequence, HARDWARE_INTRINSIC(Vector128, Dot, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector128, Equals, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, 
HW_Category_Helper, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(Vector128, EqualsAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(Vector128, ExtractMostSignificantBits, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector128, ExtractMostSignificantBits, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, Floor, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(Vector128, FusedMultiplyAdd, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(Vector128, GetElement, 16, 2, {INS_smov, INS_umov, INS_smov, INS_umov, INS_smov, INS_umov, INS_umov, INS_umov, INS_dup, INS_dup}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SupportsContainment) diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 7786d481a11c12..be680becff0b52 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -69,7 +69,7 @@ HARDWARE_INTRINSIC(Vector128, CreateSequence, HARDWARE_INTRINSIC(Vector128, Dot, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector128, Equals, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(Vector128, EqualsAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(Vector128, ExtractMostSignificantBits, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector128, ExtractMostSignificantBits, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, Floor, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(Vector128, FusedMultiplyAdd, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(Vector128, GetElement, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_extractps, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg) @@ -197,7 +197,7 @@ HARDWARE_INTRINSIC(Vector256, CreateSequence, HARDWARE_INTRINSIC(Vector256, Dot, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, Equals, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(Vector256, EqualsAny, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(Vector256, ExtractMostSignificantBits, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector256, ExtractMostSignificantBits, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, Floor, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, FusedMultiplyAdd, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(Vector256, GetElement, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible) diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index a333b87bbffc23..c75ff489b211da 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -2432,64 +2432,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case TYP_SHORT: case TYP_USHORT: { - simd_t simdVal = {}; - - assert((simdSize == 16) || (simdSize == 32) || (simdSize == 64)); - simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? 
CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE;
-
-                // We want to tightly pack the most significant byte of each short/ushort
-                // and then zero the tightly packed least significant bytes
-                //
-                // The most significant bit being set means zero the value
-
-                simdVal.u64[0] = 0x0F0D0B0907050301;
-                simdVal.u64[1] = 0x8080808080808080;
-
-                if (simdSize == 32)
-                {
-                    // Vector256 works on 2x128-bit lanes, so repeat the same indices for the upper lane
-
-                    simdVal.u64[2] = 0x0F0D0B0907050301;
-                    simdVal.u64[3] = 0x8080808080808080;
-
-                    shuffleIntrinsic  = NI_AVX2_Shuffle;
-                    moveMaskIntrinsic = NI_X86Base_MoveMask;
-                }
-                else if (compOpportunisticallyDependsOn(InstructionSet_SSE42))
-                {
-                    shuffleIntrinsic  = NI_SSE42_Shuffle;
-                    moveMaskIntrinsic = NI_X86Base_MoveMask;
-                }
-                else
-                {
-                    return nullptr;
-                }
-
-                op2 = gtNewVconNode(simdType);
-                memcpy(&op2->AsVecCon()->gtSimdVal, &simdVal, simdSize);
-
-                op1 = impSIMDPopStack();
-                op1 = gtNewSimdHWIntrinsicNode(simdType, op1, op2, shuffleIntrinsic, simdBaseJitType, simdSize);
-
-                if (simdSize == 32)
-                {
-                    CorInfoType simdOtherJitType;
-
-                    // Since Vector256 is 2x128-bit lanes we need a full width permutation so we get the lower
-                    // 64-bits of each lane next to eachother. The upper bits should be zero, but also don't
-                    // matter so we can also then simplify down to a 128-bit move mask.
-
-                    simdOtherJitType = (simdBaseType == TYP_UBYTE) ? CORINFO_TYPE_ULONG : CORINFO_TYPE_LONG;
-
-                    op1 = gtNewSimdHWIntrinsicNode(simdType, op1, gtNewIconNode(0xD8), NI_AVX2_Permute4x64,
-                                                   simdOtherJitType, simdSize);
-
-                    simdType = TYP_SIMD16;
-
-                    op1 = gtNewSimdGetLowerNode(simdType, op1, simdBaseJitType, simdSize);
-
-                    simdSize = 16;
-                }
+                op1               = impSIMDPopStack();
+                moveMaskIntrinsic = intrinsic;
                 break;
             }
 
@@ -2523,6 +2467,14 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
 
             assert(op1 != nullptr);
             retNode = gtNewSimdHWIntrinsicNode(retType, op1, moveMaskIntrinsic, simdBaseJitType, simdSize);
+
+            if ((simdSize == 16) && varTypeIsShort(simdBaseType))
+            {
+                if (!compOpportunisticallyDependsOn(InstructionSet_SSE42))
+                {
+                    retNode->AsHWIntrinsic()->SetMethodHandle(this, method R2RARG(*entryPoint));
+                }
+            }
         }
         break;
     }
diff --git a/src/coreclr/jit/rationalize.cpp b/src/coreclr/jit/rationalize.cpp
index 3f1b55136d00f8..fd1c2a774aa4d9 100644
--- a/src/coreclr/jit/rationalize.cpp
+++ b/src/coreclr/jit/rationalize.cpp
@@ -418,6 +418,20 @@ void Rationalizer::RewriteHWIntrinsicAsUserCall(GenTree** use, ArrayStack<GenTree*>& parents)
+#if defined(TARGET_XARCH)
+        case NI_Vector128_ExtractMostSignificantBits:
+        {
+            if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE42))
+            {
+                // We want to keep this as is, because we'll rewrite it in post-order
+                return;
+            }
+            break;
+        }
+#endif // TARGET_XARCH
+
         default:
         {
             if (sigInfo.numArgs == 0)
@@ -612,6 +626,17 @@ void Rationalizer::RewriteHWIntrinsic(GenTree** use, Compiler::GenTreeStack& parents)
         }
 #endif // TARGET_XARCH
 
+#if defined(TARGET_ARM64)
+        case NI_Vector64_ExtractMostSignificantBits:
+#elif defined(TARGET_XARCH)
+        case NI_Vector256_ExtractMostSignificantBits:
+#endif
+        case NI_Vector128_ExtractMostSignificantBits:
+        {
+            RewriteHWIntrinsicExtractMsb(use, parents);
+            break;
+        }
+
         default:
         {
             break;
@@ -1305,6 +1330,316 @@ bool Rationalizer::ShouldRewriteToNonMaskHWIntrinsic(GenTree* node)
     return false;
 }
 #endif // TARGET_XARCH
+
+//----------------------------------------------------------------------------------------------
+// RewriteHWIntrinsicExtractMsb: Rewrites a hwintrinsic ExtractMostSignificantBits operation
+//
+// Arguments:
+//    use     - A pointer to the hwintrinsic node
+//    parents - A reference to tree walk data providing the 
context +// +void Rationalizer::RewriteHWIntrinsicExtractMsb(GenTree** use, Compiler::GenTreeStack& parents) +{ + GenTreeHWIntrinsic* node = (*use)->AsHWIntrinsic(); + + NamedIntrinsic intrinsic = node->GetHWIntrinsicId(); + CorInfoType simdBaseJitType = node->GetSimdBaseJitType(); + var_types simdBaseType = node->GetSimdBaseType(); + unsigned simdSize = node->GetSimdSize(); + var_types simdType = Compiler::getSIMDTypeForSize(simdSize); + + GenTree* op1 = node->Op(1); + +#if defined(TARGET_ARM64) + // ARM64 doesn't have a single instruction that performs the behavior so we'll emulate it instead. + // To do this, we effectively perform the following steps: + // 1. tmp = input & 0x80 ; and the input to clear all but the most significant bit + // 2. tmp = tmp >> index ; right shift each element by its index + // 3. tmp = sum(tmp) ; sum the elements together + + GenTreeVecCon* vecCon2 = comp->gtNewVconNode(simdType); + GenTreeVecCon* vecCon3 = comp->gtNewVconNode(simdType); + + switch (simdBaseType) + { + case TYP_BYTE: + case TYP_UBYTE: + { + simdBaseType = TYP_UBYTE; + simdBaseJitType = CORINFO_TYPE_UBYTE; + + vecCon2->gtSimdVal.u64[0] = 0x8080808080808080; + vecCon3->gtSimdVal.u64[0] = 0x00FFFEFDFCFBFAF9; + + if (simdSize == 16) + { + vecCon2->gtSimdVal.u64[1] = 0x8080808080808080; + vecCon3->gtSimdVal.u64[1] = 0x00FFFEFDFCFBFAF9; + } + break; + } + + case TYP_SHORT: + case TYP_USHORT: + { + simdBaseType = TYP_USHORT; + simdBaseJitType = CORINFO_TYPE_USHORT; + + vecCon2->gtSimdVal.u64[0] = 0x8000800080008000; + vecCon3->gtSimdVal.u64[0] = 0xFFF4FFF3FFF2FFF1; + + if (simdSize == 16) + { + vecCon2->gtSimdVal.u64[1] = 0x8000800080008000; + vecCon3->gtSimdVal.u64[1] = 0xFFF8FFF7FFF6FFF5; + } + break; + } + + case TYP_INT: + case TYP_UINT: + case TYP_FLOAT: + { + simdBaseType = TYP_INT; + simdBaseJitType = CORINFO_TYPE_INT; + + vecCon2->gtSimdVal.u64[0] = 0x8000000080000000; + vecCon3->gtSimdVal.u64[0] = 0xFFFFFFE2FFFFFFE1; + + if (simdSize == 16) + { + vecCon2->gtSimdVal.u64[1] = 0x8000000080000000; + vecCon3->gtSimdVal.u64[1] = 0xFFFFFFE4FFFFFFE3; + } + break; + } + + case TYP_LONG: + case TYP_ULONG: + case TYP_DOUBLE: + { + simdBaseType = TYP_LONG; + simdBaseJitType = CORINFO_TYPE_LONG; + + vecCon2->gtSimdVal.u64[0] = 0x8000000000000000; + vecCon3->gtSimdVal.u64[0] = 0xFFFFFFFFFFFFFFC1; + + if (simdSize == 16) + { + vecCon2->gtSimdVal.u64[1] = 0x8000000000000000; + vecCon3->gtSimdVal.u64[1] = 0xFFFFFFFFFFFFFFC2; + } + break; + } + + default: + { + unreached(); + } + } + + BlockRange().InsertAfter(op1, vecCon2); + GenTree* tmp = comp->gtNewSimdBinOpNode(GT_AND, simdType, op1, vecCon2, simdBaseJitType, simdSize); + BlockRange().InsertAfter(vecCon2, tmp); + op1 = tmp; + + if ((simdSize == 8) && varTypeIsLong(simdBaseType)) + { + intrinsic = NI_AdvSimd_ShiftLogicalScalar; + } + else + { + intrinsic = NI_AdvSimd_ShiftLogical; + } + + BlockRange().InsertAfter(op1, vecCon3); + tmp = comp->gtNewSimdHWIntrinsicNode(simdType, op1, vecCon3, intrinsic, simdBaseJitType, simdSize); + BlockRange().InsertAfter(vecCon3, tmp); + op1 = tmp; + + if (varTypeIsByte(simdBaseType) && (simdSize == 16)) + { + // For byte/sbyte, we also need to handle the fact that we can only shift by up to 8 + // but for Vector128, we have 16 elements to handle. In that scenario, we will widen + // to ushort and combine the lower/upper halves. 
+ + LIR::Use op1Use; + LIR::Use::MakeDummyUse(BlockRange(), op1, &op1Use); + + op1Use.ReplaceWithLclVar(comp); + op1 = op1Use.Def(); + + GenTree* op2 = comp->gtClone(op1); + BlockRange().InsertAfter(op1, op2); + + tmp = comp->gtNewSimdHWIntrinsicNode(simdType, op1, NI_AdvSimd_ZeroExtendWideningUpper, simdBaseJitType, 16); + BlockRange().InsertBefore(op2, tmp); + op1 = tmp; + + GenTree* icon = comp->gtNewIconNode(8); + BlockRange().InsertBefore(op2, icon); + + tmp = comp->gtNewSimdBinOpNode(GT_LSH, simdType, op1, icon, CORINFO_TYPE_USHORT, simdSize); + BlockRange().InsertBefore(op2, tmp); + op1 = tmp; + + tmp = comp->gtNewSimdGetLowerNode(TYP_SIMD8, op2, simdBaseJitType, 16); + BlockRange().InsertAfter(op2, tmp); + op2 = tmp; + + tmp = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, NI_AdvSimd_AddWideningLower, simdBaseJitType, 8); + BlockRange().InsertAfter(op2, tmp); + op1 = tmp; + + simdBaseType = TYP_USHORT; + simdBaseJitType = CORINFO_TYPE_USHORT; + } + + // Sum the elements + + if (!varTypeIsLong(simdBaseType)) + { + if ((simdSize == 8) && ((simdBaseType == TYP_INT) || (simdBaseType == TYP_UINT))) + { + LIR::Use op1Use; + LIR::Use::MakeDummyUse(BlockRange(), op1, &op1Use); + + op1Use.ReplaceWithLclVar(comp); + op1 = op1Use.Def(); + + GenTree* op2 = comp->gtClone(op1); + BlockRange().InsertAfter(op1, op2); + + tmp = + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, op2, NI_AdvSimd_AddPairwise, simdBaseJitType, simdSize); + BlockRange().InsertAfter(op2, tmp); + op1 = tmp; + } + else + { + tmp = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_AddAcross, simdBaseJitType, simdSize); + BlockRange().InsertAfter(op1, tmp); + op1 = tmp; + } + } + else if (simdSize == 16) + { + tmp = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_AddPairwiseScalar, simdBaseJitType, + simdSize); + BlockRange().InsertAfter(op1, tmp); + op1 = tmp; + } + + if (simdSize == 8) + { + intrinsic = NI_Vector64_ToScalar; + } + else + { + intrinsic = NI_Vector128_ToScalar; + } + + node->gtType = genActualType(simdBaseType); + node->ChangeHWIntrinsicId(intrinsic); + node->SetSimdSize(8); + node->SetSimdBaseJitType(simdBaseJitType); + node->Op(1) = op1; + + if ((simdBaseType != TYP_INT) && (simdBaseType != TYP_UINT)) + { + GenTree* castNode = comp->gtNewCastNode(TYP_INT, node, /* isUnsigned */ true, TYP_INT); + BlockRange().InsertAfter(node, castNode); + + if (parents.Height() > 1) + { + parents.Top(1)->ReplaceOperand(use, castNode); + } + else + { + *use = castNode; + } + + // Adjust the parent stack + assert(parents.Top() == node); + (void)parents.Pop(); + parents.Push(castNode); + } +#elif defined(TARGET_XARCH) + NamedIntrinsic moveMaskIntrinsic = NI_Illegal; + NamedIntrinsic shuffleIntrinsic = NI_Illegal; + + simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? 
CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE;
+
+    // We want to tightly pack the most significant byte of each short/ushort
+    // and then zero the tightly packed least significant bytes
+    //
+    // The most significant bit being set means zero the value
+
+    simd_t simdVal = {};
+
+    simdVal.u64[0] = 0x0F0D0B0907050301;
+    simdVal.u64[1] = 0x8080808080808080;
+
+    if (simdSize == 32)
+    {
+        // Vector256 works on 2x128-bit lanes, so repeat the same indices for the upper lane
+
+        simdVal.u64[2] = 0x0F0D0B0907050301;
+        simdVal.u64[3] = 0x8080808080808080;
+
+        shuffleIntrinsic  = NI_AVX2_Shuffle;
+        moveMaskIntrinsic = NI_X86Base_MoveMask;
+    }
+    else
+    {
+        assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE42));
+
+        shuffleIntrinsic  = NI_SSE42_Shuffle;
+        moveMaskIntrinsic = NI_X86Base_MoveMask;
+    }
+
+    GenTree* op2 = comp->gtNewVconNode(simdType);
+    memcpy(&op2->AsVecCon()->gtSimdVal, &simdVal, simdSize);
+    BlockRange().InsertAfter(op1, op2);
+
+    GenTree* tmp = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, shuffleIntrinsic, simdBaseJitType, simdSize);
+    BlockRange().InsertAfter(op2, tmp);
+    op1 = tmp;
+
+    if (simdSize == 32)
+    {
+        CorInfoType simdOtherJitType;
+
+        // Since Vector256 is 2x128-bit lanes we need a full width permutation so we get the lower
+        // 64-bits of each lane next to eachother. The upper bits should be zero, but also don't
+        // matter so we can also then simplify down to a 128-bit move mask.
+
+        simdOtherJitType = (simdBaseType == TYP_UBYTE) ? CORINFO_TYPE_ULONG : CORINFO_TYPE_LONG;
+
+        GenTree* icon = comp->gtNewIconNode(0xD8);
+        BlockRange().InsertAfter(op1, icon);
+
+        tmp = comp->gtNewSimdHWIntrinsicNode(simdType, op1, icon, NI_AVX2_Permute4x64, simdOtherJitType, simdSize);
+        BlockRange().InsertAfter(icon, tmp);
+        op1 = tmp;
+
+        simdType = TYP_SIMD16;
+
+        tmp = comp->gtNewSimdGetLowerNode(simdType, op1, simdBaseJitType, simdSize);
+        BlockRange().InsertAfter(op1, tmp);
+        op1 = tmp;
+
+        simdSize = 16;
+    }
+
+    node->ChangeHWIntrinsicId(moveMaskIntrinsic);
+    node->SetSimdSize(simdSize);
+    node->SetSimdBaseJitType(simdBaseJitType);
+    node->Op(1) = op1;
+#else
+    unreached();
+#endif
+}
 #endif // FEATURE_HW_INTRINSICS
 
 #ifdef TARGET_ARM64
diff --git a/src/coreclr/jit/rationalize.h b/src/coreclr/jit/rationalize.h
index b09bb04da5f1f6..a4d18f25f708d3 100644
--- a/src/coreclr/jit/rationalize.h
+++ b/src/coreclr/jit/rationalize.h
@@ -63,6 +63,8 @@ class Rationalizer final : public Phase
     bool ShouldRewriteToNonMaskHWIntrinsic(GenTree* node);
 #endif // TARGET_XARCH
 
+
+    void RewriteHWIntrinsicExtractMsb(GenTree** use, Compiler::GenTreeStack& parents);
 #endif // FEATURE_HW_INTRINSICS
 
 #ifdef TARGET_ARM64
diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h
index f6da9993f90d45..afcd30925a1568 100644
--- a/src/coreclr/jit/simd.h
+++ b/src/coreclr/jit/simd.h
@@ -582,6 +582,68 @@ inline void EvaluateUnaryMask(
         }
     }
 }
+
+template <typename TSimd, typename TBase>
+inline void EvaluateExtractMSB(simdmask_t* result, const TSimd& arg0)
+{
+    uint64_t resultValue = 0;
+    uint32_t count       = sizeof(TSimd) / sizeof(TBase);
+
+    for (uint32_t i = 0; i < count; i++)
+    {
+        TBase input0;
+        memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase));
+
+        if (input0 < 0)
+        {
+            resultValue |= (static_cast<uint64_t>(1) << i);
+        }
+    }
+
+    memcpy(&result->u64[0], &resultValue, sizeof(uint64_t));
+}
+
+template <typename TSimd>
+inline void EvaluateExtractMSB(var_types baseType, simdmask_t* result, const TSimd& arg0)
+{
+    switch (baseType)
+    {
+        case TYP_BYTE:
+        case TYP_UBYTE:
+        {
+            EvaluateExtractMSB<TSimd, int8_t>(result, arg0);
+            break;
+        }
+
+        case TYP_SHORT:
+        case TYP_USHORT:
+        {
+            EvaluateExtractMSB<TSimd, int16_t>(result, arg0);
+            break;
+        }
+
+        case TYP_INT:
+        case TYP_UINT:
+        case TYP_FLOAT:
+        {
+            EvaluateExtractMSB<TSimd, int32_t>(result, arg0);
+            break;
+        }
+
+        case TYP_LONG:
+        case TYP_ULONG:
+        case TYP_DOUBLE:
+        {
+            EvaluateExtractMSB<TSimd, int64_t>(result, arg0);
+            break;
+        }
+
+        default:
+        {
+            unreached();
+        }
+    }
+}
 #endif // FEATURE_MASKED_HW_INTRINSICS
 
 template <typename TSimd>
diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp
index 6cc89a08a817cb..f0b286a6533886 100644
--- a/src/coreclr/jit/valuenum.cpp
+++ b/src/coreclr/jit/valuenum.cpp
@@ -7907,6 +7907,80 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunUnary(GenTreeHWIntrinsic* tree,
     switch (ni)
     {
+#if defined(TARGET_ARM64)
+        case NI_Vector64_ExtractMostSignificantBits:
+#elif defined(TARGET_XARCH)
+        case NI_Vector256_ExtractMostSignificantBits:
+        case NI_X86Base_MoveMask:
+        case NI_AVX_MoveMask:
+        case NI_AVX2_MoveMask:
+#endif
+        case NI_Vector128_ExtractMostSignificantBits:
+        {
+            simdmask_t simdMaskVal;
+
+            switch (simdSize)
+            {
+                case 8:
+                {
+                    simd8_t arg0 = GetConstantSimd8(arg0VN);
+                    EvaluateExtractMSB(baseType, &simdMaskVal, arg0);
+                    break;
+                }
+
+                case 16:
+                {
+                    simd16_t arg0 = GetConstantSimd16(arg0VN);
+                    EvaluateExtractMSB(baseType, &simdMaskVal, arg0);
+                    break;
+                }
+
+#if defined(TARGET_XARCH)
+                case 32:
+                {
+                    simd32_t arg0 = GetConstantSimd32(arg0VN);
+                    EvaluateExtractMSB(baseType, &simdMaskVal, arg0);
+                    break;
+                }
+
+                case 64:
+                {
+                    simd64_t arg0 = GetConstantSimd64(arg0VN);
+                    EvaluateExtractMSB(baseType, &simdMaskVal, arg0);
+                    break;
+                }
+#endif // TARGET_XARCH
+
+                default:
+                {
+                    unreached();
+                }
+            }
+
+            uint64_t mask;
+            memcpy(&mask, &simdMaskVal.u64[0], sizeof(uint64_t));
+
+            uint32_t elemCount = simdSize / genTypeSize(baseType);
+            uint64_t bitMask   = static_cast<uint64_t>((static_cast<uint64_t>(1) << elemCount) - 1);
+
+            return VNForIntCon(static_cast<int32_t>(mask & bitMask));
+        }
+
+#ifdef TARGET_XARCH
+        case NI_AVX512_MoveMask:
+        {
+            simdmask_t arg0 = GetConstantSimdMask(arg0VN);
+
+            uint64_t mask;
+            memcpy(&mask, &arg0, sizeof(uint64_t));
+
+            uint32_t elemCount = simdSize / genTypeSize(baseType);
+            uint64_t bitMask   = static_cast<uint64_t>((static_cast<uint64_t>(1) << elemCount) - 1);
+
+            return VNForIntCon(static_cast<int32_t>(mask & bitMask));
+        }
+#endif // TARGET_XARCH
+
 #ifdef TARGET_ARM64
         case NI_ArmBase_LeadingZeroCount:
 #else

From ac33bfe2807e348860134554ca5ab7d63efa48f9 Mon Sep 17 00:00:00 2001
From: Tanner Gooding
Date: Tue, 15 Jul 2025 13:16:21 -0700
Subject: [PATCH 2/4] Ensure 64-bit masks create 64-bit constants when folded

---
 src/coreclr/jit/gentree.cpp  | 22 ++++++++++++----------
 src/coreclr/jit/valuenum.cpp | 23 ++++++++++++-----------
 2 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 9324401acd4149..b66c574c1b1083 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -32625,12 +32625,6 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
                     EvaluateExtractMSB(simdBaseType, &simdMaskVal, cnsNode->AsVecCon()->gtSimd32Val);
                     break;
                 }
-
-                case 64:
-                {
-                    EvaluateExtractMSB(simdBaseType, &simdMaskVal, cnsNode->AsVecCon()->gtSimd64Val);
-                    break;
-                }
 #endif // TARGET_XARCH
 
                 default:
@@ -32639,12 +32633,13 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
                 }
             }
 
-            uint64_t mask;
-            memcpy(&mask, &simdMaskVal.u64[0], sizeof(uint64_t));
+            uint32_t mask;
+            memcpy(&mask, &simdMaskVal.u32[0], sizeof(uint32_t));
 
             uint32_t elemCount = simdSize / genTypeSize(simdBaseType);
-            uint64_t bitMask   = static_cast<uint64_t>((static_cast<uint64_t>(1) << elemCount) - 1);
+            uint32_t bitMask   = static_cast<uint32_t>((1 << elemCount) - 1);
 
+            assert(elemCount <= 32);
             resultNode = gtNewIconNode(static_cast<int32_t>(mask & bitMask), retType);
             break;
         }
@@ -32658,7 +32653,14 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
             uint32_t elemCount = simdSize / genTypeSize(simdBaseType);
             uint64_t bitMask   = static_cast<uint64_t>((static_cast<uint64_t>(1) << elemCount) - 1);
 
-            resultNode = gtNewIconNode(static_cast<int32_t>(mask & bitMask), retType);
+            if (elemCount <= 32)
+            {
+                resultNode = gtNewIconNode(static_cast<int32_t>(mask & bitMask), retType);
+            }
+            else
+            {
+                resultNode = gtNewLconNode(static_cast<int64_t>(mask & bitMask));
+            }
             break;
         }
 #endif // TARGET_XARCH
diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp
index f0b286a6533886..82c391969ce4c5 100644
--- a/src/coreclr/jit/valuenum.cpp
+++ b/src/coreclr/jit/valuenum.cpp
@@ -7942,13 +7942,6 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunUnary(GenTreeHWIntrinsic* tree,
                     EvaluateExtractMSB(baseType, &simdMaskVal, arg0);
                     break;
                 }
-
-                case 64:
-                {
-                    simd64_t arg0 = GetConstantSimd64(arg0VN);
-                    EvaluateExtractMSB(baseType, &simdMaskVal, arg0);
-                    break;
-                }
 #endif // TARGET_XARCH
 
                 default:
@@ -7957,12 +7950,13 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunUnary(GenTreeHWIntrinsic* tree,
                 }
             }
 
-            uint64_t mask;
-            memcpy(&mask, &simdMaskVal.u64[0], sizeof(uint64_t));
+            uint32_t mask;
+            memcpy(&mask, &simdMaskVal.u32[0], sizeof(uint32_t));
 
             uint32_t elemCount = simdSize / genTypeSize(baseType);
-            uint64_t bitMask   = static_cast<uint64_t>((static_cast<uint64_t>(1) << elemCount) - 1);
+            uint32_t bitMask   = static_cast<uint32_t>((1 << elemCount) - 1);
 
+            assert(elemCount <= 32);
             return VNForIntCon(static_cast<int32_t>(mask & bitMask));
         }
 
@@ -7977,7 +7971,14 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunUnary(GenTreeHWIntrinsic* tree,
             uint32_t elemCount = simdSize / genTypeSize(baseType);
             uint64_t bitMask   = static_cast<uint64_t>((static_cast<uint64_t>(1) << elemCount) - 1);
 
-            return VNForIntCon(static_cast<int32_t>(mask & bitMask));
+            if (elemCount <= 32)
+            {
+                return VNForIntCon(static_cast<int32_t>(mask & bitMask));
+            }
+            else
+            {
+                return VNForLongCon(static_cast<int64_t>(mask & bitMask));
+            }
         }
 #endif // TARGET_XARCH

From c9ad82d4fe6abc2bed9cb0bdfd0acead7acaee07 Mon Sep 17 00:00:00 2001
From: Tanner Gooding
Date: Tue, 15 Jul 2025 15:04:02 -0700
Subject: [PATCH 3/4] Handle the fact that V512.EMSB always returns TYP_LONG

---
 src/coreclr/jit/gentree.cpp  | 6 +++++-
 src/coreclr/jit/valuenum.cpp | 6 +++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index b66c574c1b1083..e1f37b1809e183 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -32639,7 +32639,9 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
             uint32_t elemCount = simdSize / genTypeSize(simdBaseType);
             uint32_t bitMask   = static_cast<uint32_t>((1 << elemCount) - 1);
 
+            assert(varTypeIsInt(retType));
             assert(elemCount <= 32);
+
             resultNode = gtNewIconNode(static_cast<int32_t>(mask & bitMask), retType);
             break;
         }
@@ -32653,12 +32655,14 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
             uint32_t elemCount = simdSize / genTypeSize(simdBaseType);
             uint64_t bitMask   = static_cast<uint64_t>((static_cast<uint64_t>(1) << elemCount) - 1);
 
-            if (elemCount <= 32)
+            if (varTypeIsInt(retType))
             {
+                assert(elemCount <= 32);
                 resultNode = gtNewIconNode(static_cast<int32_t>(mask & bitMask), retType);
             }
             else
             {
+                assert(varTypeIsLong(retType));
                 resultNode = gtNewLconNode(static_cast<int64_t>(mask & bitMask));
             }
             break;
diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp
index 82c391969ce4c5..aec6febb9876c2 100644
--- a/src/coreclr/jit/valuenum.cpp
+++ b/src/coreclr/jit/valuenum.cpp
@@ -7956,7 +7956,9 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunUnary(GenTreeHWIntrinsic* tree,
             uint32_t elemCount = simdSize / genTypeSize(baseType);
             uint32_t bitMask   = static_cast<uint32_t>((1 << elemCount) - 1);
 
+            assert(varTypeIsInt(type));
             assert(elemCount <= 32);
+
             return VNForIntCon(static_cast<int32_t>(mask & bitMask));
         }
 
@@ -7971,12 +7973,14 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunUnary(GenTreeHWIntrinsic* tree,
             uint32_t elemCount = simdSize / genTypeSize(baseType);
             uint64_t bitMask   = static_cast<uint64_t>((static_cast<uint64_t>(1) << elemCount) - 1);
 
-            if (elemCount <= 32)
+            if (varTypeIsInt(type))
             {
+                assert(elemCount <= 32);
                 return VNForIntCon(static_cast<int32_t>(mask & bitMask));
             }
             else
             {
+                assert(varTypeIsLong(type));
                 return VNForLongCon(static_cast<int64_t>(mask & bitMask));
             }
         }

From 16db5f55051fdc9b2336d8b70eccffe63c65baf6 Mon Sep 17 00:00:00 2001
From: Tanner Gooding
Date: Tue, 15 Jul 2025 19:22:56 -0700
Subject: [PATCH 4/4] Expose a GetRawBits and GetBitMask helper on simdmask_t
 to ensure we get valid data

---
 src/coreclr/jit/gentree.cpp          | 16 ++++------
 src/coreclr/jit/hwintrinsicxarch.cpp |  6 ++--
 src/coreclr/jit/simd.h               | 45 +++++++++++++++++-----------
 src/coreclr/jit/valuenum.cpp         | 16 ++++------
 4 files changed, 41 insertions(+), 42 deletions(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index e1f37b1809e183..b3ce16e0e01e4e 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -32633,37 +32633,33 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
                 }
             }
 
-            uint32_t mask;
-            memcpy(&mask, &simdMaskVal.u32[0], sizeof(uint32_t));
-
             uint32_t elemCount = simdSize / genTypeSize(simdBaseType);
-            uint32_t bitMask   = static_cast<uint32_t>((1 << elemCount) - 1);
+            uint64_t mask      = simdMaskVal.GetRawBits() & simdmask_t::GetBitMask(elemCount);
 
             assert(varTypeIsInt(retType));
             assert(elemCount <= 32);
 
-            resultNode = gtNewIconNode(static_cast<int32_t>(mask & bitMask), retType);
+            resultNode = gtNewIconNode(static_cast<int32_t>(mask));
             break;
         }
 
 #ifdef TARGET_XARCH
         case NI_AVX512_MoveMask:
         {
-            uint64_t mask;
-            memcpy(&mask, &cnsNode->AsMskCon()->gtSimdMaskVal.u64[0], sizeof(uint64_t));
+            GenTreeMskCon* mskCns = cnsNode->AsMskCon();
 
             uint32_t elemCount = simdSize / genTypeSize(simdBaseType);
-            uint64_t bitMask   = static_cast<uint64_t>((static_cast<uint64_t>(1) << elemCount) - 1);
+            uint64_t mask      = mskCns->gtSimdMaskVal.GetRawBits() & simdmask_t::GetBitMask(elemCount);
 
             if (varTypeIsInt(retType))
             {
                 assert(elemCount <= 32);
-                resultNode = gtNewIconNode(static_cast<int32_t>(mask & bitMask), retType);
+                resultNode = gtNewIconNode(static_cast<int32_t>(mask));
             }
             else
             {
                 assert(varTypeIsLong(retType));
-                resultNode = gtNewLconNode(static_cast<int64_t>(mask & bitMask));
+                resultNode = gtNewLconNode(static_cast<int64_t>(mask));
             }
             break;
         }
diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp
index c75ff489b211da..c614918b2074d5 100644
--- a/src/coreclr/jit/hwintrinsicxarch.cpp
+++ b/src/coreclr/jit/hwintrinsicxarch.cpp
@@ -2406,7 +2406,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
         {
             op1 = impSIMDPopStack();
 
-            op1     = gtNewSimdCvtVectorToMaskNode(TYP_MASK, op1, simdBaseJitType, simdSize);
+            op1     = gtFoldExpr(gtNewSimdCvtVectorToMaskNode(TYP_MASK, op1, simdBaseJitType, simdSize));
             retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize);
             break;
         }
@@ -4945,7 +4945,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
                 }
             }
             intrinsic = NI_AVX512_BlendVariableMask;
-            op3       = gtNewSimdCvtVectorToMaskNode(TYP_MASK, 
op3, simdBaseJitType, simdSize); + op3 = gtFoldExpr(gtNewSimdCvtVectorToMaskNode(TYP_MASK, op3, simdBaseJitType, simdSize)); } retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, op3, intrinsic, simdBaseJitType, simdSize); break; @@ -5483,7 +5483,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { retType = getSIMDTypeForSize(simdSize); assert(retType == getSIMDTypeForSize(getSIMDTypeSizeInBytes(sig->retTypeSigClass))); - retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + retNode = gtNewSimdCvtMaskToVectorNode(retType, gtFoldExpr(retNode), simdBaseJitType, simdSize); } else if (isMinMaxIntrinsic) { diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index afcd30925a1568..f377ec6de02520 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -56,7 +56,7 @@ struct simd8_t { simd8_t result; - result.u64[0] = 0xFFFFFFFFFFFFFFFF; + result.u64[0] = UINT64_MAX; return result; } @@ -113,9 +113,9 @@ struct simd12_t { simd12_t result; - result.u32[0] = 0xFFFFFFFF; - result.u32[1] = 0xFFFFFFFF; - result.u32[2] = 0xFFFFFFFF; + result.u32[0] = UINT32_MAX; + result.u32[1] = UINT32_MAX; + result.u32[2] = UINT32_MAX; return result; } @@ -322,7 +322,7 @@ struct simdmask_t bool operator==(const simdmask_t& other) const { - return (u64[0] == other.u64[0]); + return GetRawBits() == other.GetRawBits(); } bool operator!=(const simdmask_t& other) const @@ -330,19 +330,25 @@ struct simdmask_t return !(*this == other); } - static simdmask_t AllBitsSet(unsigned elementCount) + static uint64_t GetBitMask(uint32_t elementCount) { assert((elementCount >= 1) && (elementCount <= 64)); - simdmask_t result; if (elementCount == 64) { - result.u64[0] = 0xFFFFFFFFFFFFFFFF; + return UINT64_MAX; } else { - result.u64[0] = (1ULL << elementCount) - 1; + return (1ULL << elementCount) - 1; } + } + + static simdmask_t AllBitsSet(uint32_t elementCount) + { + simdmask_t result; + + result.u64[0] = GetBitMask(elementCount); return result; } @@ -357,6 +363,13 @@ struct simdmask_t return *this == Zero(); } + uint64_t GetRawBits() const + { + uint64_t value; + memcpy(&value, &u64[0], sizeof(uint64_t)); + return value; + } + static simdmask_t Zero() { return {}; @@ -469,7 +482,7 @@ void EvaluateUnaryMask(genTreeOps oper, bool scalar, unsigned simdSize, simdmask } assert((count == 8) || (count == 16) || (count == 32) || (count == 64)); - uint64_t bitMask = static_cast((static_cast(1) << count) - 1); + uint64_t bitMask = simdmask_t::GetBitMask(count); #elif defined(TARGET_ARM64) // For Arm64 we have count total bits to write, but they are sizeof(TBase) bits apart uint64_t bitMask; @@ -509,8 +522,7 @@ void EvaluateUnaryMask(genTreeOps oper, bool scalar, unsigned simdSize, simdmask #error Unsupported platform #endif - uint64_t arg0Value; - memcpy(&arg0Value, &arg0.u64[0], sizeof(simdmask_t)); + uint64_t arg0Value = arg0.GetRawBits(); // We're only considering these bits arg0Value &= bitMask; @@ -1121,7 +1133,7 @@ void EvaluateBinaryMask( } assert((count == 8) || (count == 16) || (count == 32) || (count == 64)); - uint64_t bitMask = static_cast((static_cast(1) << count) - 1); + uint64_t bitMask = simdmask_t::GetBitMask(count); #elif defined(TARGET_ARM64) // For Arm64 we have count total bits to write, but they are sizeof(TBase) bits apart uint64_t bitMask; @@ -1161,11 +1173,8 @@ void EvaluateBinaryMask( #error Unsupported platform #endif - uint64_t arg0Value; - memcpy(&arg0Value, &arg0.u64[0], sizeof(simdmask_t)); - - uint64_t arg1Value; - memcpy(&arg1Value, 
&arg1.u64[0], sizeof(simdmask_t));
+    uint64_t arg0Value = arg0.GetRawBits();
+    uint64_t arg1Value = arg1.GetRawBits();
 
     // We're only considering these bits
     arg0Value &= bitMask;
diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp
index aec6febb9876c2..9bbbfccf208da4 100644
--- a/src/coreclr/jit/valuenum.cpp
+++ b/src/coreclr/jit/valuenum.cpp
@@ -7950,16 +7950,13 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunUnary(GenTreeHWIntrinsic* tree,
                 }
             }
 
-            uint32_t mask;
-            memcpy(&mask, &simdMaskVal.u32[0], sizeof(uint32_t));
-
             uint32_t elemCount = simdSize / genTypeSize(baseType);
-            uint32_t bitMask   = static_cast<uint32_t>((1 << elemCount) - 1);
+            uint64_t mask      = simdMaskVal.GetRawBits() & simdmask_t::GetBitMask(elemCount);
 
             assert(varTypeIsInt(type));
             assert(elemCount <= 32);
 
-            return VNForIntCon(static_cast<int32_t>(mask & bitMask));
+            return VNForIntCon(static_cast<int32_t>(mask));
         }
 
@@ -7967,21 +7964,18 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunUnary(GenTreeHWIntrinsic* tree,
         case NI_AVX512_MoveMask:
         {
             simdmask_t arg0 = GetConstantSimdMask(arg0VN);
 
-            uint64_t mask;
-            memcpy(&mask, &arg0, sizeof(uint64_t));
-
             uint32_t elemCount = simdSize / genTypeSize(baseType);
-            uint64_t bitMask   = static_cast<uint64_t>((static_cast<uint64_t>(1) << elemCount) - 1);
+            uint64_t mask      = arg0.GetRawBits() & simdmask_t::GetBitMask(elemCount);
 
             if (varTypeIsInt(type))
             {
                 assert(elemCount <= 32);
-                return VNForIntCon(static_cast<int32_t>(mask & bitMask));
+                return VNForIntCon(static_cast<int32_t>(mask));
             }
             else
             {
                 assert(varTypeIsLong(type));
-                return VNForLongCon(static_cast<int64_t>(mask & bitMask));
+                return VNForLongCon(static_cast<int64_t>(mask));
             }
         }
 #endif // TARGET_XARCH
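
The folding the series adds is easy to model outside the JIT. Below is a minimal stand-alone C++ sketch of the same computation: test each element's most significant bit, pack one bit per element, then clamp to the element count. ExtractMsbScalar and GetBitMaskScalar are illustrative stand-ins (not part of the patches) for the JIT's EvaluateExtractMSB and simdmask_t::GetBitMask, which operate on the JIT's own simd types.

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Stand-in for EvaluateExtractMSB<TSimd, TBase>: set bit i of the result when
    // element i of the vector constant has its most significant bit set. TBase is
    // the signed form of the element type, so "element < 0" tests exactly the MSB.
    template <typename TBase>
    uint64_t ExtractMsbScalar(const uint8_t* bytes, uint32_t simdSize)
    {
        uint64_t result = 0;
        uint32_t count  = simdSize / sizeof(TBase);

        for (uint32_t i = 0; i < count; i++)
        {
            TBase element;
            memcpy(&element, bytes + (i * sizeof(TBase)), sizeof(TBase));

            if (element < 0)
            {
                result |= (static_cast<uint64_t>(1) << i);
            }
        }
        return result;
    }

    // Stand-in for simdmask_t::GetBitMask: keep only the low elementCount bits.
    uint64_t GetBitMaskScalar(uint32_t elementCount)
    {
        return (elementCount == 64) ? UINT64_MAX : ((1ULL << elementCount) - 1);
    }

    int main()
    {
        // A Vector128<sbyte> constant with the sign bit set in lanes 0, 7, and 15.
        uint8_t vec[16] = {};
        vec[0]  = 0x80;
        vec[7]  = 0xFF;
        vec[15] = 0x81;

        uint64_t mask = ExtractMsbScalar<int8_t>(vec, 16) & GetBitMaskScalar(16);
        printf("0x%llx\n", (unsigned long long)mask); // prints 0x8081 (bits 0, 7, 15)
        return 0;
    }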
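
The ARM64 emulation that patch 1 moves from import into RewriteHWIntrinsicExtractMsb relies on the per-lane shift constants in vecCon3. A scalar sketch of why those constants work for Vector64<byte>; EmulatedMoveMask is an illustrative name, and the arithmetic below mirrors AdvSimd.ShiftLogical's treatment of negative shift counts as right shifts.

    #include <cstdint>
    #include <cstdio>

    // Scalar model of the three-step ARM64 sequence for Vector64<byte>:
    //   1. AND each lane with 0x80 (vecCon2)  -> isolate the sign bit
    //   2. USHL by {-7,-6,-5,-4,-3,-2,-1,0}   -> the bytes of 0x00FFFEFDFCFBFAF9,
    //      read as signed shift counts, move lane i's sign bit to bit position i
    //   3. ADDV (AddAcross)                   -> the per-lane results are disjoint
    //      single bits, so summing them is the same as OR-ing them into one mask
    uint32_t EmulatedMoveMask(const uint8_t lanes[8])
    {
        uint32_t acc = 0;
        for (int i = 0; i < 8; i++)
        {
            uint8_t sign  = lanes[i] & 0x80;                        // step 1
            int     shift = i - 7;                                  // vecCon3 lane i
            uint8_t moved = (shift < 0) ? (uint8_t)(sign >> -shift) // negative => right
                                        : (uint8_t)(sign << shift);
            acc += moved;                                           // step 3
        }
        return acc;
    }

    int main()
    {
        uint8_t vec[8] = {0x80, 0x00, 0xFF, 0x00, 0x7F, 0x90, 0x00, 0x80};
        printf("0x%02x\n", EmulatedMoveMask(vec)); // lanes 0, 2, 5, 7 -> prints 0xa5
        return 0;
    }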
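
Similarly, the xarch short/ushort path deferred to the rationalizer packs each element's high byte with a byte shuffle before doing a byte-granular MoveMask. A stand-alone scalar sketch of that packing, assuming the 0x0F0D0B0907050301 / 0x8080808080808080 control vector used above (indices 1, 3, ..., 15 select the high bytes; the 0x80 indices zero the upper lanes); ShortMoveMask is an illustrative name, not a JIT helper.

    #include <cstdint>
    #include <cstdio>

    // Scalar model of the Vector128<short> rewrite: shuffle the high byte of each
    // of the 8 shorts into the low 8 byte lanes, zero the rest, then take the byte
    // MoveMask, which yields exactly one bit per original short element.
    uint32_t ShortMoveMask(const uint16_t lanes[8])
    {
        uint8_t packed[16] = {}; // upper 8 lanes stay zero, like the 0x80 indices

        for (int i = 0; i < 8; i++)
        {
            packed[i] = (uint8_t)(lanes[i] >> 8); // shuffle index 2*i+1: high byte
        }

        uint32_t mask = 0;
        for (int i = 0; i < 16; i++)
        {
            mask |= (uint32_t)(packed[i] >> 7) << i; // byte MoveMask: MSB per lane
        }
        return mask;
    }

    int main()
    {
        uint16_t vec[8] = {0x8000, 0x7FFF, 0xFFFF, 0x0000, 0x8001, 0x1234, 0xF000, 0x0800};
        printf("0x%02x\n", ShortMoveMask(vec)); // lanes 0, 2, 4, 6 -> prints 0x55
        return 0;
    }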