From 7835f7253821a23f8e7d93badce6d2803d9b8e13 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 18 Jun 2025 13:25:01 +0100 Subject: [PATCH 01/28] Arm64 SVE: Fix conditionalselect with constant arguments Fixes #116847 When folding, allow arg1 to be a constant mask --- src/coreclr/jit/gentree.cpp | 100 +++++++++++++++++- src/coreclr/jit/gentree.h | 1 + src/coreclr/jit/simd.h | 100 +++++++++++++++++- .../JIT/opt/SVE/ConditionalSelectConstants.cs | 99 +++++++++++++++++ .../opt/SVE/ConditionalSelectConstants.csproj | 12 +++ 5 files changed, 307 insertions(+), 5 deletions(-) create mode 100644 src/tests/JIT/opt/SVE/ConditionalSelectConstants.cs create mode 100644 src/tests/JIT/opt/SVE/ConditionalSelectConstants.csproj diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 75ddbddb0fcf24..58344f07de6149 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -18608,6 +18608,84 @@ void GenTreeVecCon::EvaluateBinaryInPlace(genTreeOps oper, bool scalar, var_type } } +//------------------------------------------------------------------------ +// GenTreeVecCon::EvaluateUnaryInPlace: Evaluates this constant using the given operation, when the other +// operand is a constant mask +// +// Arguments: +// oper - the operation to use in the evaluation +// scalar - true if this is a scalar operation; otherwise, false +// baseType - the base type of the constant being checked +// other - the mask constant to use in the evaluation +// +void GenTreeVecCon::EvaluateBinaryInPlace(genTreeOps oper, bool scalar, var_types baseType, GenTreeMskCon* other) +{ + switch (gtType) + { + case TYP_SIMD8: + { + simd8_t otherSimdVal; + EvaluateSimdCvtMaskToVector(baseType, &otherSimdVal, other->gtSimdMaskVal); + + simd8_t result = {}; + EvaluateBinarySimd(oper, scalar, baseType, &result, gtSimd8Val, otherSimdVal); + gtSimd8Val = result; + break; + } + + case TYP_SIMD12: + { + simd12_t otherSimdVal; + EvaluateSimdCvtMaskToVector(baseType, &otherSimdVal, 
other->gtSimdMaskVal); + + simd12_t result = {}; + EvaluateBinarySimd(oper, scalar, baseType, &result, gtSimd12Val, otherSimdVal); + gtSimd12Val = result; + break; + } + + case TYP_SIMD16: + { + simd16_t otherSimdVal; + EvaluateSimdCvtMaskToVector(baseType, &otherSimdVal, other->gtSimdMaskVal); + + simd16_t result = {}; + EvaluateBinarySimd(oper, scalar, baseType, &result, gtSimd16Val, otherSimdVal); + gtSimd16Val = result; + break; + } + +#if defined(TARGET_XARCH) + case TYP_SIMD32: + { + simd32_t otherSimdVal; + EvaluateSimdCvtMaskToVector(baseType, &otherSimdVal, other->gtSimdMaskVal); + + simd32_t result = {}; + EvaluateBinarySimd(oper, scalar, baseType, &result, gtSimd32Val, otherSimdVal); + gtSimd32Val = result; + break; + } + + case TYP_SIMD64: + { + simd64_t otherSimdVal; + EvaluateSimdCvtMaskToVector(baseType, &otherSimdVal, other->gtSimdMaskVal); + + simd64_t result = {}; + EvaluateBinarySimd(oper, scalar, baseType, &result, gtSimd64Val, otherSimdVal); + gtSimd64Val = result; + break; + } +#endif // TARGET_XARCH + + default: + { + unreached(); + } + } +} + //------------------------------------------------------------------------ // GenTreeVecCon::EvaluateBroadcastInPlace: Evaluates this constant using a broadcast // @@ -32838,12 +32916,26 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) if (op2->IsCnsVec() && op3->IsCnsVec()) { - // op2 = op2 & op1 - op2->AsVecCon()->EvaluateBinaryInPlace(GT_AND, false, simdBaseType, op1->AsVecCon()); + if (op1->IsCnsVec()) + { + // op2 = op2 & op1 + op2->AsVecCon()->EvaluateBinaryInPlace(GT_AND, false, simdBaseType, op1->AsVecCon()); + + // op3 = op2 & ~op1 + op3->AsVecCon()->EvaluateBinaryInPlace(GT_AND_NOT, false, simdBaseType, op1->AsVecCon()); + } +#if defined(TARGET_ARM64) + else if (op1->IsCnsMsk()) + { + assert(ni == NI_Sve_ConditionalSelect); - // op3 = op2 & ~op1 - op3->AsVecCon()->EvaluateBinaryInPlace(GT_AND_NOT, false, simdBaseType, op1->AsVecCon()); + // op2 = op2 & op1 + 
op2->AsVecCon()->EvaluateBinaryInPlace(GT_AND, false, simdBaseType, op1->AsMskCon()); + // op3 = op2 & ~op1 + op3->AsVecCon()->EvaluateBinaryInPlace(GT_AND_NOT, false, simdBaseType, op1->AsMskCon()); + } +#endif // op2 = op2 | op3 op2->AsVecCon()->EvaluateBinaryInPlace(GT_OR, false, simdBaseType, op3->AsVecCon()); diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index c5d49fbacfca3a..3436b281c9bde5 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -6851,6 +6851,7 @@ struct GenTreeVecCon : public GenTree void EvaluateUnaryInPlace(genTreeOps oper, bool scalar, var_types baseType); void EvaluateBinaryInPlace(genTreeOps oper, bool scalar, var_types baseType, GenTreeVecCon* other); + void EvaluateBinaryInPlace(genTreeOps oper, bool scalar, var_types baseType, GenTreeMskCon* other); template void EvaluateBroadcastInPlace(TBase scalar) diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index 9841bdeb38c93c..9c297347f2ece6 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -1906,7 +1906,7 @@ SveMaskPattern EvaluateSimdMaskToPattern(simdmask_t arg0) memcpy(&mask, &arg0.u8[0], sizeof(uint64_t)); uint32_t finalOne = count; - // A mask pattern starts with zero of more 1s and then the rest of the mask is filled with 0s. + // A mask pattern starts with zero or more 1s and then the rest of the mask is filled with 0s. // Find an unbroken sequence of 1s. for (uint32_t i = 0; i < count; i++) @@ -1993,6 +1993,104 @@ SveMaskPattern EvaluateSimdMaskToPattern(var_types baseType, simdmask_t arg0) } } } + +template +SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) +{ + uint32_t count = sizeof(TSimd) / sizeof(TBase); + uint32_t finalOne = count; + + // A mask pattern starts with zero or more 1s and then the rest of the mask is filled with 0s. + // This pattern is extracted using the least significant bits of the vector elements. + + // Find an unbroken sequence of 1s. 
+ for (uint32_t i = 0; i < count; i++) + { + TBase input0; + memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); + + // For Arm64 we have count total bits to read, but + // they are sizeof(TBase) bits apart. We set + // depending on if the corresponding input element + // has its least significant bit set + + bool isSet = static_cast(1) << (i * sizeof(TBase)); + if (!isSet) + { + finalOne = i; + break; + } + } + + // Find an unbroken sequence of 0s. + for (uint32_t i = finalOne; i < count; i++) + { + // For Arm64 we have count total bits to read, but + // they are sizeof(TBase) bits apart. We set + // depending on if the corresponding input element + // has its least significant bit set + + bool isSet = static_cast(1) << (i * sizeof(TBase)); + if (isSet) + { + // Invalid sequence + return SveMaskPatternNone; + } + } + + if (finalOne == count) + { + return SveMaskPatternAll; + } + else if (finalOne >= SveMaskPatternVectorCount1 && finalOne <= SveMaskPatternVectorCount8) + { + return (SveMaskPattern)finalOne; + } + else + { + // TODO: Add other patterns as required. These probably won't be seen until we get + // to wider vector lengths. 
+ return SveMaskPatternNone; + } +} + +template +SveMaskPattern EvaluateSimdVectorToPattern(var_types baseType, TSimd arg0) +{ + switch (baseType) + { + case TYP_FLOAT: + case TYP_INT: + case TYP_UINT: + { + return EvaluateSimdVectorToPattern(arg0); + } + + case TYP_DOUBLE: + case TYP_LONG: + case TYP_ULONG: + { + return EvaluateSimdVectorToPattern(arg0); + } + + case TYP_BYTE: + case TYP_UBYTE: + { + return EvaluateSimdVectorToPattern(arg0); + } + + case TYP_SHORT: + case TYP_USHORT: + { + return EvaluateSimdVectorToPattern(arg0); + } + + default: + { + unreached(); + } + } +} #endif // TARGET_ARM64 #endif // FEATURE_MASKED_HW_INTRINSICS diff --git a/src/tests/JIT/opt/SVE/ConditionalSelectConstants.cs b/src/tests/JIT/opt/SVE/ConditionalSelectConstants.cs new file mode 100644 index 00000000000000..d2424e951c7dc7 --- /dev/null +++ b/src/tests/JIT/opt/SVE/ConditionalSelectConstants.cs @@ -0,0 +1,99 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System; +using System.Numerics; +using System.Collections.Generic; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; +using Xunit; + +public class ConditionalSelectConstants +{ + [MethodImpl(MethodImplOptions.NoInlining)] + [Fact] + public static int TestConditionalSelectConstants() + { + bool fail = false; + + if (Sve.IsSupported) + { + var r1 = Sve.AddAcross(ConditionalSelect1CC()); + Console.WriteLine(r1[0]); + if (r1[0] != 15) + { + fail = true; + } + + var r2 = Sve.AddAcross(ConditionalSelect1FT()); + Console.WriteLine(r2[0]); + if (r2[0] != -3) + { + fail = true; + } + + var r3 = Sve.AddAcross(ConditionalSelect16TF()); + Console.WriteLine(r3[0]); + if (r3[0] != 4080) + { + fail = true; + } + + var r4 = Sve.AddAcross(ConditionalSelect2CT()); + Console.WriteLine(r4[0]); + if (r4[0] != 16) + { + fail = true; + } + } + + if (fail) + { + return 101; + } + return 100; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector ConditionalSelect1CC() + { + return Sve.ConditionalSelect( + Sve.CreateTrueMaskInt32(SveMaskPattern.VectorCount1), + Vector.Create(3), + Vector.Create(4) + ); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector ConditionalSelect1FT() + { + return Sve.ConditionalSelect( + Sve.CreateTrueMaskInt32(SveMaskPattern.VectorCount1), + Sve.CreateFalseMaskInt32(), + Sve.CreateTrueMaskInt32() + ); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector ConditionalSelect16TF() + { + return Sve.ConditionalSelect( + Sve.CreateTrueMaskByte(SveMaskPattern.VectorCount16), + Sve.CreateTrueMaskByte(), + Sve.CreateFalseMaskByte() + ); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector ConditionalSelect2CT() + { + return Sve.ConditionalSelect( + Sve.CreateTrueMaskInt32(SveMaskPattern.VectorCount2), + Vector.Create(9), + Sve.CreateTrueMaskInt32() + ); + } + + +} \ No newline at end of file diff --git 
a/src/tests/JIT/opt/SVE/ConditionalSelectConstants.csproj b/src/tests/JIT/opt/SVE/ConditionalSelectConstants.csproj new file mode 100644 index 00000000000000..06e13807788085 --- /dev/null +++ b/src/tests/JIT/opt/SVE/ConditionalSelectConstants.csproj @@ -0,0 +1,12 @@ + + + true + True + + + + + + + + From 18340c8bdc196cdb15613b4a630614830c782027 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 23 Jun 2025 09:44:09 +0100 Subject: [PATCH 02/28] Make masked EvaluateBinaryInPlace() Arm64 only --- src/coreclr/jit/gentree.cpp | 70 +++++-------------------------------- src/coreclr/jit/gentree.h | 2 ++ 2 files changed, 10 insertions(+), 62 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 58344f07de6149..81a7a7ffd1a013 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -18608,6 +18608,7 @@ void GenTreeVecCon::EvaluateBinaryInPlace(genTreeOps oper, bool scalar, var_type } } +#if defined(TARGET_ARM64) //------------------------------------------------------------------------ // GenTreeVecCon::EvaluateUnaryInPlace: Evaluates this constant using the given operation, when the other // operand is a constant mask @@ -18620,71 +18621,16 @@ void GenTreeVecCon::EvaluateBinaryInPlace(genTreeOps oper, bool scalar, var_type // void GenTreeVecCon::EvaluateBinaryInPlace(genTreeOps oper, bool scalar, var_types baseType, GenTreeMskCon* other) { - switch (gtType) - { - case TYP_SIMD8: - { - simd8_t otherSimdVal; - EvaluateSimdCvtMaskToVector(baseType, &otherSimdVal, other->gtSimdMaskVal); - - simd8_t result = {}; - EvaluateBinarySimd(oper, scalar, baseType, &result, gtSimd8Val, otherSimdVal); - gtSimd8Val = result; - break; - } - - case TYP_SIMD12: - { - simd12_t otherSimdVal; - EvaluateSimdCvtMaskToVector(baseType, &otherSimdVal, other->gtSimdMaskVal); + assert(gtType == TYP_SIMD16); - simd12_t result = {}; - EvaluateBinarySimd(oper, scalar, baseType, &result, gtSimd12Val, otherSimdVal); - gtSimd12Val = result; - 
break; - } - - case TYP_SIMD16: - { - simd16_t otherSimdVal; - EvaluateSimdCvtMaskToVector(baseType, &otherSimdVal, other->gtSimdMaskVal); - - simd16_t result = {}; - EvaluateBinarySimd(oper, scalar, baseType, &result, gtSimd16Val, otherSimdVal); - gtSimd16Val = result; - break; - } + simd16_t otherSimdVal; + EvaluateSimdCvtMaskToVector(baseType, &otherSimdVal, other->gtSimdMaskVal); -#if defined(TARGET_XARCH) - case TYP_SIMD32: - { - simd32_t otherSimdVal; - EvaluateSimdCvtMaskToVector(baseType, &otherSimdVal, other->gtSimdMaskVal); - - simd32_t result = {}; - EvaluateBinarySimd(oper, scalar, baseType, &result, gtSimd32Val, otherSimdVal); - gtSimd32Val = result; - break; - } - - case TYP_SIMD64: - { - simd64_t otherSimdVal; - EvaluateSimdCvtMaskToVector(baseType, &otherSimdVal, other->gtSimdMaskVal); - - simd64_t result = {}; - EvaluateBinarySimd(oper, scalar, baseType, &result, gtSimd64Val, otherSimdVal); - gtSimd64Val = result; - break; - } -#endif // TARGET_XARCH - - default: - { - unreached(); - } - } + simd16_t result = {}; + EvaluateBinarySimd(oper, scalar, baseType, &result, gtSimd16Val, otherSimdVal); + gtSimd16Val = result; } +#endif // TARGET_ARM64 //------------------------------------------------------------------------ // GenTreeVecCon::EvaluateBroadcastInPlace: Evaluates this constant using a broadcast diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 3436b281c9bde5..092a8cea0e7a0e 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -6851,7 +6851,9 @@ struct GenTreeVecCon : public GenTree void EvaluateUnaryInPlace(genTreeOps oper, bool scalar, var_types baseType); void EvaluateBinaryInPlace(genTreeOps oper, bool scalar, var_types baseType, GenTreeVecCon* other); +#if defined(TARGET_ARM64) void EvaluateBinaryInPlace(genTreeOps oper, bool scalar, var_types baseType, GenTreeMskCon* other); +#endif template void EvaluateBroadcastInPlace(TBase scalar) From 0d60a5e2889906d8e10c181a9b3e1a58c8ca404e Mon Sep 17 
00:00:00 2001 From: Alan Hayward Date: Mon, 23 Jun 2025 11:32:35 +0100 Subject: [PATCH 03/28] Check significantBit in EvaluateSimdVectorToPattern() --- src/coreclr/jit/simd.h | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index 9c297347f2ece6..7c5740762971fa 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -2000,22 +2000,21 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) uint32_t count = sizeof(TSimd) / sizeof(TBase); uint32_t finalOne = count; + TBase significantBit = 1; + // A mask pattern starts with zero or more 1s and then the rest of the mask is filled with 0s. // This pattern is extracted using the least significant bits of the vector elements. + // For Arm64 we have count total bits to read, but they are sizeof(TBase) bits apart. We set + // depending on if the corresponding input element has its least significant bit set + // Find an unbroken sequence of 1s. for (uint32_t i = 0; i < count; i++) { TBase input0; memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); - // For Arm64 we have count total bits to read, but - // they are sizeof(TBase) bits apart. We set - // depending on if the corresponding input element - // has its least significant bit set - - bool isSet = static_cast(1) << (i * sizeof(TBase)); - if (!isSet) + if ((input0 & significantBit) != 0) { finalOne = i; break; @@ -2025,13 +2024,10 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) // Find an unbroken sequence of 0s. for (uint32_t i = finalOne; i < count; i++) { - // For Arm64 we have count total bits to read, but - // they are sizeof(TBase) bits apart. 
We set - // depending on if the corresponding input element - // has its least significant bit set + TBase input0; + memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); - bool isSet = static_cast(1) << (i * sizeof(TBase)); - if (isSet) + if ((input0 & significantBit) != 0) { // Invalid sequence return SveMaskPatternNone; From 0a360256887f4c46173a14b830a56e3e1d2204c1 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 23 Jun 2025 12:27:43 +0100 Subject: [PATCH 04/28] fix set checks in EvaluateSimdVectorToPattern --- src/coreclr/jit/simd.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index 7c5740762971fa..c1d47536c1dcb9 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -2014,7 +2014,8 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) TBase input0; memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); - if ((input0 & significantBit) != 0) + bool isSet = (input0 & significantBit) != 0; + if (!isSet) { finalOne = i; break; @@ -2027,7 +2028,8 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) TBase input0; memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); - if ((input0 & significantBit) != 0) + bool isSet = (input0 & significantBit) != 0; + if (isSet) { // Invalid sequence return SveMaskPatternNone; From 59107d0e489287958b009795d2c8dfafe655f1aa Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Tue, 24 Jun 2025 09:06:15 +0100 Subject: [PATCH 05/28] Use masks in EvalHWIntrinsicFunTernary() for SVE conditionalselect --- src/coreclr/jit/valuenum.cpp | 38 +++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp index d1ea9ee955e5f3..3ed13d602100a1 100644 --- a/src/coreclr/jit/valuenum.cpp +++ b/src/coreclr/jit/valuenum.cpp @@ -3935,6 +3935,7 @@ int ValueNumStore::GetConstantInt32(ValueNum argVN) result = (int)ConstantValue(argVN); break; 
#endif + default: unreached(); } @@ -7868,6 +7869,30 @@ ValueNum EvaluateSimdCvtVectorToMask(ValueNumStore* vns, var_types simdType, var return vns->VNForSimdMaskCon(result); } +#if defined(TARGET_ARM64) + +ValueNum EvaluateBinarySimdAndMask(ValueNumStore* vns, + genTreeOps oper, + bool scalar, + var_types simdType, + var_types baseType, + ValueNum arg0VN, + ValueNum arg1VNMask) +{ + assert(simdType == TYP_SIMD16); + + simd16_t arg0 = GetConstantSimd16(vns, baseType, arg0VN); + + ValueNum arg1VNSimd = EvaluateSimdCvtMaskToVector(vns, simdType, baseType, arg1VNMask); + simd16_t arg1 = GetConstantSimd16(vns, baseType, arg1VNSimd); + + simd16_t result = {}; + EvaluateBinarySimd(oper, scalar, baseType, &result, arg0, arg1); + return vns->VNForSimd16Con(result); +} + +#endif // TARGET_ARM64 + ValueNum ValueNumStore::EvalHWIntrinsicFunUnary(GenTreeHWIntrinsic* tree, VNFunc func, ValueNum arg0VN, @@ -9145,9 +9170,20 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunTernary( { // (y & x) | (z & ~x) +#if defined(TARGET_ARM64) + if (ni == NI_Sve_ConditionalSelect) + { + assert(TypeOfVN(arg0VN) == TYP_MASK); + ValueNum trueVN = + EvaluateBinarySimdAndMask(this, GT_AND, false, type, baseType, arg1VN, arg0VN); + ValueNum falseVN = + EvaluateBinarySimdAndMask(this, GT_AND_NOT, false, type, baseType, arg2VN, arg0VN); + return EvaluateBinarySimd(this, GT_OR, false, type, baseType, trueVN, falseVN); + } +#endif // TARGET_ARM64 + ValueNum trueVN = EvaluateBinarySimd(this, GT_AND, false, type, baseType, arg1VN, arg0VN); ValueNum falseVN = EvaluateBinarySimd(this, GT_AND_NOT, false, type, baseType, arg2VN, arg0VN); - return EvaluateBinarySimd(this, GT_OR, false, type, baseType, trueVN, falseVN); } } From b923b28fa28b61a43df53be13284f48ff0ac0477 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Tue, 24 Jun 2025 09:07:32 +0100 Subject: [PATCH 06/28] Check all of a vector lane when converting to mask --- src/coreclr/jit/simd.h | 35 ++++++++++++++++------------------- 1 file changed, 16 
insertions(+), 19 deletions(-) diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index c1d47536c1dcb9..a3e5b754f1fcea 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -1598,9 +1598,8 @@ void EvaluateSimdCvtVectorToMask(simdmask_t* result, TSimd arg0) uint32_t count = sizeof(TSimd) / sizeof(TBase); uint64_t mask = 0; - TBase significantBit = 1; #if defined(TARGET_XARCH) - significantBit = static_cast(1) << ((sizeof(TBase) * 8) - 1); + TBase significantBit = static_cast(1) << ((sizeof(TBase) * 8) - 1); #endif for (uint32_t i = 0; i < count; i++) @@ -1608,25 +1607,24 @@ void EvaluateSimdCvtVectorToMask(simdmask_t* result, TSimd arg0) TBase input0; memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); +#if defined(TARGET_XARCH) + // For xarch we have count sequential bits to write depending on if the + // corresponding the input element has its most significant bit set if ((input0 & significantBit) != 0) { -#if defined(TARGET_XARCH) - // For xarch we have count sequential bits to write - // depending on if the corresponding the input element - // has its most significant bit set - mask |= static_cast(1) << i; + } #elif defined(TARGET_ARM64) - // For Arm64 we have count total bits to write, but - // they are sizeof(TBase) bits apart. We set - // depending on if the corresponding input element - // has its least significant bit set - + // For Arm64 we have count total bits to write, but they are sizeof(TBase) + // bits apart. We set depending on if the corresponding input element has + // any bit set (this matches the use of cmpne in outputted assembly). 
+ if (input0 != 0) + { mask |= static_cast(1) << (i * sizeof(TBase)); + } #else - unreached(); + unreached(); #endif - } } memcpy(&result->u8[0], &mask, sizeof(uint64_t)); @@ -2000,13 +1998,12 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) uint32_t count = sizeof(TSimd) / sizeof(TBase); uint32_t finalOne = count; - TBase significantBit = 1; - // A mask pattern starts with zero or more 1s and then the rest of the mask is filled with 0s. // This pattern is extracted using the least significant bits of the vector elements. // For Arm64 we have count total bits to read, but they are sizeof(TBase) bits apart. We set - // depending on if the corresponding input element has its least significant bit set + // depending on if the corresponding input element has any bit set (this matches the use + // of cmpne in outputted assembly) // Find an unbroken sequence of 1s. for (uint32_t i = 0; i < count; i++) @@ -2014,7 +2011,7 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) TBase input0; memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); - bool isSet = (input0 & significantBit) != 0; + bool isSet = input0 != 0; if (!isSet) { finalOne = i; @@ -2028,7 +2025,7 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) TBase input0; memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); - bool isSet = (input0 & significantBit) != 0; + bool isSet = input0 != 0; if (isSet) { // Invalid sequence From c01bc22cc0ee85104a88e90f0c8ef186a1f38df5 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Tue, 24 Jun 2025 09:38:01 +0100 Subject: [PATCH 07/28] Add testing for EvalHWIntrinsicFunTernary changes --- .../JIT/opt/SVE/ConditionalSelectConstants.cs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/tests/JIT/opt/SVE/ConditionalSelectConstants.cs b/src/tests/JIT/opt/SVE/ConditionalSelectConstants.cs index d2424e951c7dc7..fa9a479395ec42 100644 --- a/src/tests/JIT/opt/SVE/ConditionalSelectConstants.cs +++ 
b/src/tests/JIT/opt/SVE/ConditionalSelectConstants.cs @@ -46,6 +46,13 @@ public static int TestConditionalSelectConstants() { fail = true; } + + var r5 = ConditionalSelectConsts(); + Console.WriteLine(r5); + if (r5 != 5) + { + fail = true; + } } if (fail) @@ -95,5 +102,12 @@ static Vector ConditionalSelect2CT() ); } - -} \ No newline at end of file + [MethodImpl(MethodImplOptions.NoInlining)] + static sbyte ConditionalSelectConsts() + { + var vec = Sve.ConditionalSelect(Vector128.CreateScalar((sbyte)49).AsVector(), + Vector128.CreateScalar((sbyte)0).AsVector(), + Vector.Create(107)); + return Sve.ConditionalExtractLastActiveElement(Vector128.CreateScalar((sbyte)0).AsVector(), 5, vec); + } +} From f9c6dd628e67ca676f87906f4dabdc103a894f40 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Tue, 24 Jun 2025 09:41:51 +0100 Subject: [PATCH 08/28] whitespace --- src/coreclr/jit/valuenum.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp index 3ed13d602100a1..51a6fc63f6c273 100644 --- a/src/coreclr/jit/valuenum.cpp +++ b/src/coreclr/jit/valuenum.cpp @@ -3935,7 +3935,6 @@ int ValueNumStore::GetConstantInt32(ValueNum argVN) result = (int)ConstantValue(argVN); break; #endif - default: unreached(); } @@ -7870,7 +7869,6 @@ ValueNum EvaluateSimdCvtVectorToMask(ValueNumStore* vns, var_types simdType, var } #if defined(TARGET_ARM64) - ValueNum EvaluateBinarySimdAndMask(ValueNumStore* vns, genTreeOps oper, bool scalar, @@ -7890,7 +7888,6 @@ ValueNum EvaluateBinarySimdAndMask(ValueNumStore* vns, EvaluateBinarySimd(oper, scalar, baseType, &result, arg0, arg1); return vns->VNForSimd16Con(result); } - #endif // TARGET_ARM64 ValueNum ValueNumStore::EvalHWIntrinsicFunUnary(GenTreeHWIntrinsic* tree, @@ -9174,16 +9171,19 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunTernary( if (ni == NI_Sve_ConditionalSelect) { assert(TypeOfVN(arg0VN) == TYP_MASK); + ValueNum trueVN = EvaluateBinarySimdAndMask(this, GT_AND, 
false, type, baseType, arg1VN, arg0VN); ValueNum falseVN = EvaluateBinarySimdAndMask(this, GT_AND_NOT, false, type, baseType, arg2VN, arg0VN); + return EvaluateBinarySimd(this, GT_OR, false, type, baseType, trueVN, falseVN); } #endif // TARGET_ARM64 ValueNum trueVN = EvaluateBinarySimd(this, GT_AND, false, type, baseType, arg1VN, arg0VN); ValueNum falseVN = EvaluateBinarySimd(this, GT_AND_NOT, false, type, baseType, arg2VN, arg0VN); + return EvaluateBinarySimd(this, GT_OR, false, type, baseType, trueVN, falseVN); } } From 802ae0d32ff85b2866f62530c32fd8db7810fa98 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Tue, 24 Jun 2025 16:51:51 +0100 Subject: [PATCH 09/28] Revert "Check all of a vector lane when converting to mask" This reverts commit b923b28fa28b61a43df53be13284f48ff0ac0477. --- src/coreclr/jit/simd.h | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index a3e5b754f1fcea..c1d47536c1dcb9 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -1598,8 +1598,9 @@ void EvaluateSimdCvtVectorToMask(simdmask_t* result, TSimd arg0) uint32_t count = sizeof(TSimd) / sizeof(TBase); uint64_t mask = 0; + TBase significantBit = 1; #if defined(TARGET_XARCH) - TBase significantBit = static_cast(1) << ((sizeof(TBase) * 8) - 1); + significantBit = static_cast(1) << ((sizeof(TBase) * 8) - 1); #endif for (uint32_t i = 0; i < count; i++) @@ -1607,24 +1608,25 @@ void EvaluateSimdCvtVectorToMask(simdmask_t* result, TSimd arg0) TBase input0; memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); -#if defined(TARGET_XARCH) - // For xarch we have count sequential bits to write depending on if the - // corresponding the input element has its most significant bit set if ((input0 & significantBit) != 0) { +#if defined(TARGET_XARCH) + // For xarch we have count sequential bits to write + // depending on if the corresponding the input element + // has its most 
significant bit set + mask |= static_cast(1) << i; - } #elif defined(TARGET_ARM64) - // For Arm64 we have count total bits to write, but they are sizeof(TBase) - // bits apart. We set depending on if the corresponding input element has - // any bit set (this matches the use of cmpne in outputted assembly). - if (input0 != 0) - { + // For Arm64 we have count total bits to write, but + // they are sizeof(TBase) bits apart. We set + // depending on if the corresponding input element + // has its least significant bit set + mask |= static_cast(1) << (i * sizeof(TBase)); - } #else - unreached(); + unreached(); #endif + } } memcpy(&result->u8[0], &mask, sizeof(uint64_t)); @@ -1998,12 +2000,13 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) uint32_t count = sizeof(TSimd) / sizeof(TBase); uint32_t finalOne = count; + TBase significantBit = 1; + // A mask pattern starts with zero or more 1s and then the rest of the mask is filled with 0s. // This pattern is extracted using the least significant bits of the vector elements. // For Arm64 we have count total bits to read, but they are sizeof(TBase) bits apart. We set - // depending on if the corresponding input element has any bit set (this matches the use - // of cmpne in outputted assembly) + // depending on if the corresponding input element has its least significant bit set // Find an unbroken sequence of 1s. 
for (uint32_t i = 0; i < count; i++) @@ -2011,7 +2014,7 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) TBase input0; memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); - bool isSet = input0 != 0; + bool isSet = (input0 & significantBit) != 0; if (!isSet) { finalOne = i; @@ -2025,7 +2028,7 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) TBase input0; memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); - bool isSet = input0 != 0; + bool isSet = (input0 & significantBit) != 0; if (isSet) { // Invalid sequence From 3c3cb8fa882332a02946039a4d3d4a8c9aeb0713 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Tue, 24 Jun 2025 18:23:39 +0100 Subject: [PATCH 10/28] rename significantBit to leastSignificantBit --- src/coreclr/jit/simd.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index c1d47536c1dcb9..aea1072defd240 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -2000,7 +2000,7 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) uint32_t count = sizeof(TSimd) / sizeof(TBase); uint32_t finalOne = count; - TBase significantBit = 1; + TBase leastSignificantBit = 1; // A mask pattern starts with zero or more 1s and then the rest of the mask is filled with 0s. // This pattern is extracted using the least significant bits of the vector elements. 
@@ -2014,7 +2014,7 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) TBase input0; memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); - bool isSet = (input0 & significantBit) != 0; + bool isSet = (input0 & leastSignificantBit) != 0; if (!isSet) { finalOne = i; @@ -2028,7 +2028,7 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) TBase input0; memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); - bool isSet = (input0 & significantBit) != 0; + bool isSet = (input0 & leastSignificantBit) != 0; if (isSet) { // Invalid sequence From c96e38c7ed6f2bfdce99f9d073d36b13c375be7f Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 25 Jun 2025 16:26:16 +0100 Subject: [PATCH 11/28] Use LSB of vector when converting from vector to mask --- src/coreclr/jit/hwintrinsiccodegenarm64.cpp | 13 +++++++++++-- src/coreclr/jit/lsraarm64.cpp | 6 ++++++ src/coreclr/jit/optimizemaskconversions.cpp | 4 ++-- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp index ca38c26ab7c845..f109f0b2ef1d89 100644 --- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp @@ -2010,10 +2010,19 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) break; case NI_Sve_ConvertVectorToMask: + { // PMOV would be ideal here, but it is in SVE2.1. 
- // Instead, use a compare: CMPNE ., /Z, ., #0 - GetEmitter()->emitIns_R_R_R_I(ins, emitSize, targetReg, op1Reg, op2Reg, 0, opt); + // + // Instead, to test if lowest bit is set, we LSL elementWidthInBits - 1 + // and then compare: CMPNE ., /Z, ., #0 + + int elementWidthInBits = 8 * genTypeSize(intrin.baseType); + regNumber ztemp = internalRegisters.GetSingle(node); + + GetEmitter()->emitIns_R_R_I(INS_sve_lsl, emitSize, ztemp, op2Reg, elementWidthInBits - 1, opt); + GetEmitter()->emitIns_R_R_R_I(ins, emitSize, targetReg, op1Reg, ztemp, 0, opt); break; + } case NI_Sve_Count16BitElements: case NI_Sve_Count32BitElements: diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 06d832d10307ba..3b304eec88a13e 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1415,6 +1415,12 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou delayFreeOp = getDelayFreeOperand(embeddedOp, /* embedded */ true); } + if (intrin.id == NI_Sve_ConvertVectorToMask) + { + // Need an extra temp to test LSB of source vector + buildInternalFloatRegisterDefForNode(intrinsicTree); + } + // Build any immediates BuildHWIntrinsicImmediate(intrinsicTree, intrin); diff --git a/src/coreclr/jit/optimizemaskconversions.cpp b/src/coreclr/jit/optimizemaskconversions.cpp index b328e884637e79..131a52a8e34db9 100644 --- a/src/coreclr/jit/optimizemaskconversions.cpp +++ b/src/coreclr/jit/optimizemaskconversions.cpp @@ -20,8 +20,8 @@ struct MaskConversionsWeight static constexpr const weight_t costOfConvertMaskToVector = 1.0; #if defined(TARGET_ARM64) - // Conversion of vector to mask is two instructions. - static constexpr const weight_t costOfConvertVectorToMask = 2.0; + // Conversion of vector to mask is three instructions. + static constexpr const weight_t costOfConvertVectorToMask = 3.0; #else // Conversion of vector to mask is one instructions. 
static constexpr const weight_t costOfConvertVectorToMask = 1.0; From 9d2cebdb7bc23a8a436fa30e7c25a8b221bcf79e Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Fri, 27 Jun 2025 09:54:01 +0100 Subject: [PATCH 12/28] Add LowerCnsMask --- src/coreclr/jit/compiler.h | 2 +- src/coreclr/jit/gentree.cpp | 8 ++-- src/coreclr/jit/lower.cpp | 5 +++ src/coreclr/jit/lower.h | 1 + src/coreclr/jit/lowerarmarch.cpp | 73 ++++++++++++++++++++++++++++++++ 5 files changed, 84 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 9629ca8725dbc9..c271c7b8643408 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -3205,7 +3205,7 @@ class Compiler unsigned simdSize); #if defined(FEATURE_MASKED_HW_INTRINSICS) - GenTree* gtNewSimdCvtVectorToMaskNode(var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize); + GenTreeHWIntrinsic* gtNewSimdCvtVectorToMaskNode(var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize); #endif // FEATURE_MASKED_HW_INTRINSICS GenTree* gtNewSimdCreateBroadcastNode( diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 81a7a7ffd1a013..012b660b0ccff3 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -21971,10 +21971,10 @@ GenTree* Compiler::gtNewSimdCvtNativeNode(var_types type, // Return Value: // The node converted to the a mask type // -GenTree* Compiler::gtNewSimdCvtVectorToMaskNode(var_types type, - GenTree* op1, - CorInfoType simdBaseJitType, - unsigned simdSize) +GenTreeHWIntrinsic* Compiler::gtNewSimdCvtVectorToMaskNode(var_types type, + GenTree* op1, + CorInfoType simdBaseJitType, + unsigned simdSize) { assert(varTypeIsMask(type)); assert(varTypeIsSIMD(op1)); diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index e1025850ef7c8f..4d49f734f36a2a 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -789,6 +789,11 @@ GenTree* Lowering::LowerNode(GenTree* 
node) LowerReturnSuspend(node); break; +#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_ARM64) + case GT_CNS_MSK: + return LowerCnsMask(node->AsMskCon()); +#endif // FEATURE_HW_INTRINSICS && TARGET_ARM64 + default: break; } diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 8add7cdc88b95c..69cb4dde4cd842 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -454,6 +454,7 @@ class Lowering final : public Phase bool IsValidConstForMovImm(GenTreeHWIntrinsic* node); void LowerHWIntrinsicFusedMultiplyAddScalar(GenTreeHWIntrinsic* node); void LowerModPow2(GenTree* node); + GenTree* LowerCnsMask(GenTreeMskCon* mask); bool TryLowerAddForPossibleContainment(GenTreeOp* node, GenTree** next); void StoreFFRValue(GenTreeHWIntrinsic* node); #endif // !TARGET_XARCH && !TARGET_ARM64 diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 1d69b329e760bd..e689fe640b89ea 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -1134,6 +1134,79 @@ void Lowering::LowerModPow2(GenTree* node) ContainCheckNode(mod); } +//------------------------------------------------------------------------ +// LowerCnsMask: Lower GT_CNS_MSK. Ensure the mask matches a known pattern. +// If not then lower to a constant vector. 
+// +// Arguments: +// mask - the node to lower +// +GenTree* Lowering::LowerCnsMask(GenTreeMskCon* mask) +{ + // Try every type until a match is found + + if (mask->IsZero()) + { + return mask->gtNext; + } + + if (EvaluateSimdMaskToPattern(TYP_BYTE, mask->gtSimdMaskVal) != SveMaskPatternNone) + { + return mask->gtNext; + } + + if (EvaluateSimdMaskToPattern(TYP_SHORT, mask->gtSimdMaskVal) != SveMaskPatternNone) + { + return mask->gtNext; + } + + if (EvaluateSimdMaskToPattern(TYP_INT, mask->gtSimdMaskVal) != SveMaskPatternNone) + { + return mask->gtNext; + } + + if (EvaluateSimdMaskToPattern(TYP_LONG, mask->gtSimdMaskVal) != SveMaskPatternNone) + { + return mask->gtNext; + } + + // Not a valid pattern, so cannot be created using ptrue/pfalse. Instead the mask will require + // loading from memory. There is no way to load to a predicate from memory using a PC relative + // address, so instead use a constant vector plus conversion to mask. + + LABELEDDISPTREERANGE("lowering cns mask to cns vector (before)", BlockRange(), mask); + + LIR::Use use; + if (!BlockRange().TryGetUse(mask, &use)) + { + unreached(); + } + assert(use.User()->OperIsHWIntrinsic()); + GenTreeHWIntrinsic *parent = use.User()->AsHWIntrinsic(); + + var_types parentBaseType = parent->GetSimdBaseType(); + CorInfoType parentSimdBaseJitType = parent->GetSimdBaseJitType(); + unsigned parentSimdSize = parent->GetSimdSize(); + assert(parent->GetSimdSize() == 16); + + GenTreeVecCon* vecCon = comp->gtNewVconNode(TYP_SIMD16); + EvaluateSimdCvtMaskToVector(parentBaseType, &vecCon->gtSimdVal, mask->gtSimdMaskVal); + BlockRange().InsertBefore(mask, vecCon); + + GenTreeHWIntrinsic *convertedVec = comp->gtNewSimdCvtVectorToMaskNode(TYP_MASK, vecCon, parentSimdBaseJitType, parentSimdSize); + BlockRange().InsertBefore(mask, convertedVec->Op(1)); + BlockRange().InsertBefore(mask, convertedVec); + + use.ReplaceWith(convertedVec); + + BlockRange().Remove(mask); + + LABELEDDISPTREERANGE("lowering cns mask to cns vector 
(after)", BlockRange(), vecCon); + + return vecCon->gtNext; +} + + const int POST_INDEXED_ADDRESSING_MAX_DISTANCE = 16; //------------------------------------------------------------------------ From 70d601df3002d1da43559a8c878190f14ad88475 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Fri, 27 Jun 2025 10:02:16 +0100 Subject: [PATCH 13/28] Add testcase --- .../JIT/opt/SVE/ConditionalSelectConstants.cs | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/tests/JIT/opt/SVE/ConditionalSelectConstants.cs b/src/tests/JIT/opt/SVE/ConditionalSelectConstants.cs index fa9a479395ec42..65799002ace45a 100644 --- a/src/tests/JIT/opt/SVE/ConditionalSelectConstants.cs +++ b/src/tests/JIT/opt/SVE/ConditionalSelectConstants.cs @@ -53,6 +53,13 @@ public static int TestConditionalSelectConstants() { fail = true; } + + var r6 = ConditionalSelectConstsNoMaskPattern(); + Console.WriteLine(r6); + if (r6 != false) + { + fail = true; + } } if (fail) @@ -102,6 +109,8 @@ static Vector ConditionalSelect2CT() ); } + // Valuenum will optimise away the ConditionalSelect. + [MethodImpl(MethodImplOptions.NoInlining)] static sbyte ConditionalSelectConsts() { @@ -110,4 +119,18 @@ static sbyte ConditionalSelectConsts() Vector.Create(107)); return Sve.ConditionalExtractLastActiveElement(Vector128.CreateScalar((sbyte)0).AsVector(), 5, vec); } + + // Valuenum will optimise away the ConditionalSelect. + // vr0 is a constant mask, but is not a known pattern. 
+ + [MethodImpl(MethodImplOptions.NoInlining)] + public static bool ConditionalSelectConstsNoMaskPattern() + { + var vr2 = Vector128.CreateScalar(5653592783208606001L).AsVector(); + var vr3 = Vector128.CreateScalar(6475288982576452694L).AsVector(); + var vr4 = Vector.Create(1); + var vr0 = Sve.ConditionalSelect(vr2, vr3, vr4); + var vr5 = Vector128.CreateScalar((long)0).AsVector(); + return Sve.TestFirstTrue(vr0, vr5); + } } From 5428e1d6a851bbfedb0098060678e31d76881060 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Fri, 27 Jun 2025 10:08:28 +0100 Subject: [PATCH 14/28] Remove EvaluateSimdMaskToPattern --- src/coreclr/jit/simd.h | 98 +----------------------------------------- 1 file changed, 1 insertion(+), 97 deletions(-) diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index aea1072defd240..9841bdeb38c93c 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -1906,7 +1906,7 @@ SveMaskPattern EvaluateSimdMaskToPattern(simdmask_t arg0) memcpy(&mask, &arg0.u8[0], sizeof(uint64_t)); uint32_t finalOne = count; - // A mask pattern starts with zero or more 1s and then the rest of the mask is filled with 0s. + // A mask pattern starts with zero or more 1s and then the rest of the mask is filled with 0s. // Find an unbroken sequence of 1s. for (uint32_t i = 0; i < count; i++) @@ -1993,102 +1993,6 @@ SveMaskPattern EvaluateSimdMaskToPattern(var_types baseType, simdmask_t arg0) } } } - -template -SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) -{ - uint32_t count = sizeof(TSimd) / sizeof(TBase); - uint32_t finalOne = count; - - TBase leastSignificantBit = 1; - - // A mask pattern starts with zero or more 1s and then the rest of the mask is filled with 0s. - // This pattern is extracted using the least significant bits of the vector elements. - - // For Arm64 we have count total bits to read, but they are sizeof(TBase) bits apart.
We set - // depending on if the corresponding input element has its least significant bit set - - // Find an unbroken sequence of 1s. - for (uint32_t i = 0; i < count; i++) - { - TBase input0; - memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); - - bool isSet = (input0 & leastSignificantBit) != 0; - if (!isSet) - { - finalOne = i; - break; - } - } - - // Find an unbroken sequence of 0s. - for (uint32_t i = finalOne; i < count; i++) - { - TBase input0; - memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); - - bool isSet = (input0 & leastSignificantBit) != 0; - if (isSet) - { - // Invalid sequence - return SveMaskPatternNone; - } - } - - if (finalOne == count) - { - return SveMaskPatternAll; - } - else if (finalOne >= SveMaskPatternVectorCount1 && finalOne <= SveMaskPatternVectorCount8) - { - return (SveMaskPattern)finalOne; - } - else - { - // TODO: Add other patterns as required. These probably won't be seen until we get - // to wider vector lengths. - return SveMaskPatternNone; - } -} - -template -SveMaskPattern EvaluateSimdVectorToPattern(var_types baseType, TSimd arg0) -{ - switch (baseType) - { - case TYP_FLOAT: - case TYP_INT: - case TYP_UINT: - { - return EvaluateSimdVectorToPattern(arg0); - } - - case TYP_DOUBLE: - case TYP_LONG: - case TYP_ULONG: - { - return EvaluateSimdVectorToPattern(arg0); - } - - case TYP_BYTE: - case TYP_UBYTE: - { - return EvaluateSimdVectorToPattern(arg0); - } - - case TYP_SHORT: - case TYP_USHORT: - { - return EvaluateSimdVectorToPattern(arg0); - } - - default: - { - unreached(); - } - } -} #endif // TARGET_ARM64 #endif // FEATURE_MASKED_HW_INTRINSICS From a2d7aea5f21c560dfbc244c8539aa3900dd1be0b Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Fri, 27 Jun 2025 10:17:56 +0100 Subject: [PATCH 15/28] Revert "Use LSB of vector when converting from vector to mask" This reverts commit c96e38c7ed6f2bfdce99f9d073d36b13c375be7f. 
--- src/coreclr/jit/hwintrinsiccodegenarm64.cpp | 13 ++----------- src/coreclr/jit/lsraarm64.cpp | 6 ------ src/coreclr/jit/optimizemaskconversions.cpp | 4 ++-- 3 files changed, 4 insertions(+), 19 deletions(-) diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp index f109f0b2ef1d89..ca38c26ab7c845 100644 --- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp @@ -2010,19 +2010,10 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) break; case NI_Sve_ConvertVectorToMask: - { // PMOV would be ideal here, but it is in SVE2.1. - // - // Instead, to test if lowest bit is set, we LSL elementWidthInBits - 1 - // and then compare: CMPNE ., /Z, ., #0 - - int elementWidthInBits = 8 * genTypeSize(intrin.baseType); - regNumber ztemp = internalRegisters.GetSingle(node); - - GetEmitter()->emitIns_R_R_I(INS_sve_lsl, emitSize, ztemp, op2Reg, elementWidthInBits - 1, opt); - GetEmitter()->emitIns_R_R_R_I(ins, emitSize, targetReg, op1Reg, ztemp, 0, opt); + // Instead, use a compare: CMPNE ., /Z, ., #0 + GetEmitter()->emitIns_R_R_R_I(ins, emitSize, targetReg, op1Reg, op2Reg, 0, opt); break; - } case NI_Sve_Count16BitElements: case NI_Sve_Count32BitElements: diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 3b304eec88a13e..06d832d10307ba 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1415,12 +1415,6 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou delayFreeOp = getDelayFreeOperand(embeddedOp, /* embedded */ true); } - if (intrin.id == NI_Sve_ConvertVectorToMask) - { - // Need an extra temp to test LSB of source vector - buildInternalFloatRegisterDefForNode(intrinsicTree); - } - // Build any immediates BuildHWIntrinsicImmediate(intrinsicTree, intrin); diff --git a/src/coreclr/jit/optimizemaskconversions.cpp b/src/coreclr/jit/optimizemaskconversions.cpp index 
131a52a8e34db9..b328e884637e79 100644 --- a/src/coreclr/jit/optimizemaskconversions.cpp +++ b/src/coreclr/jit/optimizemaskconversions.cpp @@ -20,8 +20,8 @@ struct MaskConversionsWeight static constexpr const weight_t costOfConvertMaskToVector = 1.0; #if defined(TARGET_ARM64) - // Conversion of vector to mask is three instructions. - static constexpr const weight_t costOfConvertVectorToMask = 3.0; + // Conversion of vector to mask is two instructions. + static constexpr const weight_t costOfConvertVectorToMask = 2.0; #else // Conversion of vector to mask is one instructions. static constexpr const weight_t costOfConvertVectorToMask = 1.0; From c65fd38375c451130b0b6215dc1dba637997db19 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Fri, 27 Jun 2025 10:20:59 +0100 Subject: [PATCH 16/28] formatting --- src/coreclr/jit/lower.h | 10 +++++----- src/coreclr/jit/lowerarmarch.cpp | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 69cb4dde4cd842..ad4c46117cbe2b 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -451,12 +451,12 @@ class Lowering final : public Phase GenTree* TryLowerXorOpToGetMaskUpToLowestSetBit(GenTreeOp* xorNode); void LowerBswapOp(GenTreeOp* node); #elif defined(TARGET_ARM64) - bool IsValidConstForMovImm(GenTreeHWIntrinsic* node); - void LowerHWIntrinsicFusedMultiplyAddScalar(GenTreeHWIntrinsic* node); - void LowerModPow2(GenTree* node); + bool IsValidConstForMovImm(GenTreeHWIntrinsic* node); + void LowerHWIntrinsicFusedMultiplyAddScalar(GenTreeHWIntrinsic* node); + void LowerModPow2(GenTree* node); GenTree* LowerCnsMask(GenTreeMskCon* mask); - bool TryLowerAddForPossibleContainment(GenTreeOp* node, GenTree** next); - void StoreFFRValue(GenTreeHWIntrinsic* node); + bool TryLowerAddForPossibleContainment(GenTreeOp* node, GenTree** next); + void StoreFFRValue(GenTreeHWIntrinsic* node); #endif // !TARGET_XARCH && !TARGET_ARM64 GenTree* 
InsertNewSimdCreateScalarUnsafeNode(var_types type, GenTree* op1, diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index e689fe640b89ea..a78ba5b776fe33 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -1182,18 +1182,19 @@ GenTree* Lowering::LowerCnsMask(GenTreeMskCon* mask) unreached(); } assert(use.User()->OperIsHWIntrinsic()); - GenTreeHWIntrinsic *parent = use.User()->AsHWIntrinsic(); + GenTreeHWIntrinsic* parent = use.User()->AsHWIntrinsic(); - var_types parentBaseType = parent->GetSimdBaseType(); - CorInfoType parentSimdBaseJitType = parent->GetSimdBaseJitType(); - unsigned parentSimdSize = parent->GetSimdSize(); + var_types parentBaseType = parent->GetSimdBaseType(); + CorInfoType parentSimdBaseJitType = parent->GetSimdBaseJitType(); + unsigned parentSimdSize = parent->GetSimdSize(); assert(parent->GetSimdSize() == 16); GenTreeVecCon* vecCon = comp->gtNewVconNode(TYP_SIMD16); EvaluateSimdCvtMaskToVector(parentBaseType, &vecCon->gtSimdVal, mask->gtSimdMaskVal); BlockRange().InsertBefore(mask, vecCon); - GenTreeHWIntrinsic *convertedVec = comp->gtNewSimdCvtVectorToMaskNode(TYP_MASK, vecCon, parentSimdBaseJitType, parentSimdSize); + GenTreeHWIntrinsic* convertedVec = + comp->gtNewSimdCvtVectorToMaskNode(TYP_MASK, vecCon, parentSimdBaseJitType, parentSimdSize); BlockRange().InsertBefore(mask, convertedVec->Op(1)); BlockRange().InsertBefore(mask, convertedVec); @@ -1206,7 +1207,6 @@ GenTree* Lowering::LowerCnsMask(GenTreeMskCon* mask) return vecCon->gtNext; } - const int POST_INDEXED_ADDRESSING_MAX_DISTANCE = 16; //------------------------------------------------------------------------ From f513c84879dfbbb53d345a168767f90ccc102b48 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Fri, 27 Jun 2025 11:52:20 +0100 Subject: [PATCH 17/28] fix assert check Change-Id: I7951b70aec9aaef5521e100d30737b5a4d332b38 --- src/coreclr/jit/lowerarmarch.cpp | 2 +- .../JIT/opt/SVE/ConditionalSelectConstants.cs | 
14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index a78ba5b776fe33..61b6c43ef2b8bd 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -1187,7 +1187,7 @@ GenTree* Lowering::LowerCnsMask(GenTreeMskCon* mask) var_types parentBaseType = parent->GetSimdBaseType(); CorInfoType parentSimdBaseJitType = parent->GetSimdBaseJitType(); unsigned parentSimdSize = parent->GetSimdSize(); - assert(parent->GetSimdSize() == 16); + assert(parentSimdSize == 16 || parentSimdSize == 0); GenTreeVecCon* vecCon = comp->gtNewVconNode(TYP_SIMD16); EvaluateSimdCvtMaskToVector(parentBaseType, &vecCon->gtSimdVal, mask->gtSimdMaskVal); diff --git a/src/tests/JIT/opt/SVE/ConditionalSelectConstants.cs b/src/tests/JIT/opt/SVE/ConditionalSelectConstants.cs index 65799002ace45a..78f7a31c8f2b44 100644 --- a/src/tests/JIT/opt/SVE/ConditionalSelectConstants.cs +++ b/src/tests/JIT/opt/SVE/ConditionalSelectConstants.cs @@ -60,6 +60,13 @@ public static int TestConditionalSelectConstants() { fail = true; } + + var r7 = ConditionalSelectConstsNoMaskPattern2(); + Console.WriteLine(r7); + if (r7 != 0) + { + fail = true; + } } if (fail) @@ -133,4 +140,11 @@ public static bool ConditionalSelectConstsNoMaskPattern() var vr5 = Vector128.CreateScalar((long)0).AsVector(); return Sve.TestFirstTrue(vr0, vr5); } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static int ConditionalSelectConstsNoMaskPattern2() + { + var vr0 = Sve.ConditionalSelect(Vector128.CreateScalar((short)1).AsVector(), Vector.Create(0), Vector.Create(1)); + return Sve.ConditionalExtractAfterLastActiveElement(vr0, 1, Vector.Create(0)); + } } From 3bf4d1e2bda4f9b7933bd5b69f4d2b1b83e9aea8 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 30 Jun 2025 12:04:22 +0100 Subject: [PATCH 18/28] GenTree for gtNewSimdCvtVectorToMaskNode() --- src/coreclr/jit/compiler.h | 2 +- src/coreclr/jit/gentree.cpp | 8 
++++---- src/coreclr/jit/lowerarmarch.cpp | 5 ++--- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index c271c7b8643408..9629ca8725dbc9 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -3205,7 +3205,7 @@ class Compiler unsigned simdSize); #if defined(FEATURE_MASKED_HW_INTRINSICS) - GenTreeHWIntrinsic* gtNewSimdCvtVectorToMaskNode(var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize); + GenTree* gtNewSimdCvtVectorToMaskNode(var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize); #endif // FEATURE_MASKED_HW_INTRINSICS GenTree* gtNewSimdCreateBroadcastNode( diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 012b660b0ccff3..81a7a7ffd1a013 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -21971,10 +21971,10 @@ GenTree* Compiler::gtNewSimdCvtNativeNode(var_types type, // Return Value: // The node converted to the a mask type // -GenTreeHWIntrinsic* Compiler::gtNewSimdCvtVectorToMaskNode(var_types type, - GenTree* op1, - CorInfoType simdBaseJitType, - unsigned simdSize) +GenTree* Compiler::gtNewSimdCvtVectorToMaskNode(var_types type, + GenTree* op1, + CorInfoType simdBaseJitType, + unsigned simdSize) { assert(varTypeIsMask(type)); assert(varTypeIsSIMD(op1)); diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 61b6c43ef2b8bd..ff5db075b923e7 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -1193,9 +1193,8 @@ GenTree* Lowering::LowerCnsMask(GenTreeMskCon* mask) EvaluateSimdCvtMaskToVector(parentBaseType, &vecCon->gtSimdVal, mask->gtSimdMaskVal); BlockRange().InsertBefore(mask, vecCon); - GenTreeHWIntrinsic* convertedVec = - comp->gtNewSimdCvtVectorToMaskNode(TYP_MASK, vecCon, parentSimdBaseJitType, parentSimdSize); - BlockRange().InsertBefore(mask, convertedVec->Op(1)); + GenTree* convertedVec = 
comp->gtNewSimdCvtVectorToMaskNode(TYP_MASK, vecCon, parentSimdBaseJitType, parentSimdSize); + BlockRange().InsertBefore(mask, convertedVec->AsHWIntrinsic()->Op(1)); BlockRange().InsertBefore(mask, convertedVec); use.ReplaceWith(convertedVec); From cd27a7cdb4d4cca1bf5fb2630fc7961098766179 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 30 Jun 2025 12:15:18 +0100 Subject: [PATCH 19/28] Split NI_Sve_ConditionalSelect into its own case --- src/coreclr/jit/gentree.cpp | 78 ++++++++++++++++++++++++------------- 1 file changed, 51 insertions(+), 27 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 81a7a7ffd1a013..aead971561c035 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -32820,28 +32820,65 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) case NI_Vector512_ConditionalSelect: #elif defined(TARGET_ARM64) case NI_AdvSimd_BitwiseSelect: - case NI_Sve_ConditionalSelect: #endif { assert(!varTypeIsMask(retType)); + assert(!varTypeIsMask(op1)); if (cnsNode != op1) { break; } -#if defined(TARGET_ARM64) - if (ni == NI_Sve_ConditionalSelect) + if (op1->IsVectorAllBitsSet()) { - assert(!op1->IsVectorAllBitsSet() && !op1->IsVectorZero()); + if ((op3->gtFlags & GTF_SIDE_EFFECT) != 0) + { + // op3 has side effects, this would require us to append a new statement + // to ensure that it isn't lost, which isn't safe to do from the general + // purpose handler here.
We'll recognize this and mark it in VN instead + break; + } + + // op3 has no side effects, so we can return op2 directly + return op2; } - else + + if (op1->IsVectorZero()) { - assert(!op1->IsTrueMask(simdBaseType) && !op1->IsFalseMask()); + return gtWrapWithSideEffects(op3, op2, GTF_ALL_EFFECT); } -#endif - if (op1->IsVectorAllBitsSet() || op1->IsTrueMask(simdBaseType)) + if (op2->IsCnsVec() && op3->IsCnsVec()) + { + // op2 = op2 & op1 + op2->AsVecCon()->EvaluateBinaryInPlace(GT_AND, false, simdBaseType, op1->AsVecCon()); + + // op3 = op2 & ~op1 + op3->AsVecCon()->EvaluateBinaryInPlace(GT_AND_NOT, false, simdBaseType, op1->AsVecCon()); + + // op2 = op2 | op3 + op2->AsVecCon()->EvaluateBinaryInPlace(GT_OR, false, simdBaseType, op3->AsVecCon()); + + resultNode = op2; + } + break; + } + +#if defined(TARGET_ARM64) + case NI_Sve_ConditionalSelect: + { + assert(!varTypeIsMask(retType)); + assert(varTypeIsMask(op1)); + + if (cnsNode != op1) + { + break; + } + + assert(!op1->IsVectorAllBitsSet() && !op1->IsVectorZero()); + + if (op1->IsTrueMask(simdBaseType)) { if ((op3->gtFlags & GTF_SIDE_EFFECT) != 0) { @@ -32855,33 +32892,19 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) return op2; } - if (op1->IsVectorZero() || op1->IsFalseMask()) + if (op1->IsFalseMask()) { return gtWrapWithSideEffects(op3, op2, GTF_ALL_EFFECT); } if (op2->IsCnsVec() && op3->IsCnsVec()) { - if (op1->IsCnsVec()) - { - // op2 = op2 & op1 - op2->AsVecCon()->EvaluateBinaryInPlace(GT_AND, false, simdBaseType, op1->AsVecCon()); + // op2 = op2 & op1 + op2->AsVecCon()->EvaluateBinaryInPlace(GT_AND, false, simdBaseType, op1->AsMskCon()); - // op3 = op2 & ~op1 - op3->AsVecCon()->EvaluateBinaryInPlace(GT_AND_NOT, false, simdBaseType, op1->AsVecCon()); - } -#if defined(TARGET_ARM64) - else if (op1->IsCnsMsk()) - { - assert(ni == NI_Sve_ConditionalSelect); - - // op2 = op2 & op1 - op2->AsVecCon()->EvaluateBinaryInPlace(GT_AND, false, simdBaseType, op1->AsMskCon()); + // op3 = op2 & ~op1 + 
op3->AsVecCon()->EvaluateBinaryInPlace(GT_AND_NOT, false, simdBaseType, op1->AsMskCon()); - // op3 = op2 & ~op1 - op3->AsVecCon()->EvaluateBinaryInPlace(GT_AND_NOT, false, simdBaseType, op1->AsMskCon()); - } -#endif // op2 = op2 | op3 op2->AsVecCon()->EvaluateBinaryInPlace(GT_OR, false, simdBaseType, op3->AsVecCon()); @@ -32889,6 +32912,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) } break; } +#endif // TARGET_ARM64 default: { From 7856b8707cda7fb290e6965d982f06b0e4d2bb02 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 30 Jun 2025 12:38:25 +0100 Subject: [PATCH 20/28] Remove mask version of EvaluateBinaryInPlace --- src/coreclr/jit/gentree.cpp | 40 +++++++++++++------------------------ src/coreclr/jit/gentree.h | 3 --- 2 files changed, 14 insertions(+), 29 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index aead971561c035..7eadbd069396cc 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -18608,30 +18608,6 @@ void GenTreeVecCon::EvaluateBinaryInPlace(genTreeOps oper, bool scalar, var_type } } -#if defined(TARGET_ARM64) -//------------------------------------------------------------------------ -// GenTreeVecCon::EvaluateUnaryInPlace: Evaluates this constant using the given operation, when the other -// operand is a constant mask -// -// Arguments: -// oper - the operation to use in the evaluation -// scalar - true if this is a scalar operation; otherwise, false -// baseType - the base type of the constant being checked -// other - the mask constant to use in the evaluation -// -void GenTreeVecCon::EvaluateBinaryInPlace(genTreeOps oper, bool scalar, var_types baseType, GenTreeMskCon* other) -{ - assert(gtType == TYP_SIMD16); - - simd16_t otherSimdVal; - EvaluateSimdCvtMaskToVector(baseType, &otherSimdVal, other->gtSimdMaskVal); - - simd16_t result = {}; - EvaluateBinarySimd(oper, scalar, baseType, &result, gtSimd16Val, otherSimdVal); - gtSimd16Val = result; -} -#endif 
// TARGET_ARM64 - //------------------------------------------------------------------------ // GenTreeVecCon::EvaluateBroadcastInPlace: Evaluates this constant using a broadcast // @@ -32899,11 +32875,23 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) if (op2->IsCnsVec() && op3->IsCnsVec()) { + assert(op2->gtType == TYP_SIMD16); + assert(op3->gtType == TYP_SIMD16); + + simd16_t op1SimdVal; + EvaluateSimdCvtMaskToVector(simdBaseType, &op1SimdVal, op1->AsMskCon()->gtSimdMaskVal); + // op2 = op2 & op1 - op2->AsVecCon()->EvaluateBinaryInPlace(GT_AND, false, simdBaseType, op1->AsMskCon()); + simd16_t result = {}; + EvaluateBinarySimd(GT_AND, false, simdBaseType, &result, op2->AsVecCon()->gtSimd16Val, + op1SimdVal); + op2->AsVecCon()->gtSimd16Val = result; // op3 = op2 & ~op1 - op3->AsVecCon()->EvaluateBinaryInPlace(GT_AND_NOT, false, simdBaseType, op1->AsMskCon()); + result = {}; + EvaluateBinarySimd(GT_AND_NOT, false, simdBaseType, &result, op3->AsVecCon()->gtSimd16Val, + op1SimdVal); + op3->AsVecCon()->gtSimd16Val = result; // op2 = op2 | op3 op2->AsVecCon()->EvaluateBinaryInPlace(GT_OR, false, simdBaseType, op3->AsVecCon()); diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 092a8cea0e7a0e..c5d49fbacfca3a 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -6851,9 +6851,6 @@ struct GenTreeVecCon : public GenTree void EvaluateUnaryInPlace(genTreeOps oper, bool scalar, var_types baseType); void EvaluateBinaryInPlace(genTreeOps oper, bool scalar, var_types baseType, GenTreeVecCon* other); -#if defined(TARGET_ARM64) - void EvaluateBinaryInPlace(genTreeOps oper, bool scalar, var_types baseType, GenTreeMskCon* other); -#endif template void EvaluateBroadcastInPlace(TBase scalar) From 84d04081e9764788bf71aa247b55fe43bb924505 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 30 Jun 2025 16:41:50 +0100 Subject: [PATCH 21/28] remove assert --- src/coreclr/jit/gentree.cpp | 2 -- 1 file changed, 2 
deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 7eadbd069396cc..0523f453cc9b8a 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -32852,8 +32852,6 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) break; } - assert(!op1->IsVectorAllBitsSet() && !op1->IsVectorZero()); - if (op1->IsTrueMask(simdBaseType)) { if ((op3->gtFlags & GTF_SIDE_EFFECT) != 0) From ed633f316554525e28edeca8b000f05d7c95ecef Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Tue, 1 Jul 2025 10:08:10 +0100 Subject: [PATCH 22/28] Check all bits in EvaluateSimdCvtVectorToMask --- src/coreclr/jit/simd.h | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index 9841bdeb38c93c..f6da9993f90d45 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -1598,9 +1598,8 @@ void EvaluateSimdCvtVectorToMask(simdmask_t* result, TSimd arg0) uint32_t count = sizeof(TSimd) / sizeof(TBase); uint64_t mask = 0; - TBase significantBit = 1; #if defined(TARGET_XARCH) - significantBit = static_cast(1) << ((sizeof(TBase) * 8) - 1); + TBase MostSignificantBit = static_cast(1) << ((sizeof(TBase) * 8) - 1); #endif for (uint32_t i = 0; i < count; i++) @@ -1608,25 +1607,23 @@ void EvaluateSimdCvtVectorToMask(simdmask_t* result, TSimd arg0) TBase input0; memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); - if ((input0 & significantBit) != 0) - { #if defined(TARGET_XARCH) - // For xarch we have count sequential bits to write - // depending on if the corresponding the input element - // has its most significant bit set - + // For xarch we have count sequential bits to write depending on if the + // corresponding input element has its most significant bit set + if ((input0 & MostSignificantBit) != 0) + { mask |= static_cast(1) << i; + } #elif defined(TARGET_ARM64) - // For Arm64 we have count total bits to write, but - // they
are sizeof(TBase) bits apart. We set - // depending on if the corresponding input element - // has its least significant bit set - + // For Arm64 we have count total bits to write, but they are sizeof(TBase) bits + // apart. We set depending on if the corresponding input element is non zero + if (input0 != 0) + { mask |= static_cast(1) << (i * sizeof(TBase)); + } #else - unreached(); + unreached(); #endif - } } memcpy(&result->u8[0], &mask, sizeof(uint64_t)); From a53e4d1e81360b02c82c2fea89ccd7a6fc7648e8 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Tue, 1 Jul 2025 16:02:10 +0100 Subject: [PATCH 23/28] Add ConstantVectors test --- src/tests/JIT/opt/SVE/ConstantVectors.cs | 43 ++++++++++++++++++++ src/tests/JIT/opt/SVE/ConstantVectors.csproj | 12 ++++++ 2 files changed, 55 insertions(+) create mode 100644 src/tests/JIT/opt/SVE/ConstantVectors.cs create mode 100644 src/tests/JIT/opt/SVE/ConstantVectors.csproj diff --git a/src/tests/JIT/opt/SVE/ConstantVectors.cs b/src/tests/JIT/opt/SVE/ConstantVectors.cs new file mode 100644 index 00000000000000..4a0f269bcc0f58 --- /dev/null +++ b/src/tests/JIT/opt/SVE/ConstantVectors.cs @@ -0,0 +1,43 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System; +using System.Numerics; +using System.Collections.Generic; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; +using Xunit; + +public class ConstanVectors +{ + [MethodImpl(MethodImplOptions.NoInlining)] + [Fact] + public static int TestConstanVectors() + { + bool fail = false; + + if (Sve.IsSupported) + { + var r1 = SaturatingDecrementByActiveElementCountConst(); + Console.WriteLine(r1); + if (r1 != 18446744073709551615) + { + fail = true; + } + } + + if (fail) + { + return 101; + } + return 100; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static ulong SaturatingDecrementByActiveElementCountConst() + { + var vr5 = Vector128.CreateScalar(14610804860246336108UL).AsVector(); + return Sve.SaturatingDecrementByActiveElementCount(0UL, vr5); + } +} diff --git a/src/tests/JIT/opt/SVE/ConstantVectors.csproj b/src/tests/JIT/opt/SVE/ConstantVectors.csproj new file mode 100644 index 00000000000000..06e13807788085 --- /dev/null +++ b/src/tests/JIT/opt/SVE/ConstantVectors.csproj @@ -0,0 +1,12 @@ + + + true + True + + + + + + + + From cd52ec162fa0449a488b5a78f74b2ba3cb740ef5 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 2 Jul 2025 09:45:41 +0100 Subject: [PATCH 24/28] No need for DOTNET_EnableHWIntrinsic in csproj --- src/tests/JIT/opt/SVE/ConditionalSelectConstants.csproj | 1 - src/tests/JIT/opt/SVE/ConstantVectors.csproj | 1 - 2 files changed, 2 deletions(-) diff --git a/src/tests/JIT/opt/SVE/ConditionalSelectConstants.csproj b/src/tests/JIT/opt/SVE/ConditionalSelectConstants.csproj index 06e13807788085..610299ffff1175 100644 --- a/src/tests/JIT/opt/SVE/ConditionalSelectConstants.csproj +++ b/src/tests/JIT/opt/SVE/ConditionalSelectConstants.csproj @@ -7,6 +7,5 @@ - diff --git a/src/tests/JIT/opt/SVE/ConstantVectors.csproj b/src/tests/JIT/opt/SVE/ConstantVectors.csproj index 06e13807788085..610299ffff1175 100644 --- a/src/tests/JIT/opt/SVE/ConstantVectors.csproj +++ 
b/src/tests/JIT/opt/SVE/ConstantVectors.csproj @@ -7,6 +7,5 @@ - From 748d29720456bbcad2a8be2f94ea7580246b6e66 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 2 Jul 2025 10:05:26 +0100 Subject: [PATCH 25/28] Use IsMaskZero --- src/coreclr/jit/gentree.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index e56c3a1c6a48a2..78ed00c13041cf 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -33595,7 +33595,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) return op2; } - if (op1->IsFalseMask()) + if (op1->IsMaskZero()) { return gtWrapWithSideEffects(op3, op2, GTF_ALL_EFFECT); } From 090523a3a2332821f095c03e2089038b534ae975 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 2 Jul 2025 15:36:07 +0100 Subject: [PATCH 26/28] Remove EvaluateBinarySimdAndMask --- src/coreclr/jit/valuenum.cpp | 40 +++++++++++++----------------------- 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp index 6a5c3e3865f1e4..52ac0df5d4cb71 100644 --- a/src/coreclr/jit/valuenum.cpp +++ b/src/coreclr/jit/valuenum.cpp @@ -7868,28 +7868,6 @@ ValueNum EvaluateSimdCvtVectorToMask(ValueNumStore* vns, var_types simdType, var return vns->VNForSimdMaskCon(result); } -#if defined(TARGET_ARM64) -ValueNum EvaluateBinarySimdAndMask(ValueNumStore* vns, - genTreeOps oper, - bool scalar, - var_types simdType, - var_types baseType, - ValueNum arg0VN, - ValueNum arg1VNMask) -{ - assert(simdType == TYP_SIMD16); - - simd16_t arg0 = GetConstantSimd16(vns, baseType, arg0VN); - - ValueNum arg1VNSimd = EvaluateSimdCvtMaskToVector(vns, simdType, baseType, arg1VNMask); - simd16_t arg1 = GetConstantSimd16(vns, baseType, arg1VNSimd); - - simd16_t result = {}; - EvaluateBinarySimd(oper, scalar, baseType, &result, arg0, arg1); - return vns->VNForSimd16Con(result); -} -#endif // TARGET_ARM64 - ValueNum 
ValueNumStore::EvalHWIntrinsicFunUnary(GenTreeHWIntrinsic* tree, VNFunc func, ValueNum arg0VN, @@ -9171,11 +9149,21 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunTernary( if (ni == NI_Sve_ConditionalSelect) { assert(TypeOfVN(arg0VN) == TYP_MASK); + assert(type == TYP_SIMD16); + + ValueNum maskVNSimd = EvaluateSimdCvtMaskToVector(this, type, baseType, arg0VN); + simd16_t maskVal = ::GetConstantSimd16(this, baseType, maskVNSimd); + + simd16_t arg1 = ::GetConstantSimd16(this, baseType, arg1VN); + simd16_t arg2 = ::GetConstantSimd16(this, baseType, arg2VN); + + simd16_t result = {}; + EvaluateBinarySimd(GT_AND, false, baseType, &result, arg1, maskVal); + ValueNum trueVN = VNForSimd16Con(result); - ValueNum trueVN = - EvaluateBinarySimdAndMask(this, GT_AND, false, type, baseType, arg1VN, arg0VN); - ValueNum falseVN = - EvaluateBinarySimdAndMask(this, GT_AND_NOT, false, type, baseType, arg2VN, arg0VN); + result = {}; + EvaluateBinarySimd(GT_AND_NOT, false, baseType, &result, arg2, maskVal); + ValueNum falseVN = VNForSimd16Con(result); return EvaluateBinarySimd(this, GT_OR, false, type, baseType, trueVN, falseVN); } From e7034bdaa519b855454cfde749721ef6484aabe9 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 2 Jul 2025 15:56:31 +0100 Subject: [PATCH 27/28] In lowering, default the mask type to byte --- src/coreclr/jit/lowerarmarch.cpp | 34 +++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index e8eb56a24ea14e..650ac850dd5599 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -1176,28 +1176,44 @@ GenTree* Lowering::LowerCnsMask(GenTreeMskCon* mask) LABELEDDISPTREERANGE("lowering cns mask to cns vector (before)", BlockRange(), mask); + // Default values in case there is no user or the user is not a HWIntrinsic + var_types parentBaseType = TYP_BYTE; + CorInfoType parentSimdBaseJitType = CORINFO_TYPE_BYTE; + unsigned 
parentSimdSize = 16; + assert(parentSimdSize == 16 || parentSimdSize == 0); + LIR::Use use; - if (!BlockRange().TryGetUse(mask, &use)) + bool foundUse = BlockRange().TryGetUse(mask, &use); + + if (use.User()->OperIsHWIntrinsic()) { - unreached(); - } - assert(use.User()->OperIsHWIntrinsic()); - GenTreeHWIntrinsic* parent = use.User()->AsHWIntrinsic(); + GenTreeHWIntrinsic* parent = use.User()->AsHWIntrinsic(); - var_types parentBaseType = parent->GetSimdBaseType(); - CorInfoType parentSimdBaseJitType = parent->GetSimdBaseJitType(); - unsigned parentSimdSize = parent->GetSimdSize(); + parentBaseType = parent->GetSimdBaseType(); + parentSimdBaseJitType = parent->GetSimdBaseJitType(); + parentSimdSize = parent->GetSimdSize(); + } assert(parentSimdSize == 16 || parentSimdSize == 0); + // Create a vector constant GenTreeVecCon* vecCon = comp->gtNewVconNode(TYP_SIMD16); EvaluateSimdCvtMaskToVector(parentBaseType, &vecCon->gtSimdVal, mask->gtSimdMaskVal); BlockRange().InsertBefore(mask, vecCon); + // Convert the vector constant to a mask GenTree* convertedVec = comp->gtNewSimdCvtVectorToMaskNode(TYP_MASK, vecCon, parentSimdBaseJitType, parentSimdSize); BlockRange().InsertBefore(mask, convertedVec->AsHWIntrinsic()->Op(1)); BlockRange().InsertBefore(mask, convertedVec); - use.ReplaceWith(convertedVec); + // Update use + if (foundUse) + { + use.ReplaceWith(convertedVec); + } + else + { + convertedVec->SetUnusedValue(); + } BlockRange().Remove(mask); From f14fc8e8dfac6bff07b5b1d5bf614ff597c9b922 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 2 Jul 2025 17:44:12 +0100 Subject: [PATCH 28/28] In lowering, convert mask using byte basetype --- src/coreclr/jit/lowerarmarch.cpp | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 650ac850dd5599..7a3d1eeb57e12d 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -1172,41 
+1172,24 @@ GenTree* Lowering::LowerCnsMask(GenTreeMskCon* mask) // Not a valid pattern, so cannot be created using ptrue/pfalse. Instead the mask will require // loading from memory. There is no way to load to a predicate from memory using a PC relative - // address, so instead use a constant vector plus conversion to mask. + // address, so instead use a constant vector plus conversion to mask. Using basetype byte will + // ensure every entry in the mask is converted. LABELEDDISPTREERANGE("lowering cns mask to cns vector (before)", BlockRange(), mask); - // Default values in case there is no user or the user is not a HWIntrinsic - var_types parentBaseType = TYP_BYTE; - CorInfoType parentSimdBaseJitType = CORINFO_TYPE_BYTE; - unsigned parentSimdSize = 16; - assert(parentSimdSize == 16 || parentSimdSize == 0); - - LIR::Use use; - bool foundUse = BlockRange().TryGetUse(mask, &use); - - if (use.User()->OperIsHWIntrinsic()) - { - GenTreeHWIntrinsic* parent = use.User()->AsHWIntrinsic(); - - parentBaseType = parent->GetSimdBaseType(); - parentSimdBaseJitType = parent->GetSimdBaseJitType(); - parentSimdSize = parent->GetSimdSize(); - } - assert(parentSimdSize == 16 || parentSimdSize == 0); - // Create a vector constant GenTreeVecCon* vecCon = comp->gtNewVconNode(TYP_SIMD16); - EvaluateSimdCvtMaskToVector(parentBaseType, &vecCon->gtSimdVal, mask->gtSimdMaskVal); + EvaluateSimdCvtMaskToVector(TYP_BYTE, &vecCon->gtSimdVal, mask->gtSimdMaskVal); BlockRange().InsertBefore(mask, vecCon); // Convert the vector constant to a mask - GenTree* convertedVec = comp->gtNewSimdCvtVectorToMaskNode(TYP_MASK, vecCon, parentSimdBaseJitType, parentSimdSize); + GenTree* convertedVec = comp->gtNewSimdCvtVectorToMaskNode(TYP_MASK, vecCon, CORINFO_TYPE_BYTE, 16); BlockRange().InsertBefore(mask, convertedVec->AsHWIntrinsic()->Op(1)); BlockRange().InsertBefore(mask, convertedVec); // Update use - if (foundUse) + LIR::Use use; + if (BlockRange().TryGetUse(mask, &use)) { 
use.ReplaceWith(convertedVec); }