diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 9629ca8725dbc9..e5d1db90f2a271 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -3205,7 +3205,7 @@ class Compiler
                                  unsigned simdSize);

 #if defined(FEATURE_MASKED_HW_INTRINSICS)
-    GenTree* gtNewSimdCvtVectorToMaskNode(var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize);
+    GenTreeHWIntrinsic* gtNewSimdCvtVectorToMaskNode(var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize);
 #endif // FEATURE_MASKED_HW_INTRINSICS

     GenTree* gtNewSimdCreateBroadcastNode(
@@ -6713,8 +6713,7 @@ class Compiler
     GenTreeHWIntrinsic* fgOptimizeForMaskedIntrinsic(GenTreeHWIntrinsic* node);
 #endif // FEATURE_MASKED_HW_INTRINSICS
 #ifdef TARGET_ARM64
-    bool canMorphVectorOperandToMask(GenTree* node);
-    bool canMorphAllVectorOperandsToMasks(GenTreeHWIntrinsic* node);
+    void getVectorMaskVariantWeights(GenTree* node, GenTreeHWIntrinsic* parent, weight_t* currentCost, weight_t* switchCost);
     GenTree* doMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic* parent);
     GenTreeHWIntrinsic* fgMorphTryUseAllMaskVariant(GenTreeHWIntrinsic* node);
 #endif // TARGET_ARM64
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 75ddbddb0fcf24..18c19bc54a382d 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -21947,10 +21947,10 @@ GenTree* Compiler::gtNewSimdCvtNativeNode(var_types type,
 // Return Value:
 //     The node converted to the a mask type
 //
-GenTree* Compiler::gtNewSimdCvtVectorToMaskNode(var_types   type,
-                                                GenTree*    op1,
-                                                CorInfoType simdBaseJitType,
-                                                unsigned    simdSize)
+GenTreeHWIntrinsic* Compiler::gtNewSimdCvtVectorToMaskNode(var_types   type,
+                                                           GenTree*    op1,
+                                                           CorInfoType simdBaseJitType,
+                                                           unsigned    simdSize)
 {
     assert(varTypeIsMask(type));
     assert(varTypeIsSIMD(op1));
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index 1b15e34a7d8bf6..af7b4631f667c3 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -9727,10 +9727,7 @@ GenTreeHWIntrinsic* Compiler::fgOptimizeForMaskedIntrinsic(GenTreeHWIntrinsic* n
         return node;
     }
 #elif defined(TARGET_ARM64)
-    // TODO-SVE: This optimisation is too naive. It needs to calculate the full cost of the instruction
-    // vs using the predicate version, taking into account all input arguements and all uses
-    // of the result.
-    // return fgMorphTryUseAllMaskVariant(node);
+    return fgMorphTryUseAllMaskVariant(node);
 #else
 #error Unsupported platform
 #endif
@@ -9738,37 +9735,69 @@ GenTreeHWIntrinsic* Compiler::fgOptimizeForMaskedIntrinsic(GenTreeHWIntrinsic* n
 }

 #ifdef TARGET_ARM64
-//------------------------------------------------------------------------
-// canMorphVectorOperandToMask: Can this vector operand be converted to a
-//                              node with type TYP_MASK easily?
-//
-bool Compiler::canMorphVectorOperandToMask(GenTree* node)
-{
-    return varTypeIsMask(node) || node->OperIsConvertMaskToVector() || node->IsVectorZero();
-}
+
+// Conversion of mask to vector is one instruction (a mov)
+static constexpr const weight_t costOfConvertMaskToVector = 1.0;
+
+// Conversion of vector to mask is two instructions (a compare and a ptrue)
+static constexpr const weight_t costOfConvertVectorToMask = 2.0;

 //------------------------------------------------------------------------
-// canMorphAllVectorOperandsToMasks: Can all vector operands to this node
-//                                   be converted to a node with type
-//                                   TYP_MASK easily?
+// getVectorMaskVariantWeights: Calculate the cost of using the vector
+//                              and mask variants
 //
-bool Compiler::canMorphAllVectorOperandsToMasks(GenTreeHWIntrinsic* node)
+// Arguments:
+//    node              - The node to convert to a mask
+//    parent            - The parent of the node
+//    vectorVariantCost - (IN/OUT) incremented by the cost of using the vector variant
+//    maskVariantCost   - (IN/OUT) incremented by the cost of using the mask variant
+//
+void Compiler::getVectorMaskVariantWeights(GenTree*            node,
+                                           GenTreeHWIntrinsic* parent,
+                                           weight_t*           vectorVariantCost,
+                                           weight_t*           maskVariantCost)
 {
-    bool allMaskConversions = true;
-    for (size_t i = 1; i <= node->GetOperandCount() && allMaskConversions; i++)
+    if (varTypeIsMask(node))
     {
-        allMaskConversions &= canMorphVectorOperandToMask(node->Op(i));
+        // Node is already a mask
+        return;
+    }
+
+    if (node->OperIsConvertMaskToVector())
+    {
+        // Using the vector variant keeps the cost of this convert
+        *vectorVariantCost += costOfConvertMaskToVector;
+        return;
+    }
+
+    if (node->IsCnsVec())
+    {
+        // If the constant vector can be converted to a mask pattern during codegen, then the morph is free.
+        // Otherwise it will cost one vector-to-mask conversion.
+
+        assert(node->TypeGet() == TYP_SIMD16);
+        var_types simdBaseType = parent->GetSimdBaseType();
+
+        if (node->IsVectorZero() ||
+            SveMaskPatternNone != EvaluateSimdVectorToPattern(simdBaseType, node->AsVecCon()->gtSimd16Val))
+        {
+            return;
+        }
     }

-    return allMaskConversions;
+    // The mask variant will require wrapping in a convert vector to mask
+    *maskVariantCost += costOfConvertVectorToMask;
 }

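As a rough, illustrative walkthrough of how these weights combine (not part of the patch): suppose the converted intrinsic is a ZipLow whose first operand is a ConvertMaskToVector node and whose second operand is a zero vector constant, which is the tree shape the new ZipLowMask test further down produces. The standalone snippet below mirrors the accumulation performed by getVectorMaskVariantWeights and fgMorphTryUseAllMaskVariant; the variable names and the program itself are illustrative only.

#include <cstdio>

using weight_t = double;

static constexpr weight_t costOfConvertMaskToVector = 1.0; // a mov
static constexpr weight_t costOfConvertVectorToMask = 2.0; // a compare and a ptrue

int main()
{
    weight_t vectorVariantCost = 0.0;
    weight_t maskVariantCost   = 0.0;

    // The result of the vector variant still has to be converted to a mask for its user.
    vectorVariantCost += costOfConvertVectorToMask;

    // Operand 1: a ConvertMaskToVector node, so the vector variant pays for that convert.
    vectorVariantCost += costOfConvertMaskToVector;

    // Operand 2: a zero vector constant, free for either variant (codegen emits a false mask).

    // maskVariantCost (0.0) <= vectorVariantCost (3.0), so the morph to the mask variant goes ahead.
    printf("vector=%.1f mask=%.1f morph=%s\n", vectorVariantCost, maskVariantCost,
           (maskVariantCost <= vectorVariantCost) ? "yes" : "no");
    return 0;
}
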
 //------------------------------------------------------------------------
-// doMorphVectorOperandToMask: Morph a vector node that is close to a mask
-//                             node into a mask node.
+// doMorphVectorOperandToMask: Morph a vector node into a mask node.
+//
+// Arguments:
+//    node   - The node to convert to a mask
+//    parent - The parent of the node
 //
 // Return value:
-//     The morphed tree, or nullptr if the transform is not applicable.
+//     The morphed tree
 //
 GenTree* Compiler::doMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic* parent)
 {
@@ -9777,76 +9806,149 @@ GenTree* Compiler::doMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic*
         // Already a mask, nothing to do.
         return node;
     }
-    else if (node->OperIsConvertMaskToVector())
+
+    if (node->OperIsConvertMaskToVector())
     {
         // Replace node with op1.
         return node->AsHWIntrinsic()->Op(1);
     }
-    else if (node->IsVectorZero())
+
+    if (node->IsCnsVec())
     {
-        // Morph the vector of zeroes into mask of zeroes.
-        GenTree* mask = gtNewSimdFalseMaskByteNode();
-        mask->SetMorphed(this);
-        return mask;
+        // Morph the constant vector to mask
+        assert(node->TypeGet() == TYP_SIMD16);
+        var_types simdBaseType = parent->GetSimdBaseType();
+
+        if (node->IsVectorZero())
+        {
+            GenTreeMskCon* mskCon = gtNewMskConNode(TYP_MASK);
+            mskCon->gtSimdMaskVal = simdmask_t::Zero();
+            mskCon->SetMorphed(this);
+            return mskCon;
+        }
+        else if (SveMaskPatternNone !=
+                 EvaluateSimdVectorToPattern(simdBaseType, node->AsVecCon()->gtSimd16Val))
+        {
+            GenTreeMskCon* mskCon = gtNewMskConNode(TYP_MASK);
+            EvaluateSimdCvtVectorToMask(simdBaseType, &mskCon->gtSimdMaskVal, node->AsVecCon()->gtSimd16Val);
+            mskCon->SetMorphed(this);
+            return mskCon;
+        }
     }

-    return nullptr;
+    // Wrap in a convert vector to mask
+    CorInfoType         simdBaseJitType = parent->GetSimdBaseJitType();
+    unsigned            simdSize        = parent->GetSimdSize();
+    GenTreeHWIntrinsic* cvt             = gtNewSimdCvtVectorToMaskNode(TYP_MASK, node, simdBaseJitType, simdSize);
+    cvt->SetMorphed(this);
+    cvt->Op(1)->SetMorphed(this);
+    return cvt;
 }

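For the constant-vector path above, the new GenTreeMskCon is populated by EvaluateSimdCvtVectorToMask (changed in simd.h further down). A minimal standalone sketch of the Arm64 bit layout it produces, using a made-up Vector128<int> constant of {1, 1, 0, 0}: one mask bit per lane, the bits spaced sizeof(TBase) bytes apart, and a lane counted as active when any of its bits are set.

#include <cstdint>
#include <cstdio>

int main()
{
    const uint32_t lanes[4] = {1, 1, 0, 0}; // hypothetical Vector128<int> constant
    uint64_t       mask     = 0;

    for (uint32_t i = 0; i < 4; i++)
    {
        if (lanes[i] != 0)
        {
            // One bit per lane, spaced sizeof(element) bytes apart, matching the Arm64 path below.
            mask |= static_cast<uint64_t>(1) << (i * sizeof(uint32_t));
        }
    }

    // Bits 0 and 4 are set (0x11): the first two 4-byte lanes are active, which
    // EvaluateSimdVectorToPattern classifies as SveMaskPatternVectorCount2.
    printf("mask = 0x%llx\n", (unsigned long long)mask);
    return 0;
}
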
 //-----------------------------------------------------------------------------------------------------
 // fgMorphTryUseAllMaskVariant: For NamedIntrinsics that have a variant where all operands are
-//                              mask nodes. If all operands to this node are 'suggesting' that they
-//                              originate closely from a mask, but are of vector types, then morph the
-//                              operands as appropriate to use mask types instead. 'Suggesting'
-//                              is defined by the canMorphVectorOperandToMask function.
+//                              mask nodes. If the NamedIntrinsic is used by converting it to a mask
+//                              then it can be replaced with the mask variant. Use simple costing to
+//                              decide if the variant is preferred.
+//
+// Notes:
+//    It is not safe to convert to a mask and then back to vector again as information will be lost.
+//    Therefore only consider NamedIntrinsics that operate on masks.
 //
 // Arguments:
-//    tree - The HWIntrinsic to try and optimize.
+//    node - The conversion to mask of the HWIntrinsic to try and optimize.
 //
 // Return Value:
 //    The fully morphed tree if a change was made, else nullptr.
 //
 GenTreeHWIntrinsic* Compiler::fgMorphTryUseAllMaskVariant(GenTreeHWIntrinsic* node)
 {
-    if (HWIntrinsicInfo::HasAllMaskVariant(node->GetHWIntrinsicId()))
+    // We only care about vectors being used as masks.
+    if (!node->OperIsConvertVectorToMask())
     {
-        NamedIntrinsic maskVariant = HWIntrinsicInfo::GetMaskVariant(node->GetHWIntrinsicId());
+        return nullptr;
+    }

-        // As some intrinsics have many variants, check that the count of operands on the node
-        // matches the number of operands required for the mask variant of the intrinsic. The mask
-        // variant of the intrinsic must have a fixed number of operands.
-        int numArgs = HWIntrinsicInfo::lookupNumArgs(maskVariant);
-        assert(numArgs >= 0);
-        if (node->GetOperandCount() == (size_t)numArgs)
-        {
-            // We're sure it will work at this point, so perform the pattern match on operands.
-            if (canMorphAllVectorOperandsToMasks(node))
-            {
-                switch (node->GetOperandCount())
-                {
-                    case 1:
-                        node->ResetHWIntrinsicId(maskVariant, doMorphVectorOperandToMask(node->Op(1), node));
-                        break;
-                    case 2:
-                        node->ResetHWIntrinsicId(maskVariant, doMorphVectorOperandToMask(node->Op(1), node),
-                                                 doMorphVectorOperandToMask(node->Op(2), node));
-                        break;
-                    case 3:
-                        node->ResetHWIntrinsicId(maskVariant, this, doMorphVectorOperandToMask(node->Op(1), node),
-                                                 doMorphVectorOperandToMask(node->Op(2), node),
-                                                 doMorphVectorOperandToMask(node->Op(3), node));
-                        break;
-                    default:
-                        unreached();
-                }
+    // Check the converted node. Is there a masked variant?
-                node->gtType = TYP_MASK;
-                return node;
-            }
-        }
+    if (!node->Op(2)->OperIsHWIntrinsic())
+    {
+        return nullptr;
     }

-    return nullptr;
+    GenTreeHWIntrinsic* convertedNode = node->Op(2)->AsHWIntrinsic();
+
+    if (!HWIntrinsicInfo::HasAllMaskVariant(convertedNode->GetHWIntrinsicId()))
+    {
+        return nullptr;
+    }
+
+    NamedIntrinsic maskVariant = HWIntrinsicInfo::GetMaskVariant(convertedNode->GetHWIntrinsicId());
+
+    // As some intrinsics have many variants, check that the count of operands on the node
+    // matches the number of operands required for the mask variant of the intrinsic. The mask
+    // variant of the intrinsic must have a fixed number of operands.
+    int numArgs = HWIntrinsicInfo::lookupNumArgs(maskVariant);
+    assert(numArgs >= 0);
+    if (convertedNode->GetOperandCount() != (size_t)numArgs)
+    {
+        return nullptr;
+    }
+
+    weight_t vectorVariantCost = 0.0;
+    weight_t maskVariantCost   = 0.0;
+
+    // Take into account the cost of the conversion of the result
+    vectorVariantCost += costOfConvertVectorToMask;
+
+    // Take into account the cost of the children
+    for (size_t i = 1; i <= convertedNode->GetOperandCount(); i++)
+    {
+        getVectorMaskVariantWeights(convertedNode->Op(i), convertedNode, &vectorVariantCost, &maskVariantCost);
+    }
+
+    JITDUMP("Attempting Mask variant morph for [%06u]. Vector cost: %.2f, Mask cost: %.2f. Will %smorph.\n",
+            dspTreeID(convertedNode), vectorVariantCost, maskVariantCost,
+            (maskVariantCost > vectorVariantCost) ? "not " : "");
+
+    // If the costs are identical, then prefer morphed as it matches the output type.
+    if (maskVariantCost > vectorVariantCost)
+    {
+        return nullptr;
+    }
+
+    // Do the morph
+    switch (convertedNode->GetOperandCount())
+    {
+        case 1:
+            convertedNode->ResetHWIntrinsicId(maskVariant,
+                                              doMorphVectorOperandToMask(convertedNode->Op(1), convertedNode));
+            break;
+        case 2:
+            convertedNode->ResetHWIntrinsicId(maskVariant,
+                                              doMorphVectorOperandToMask(convertedNode->Op(1), convertedNode),
+                                              doMorphVectorOperandToMask(convertedNode->Op(2), convertedNode));
+            break;
+        case 3:
+            convertedNode->ResetHWIntrinsicId(maskVariant, this,
+                                              doMorphVectorOperandToMask(convertedNode->Op(1), convertedNode),
+                                              doMorphVectorOperandToMask(convertedNode->Op(2), convertedNode),
+                                              doMorphVectorOperandToMask(convertedNode->Op(3), convertedNode));
+            break;
+        default:
+            unreached();
+    }
+
+    // Morph the new children.
+    for (size_t i = 1; i <= convertedNode->GetOperandCount(); i++)
+    {
+        convertedNode->Op(i) = fgMorphTree(convertedNode->Op(i));
+    }
+
+    convertedNode->gtType = TYP_MASK;
+    DEBUG_DESTROY_NODE(node);
+    return convertedNode;
 }
 #endif // TARGET_ARM64
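The Notes on fgMorphTryUseAllMaskVariant say that converting a vector to a mask and back again loses information, which is why the morph only fires when the node is already consumed through a ConvertVectorToMask. A minimal standalone sketch of that information loss, with made-up lane values and assuming the usual all-ones/all-zero lane reconstruction a mask-to-vector conversion typically produces:

#include <cstdint>
#include <cstdio>

int main()
{
    const uint32_t original[4] = {7, 0, 42, 1}; // hypothetical 4 x uint32_t lanes
    uint32_t       roundTripped[4];

    for (uint32_t i = 0; i < 4; i++)
    {
        bool active     = (original[i] != 0);        // vector -> mask keeps one bit per lane
        roundTripped[i] = active ? 0xFFFFFFFFu : 0u; // mask -> vector typically rebuilds all-ones/all-zero lanes
    }

    // 7 and 42 both come back as 0xffffffff: the original lane values are gone, so the morph
    // is only safe when the result is genuinely consumed as a mask.
    for (uint32_t i = 0; i < 4; i++)
    {
        printf("%u -> 0x%x\n", original[i], roundTripped[i]);
    }
    return 0;
}
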
diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h
index 9841bdeb38c93c..a3e5b754f1fcea 100644
--- a/src/coreclr/jit/simd.h
+++ b/src/coreclr/jit/simd.h
@@ -1598,9 +1598,8 @@ void EvaluateSimdCvtVectorToMask(simdmask_t* result, TSimd arg0)
     uint32_t count = sizeof(TSimd) / sizeof(TBase);
     uint64_t mask  = 0;

-    TBase significantBit = 1;
 #if defined(TARGET_XARCH)
-    significantBit = static_cast<TBase>(1) << ((sizeof(TBase) * 8) - 1);
+    TBase significantBit = static_cast<TBase>(1) << ((sizeof(TBase) * 8) - 1);
 #endif

     for (uint32_t i = 0; i < count; i++)
@@ -1608,25 +1607,24 @@ void EvaluateSimdCvtVectorToMask(simdmask_t* result, TSimd arg0)
         TBase input0;
         memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase));

+#if defined(TARGET_XARCH)
+        // For xarch we have count sequential bits to write depending on if the
+        // corresponding the input element has its most significant bit set
         if ((input0 & significantBit) != 0)
         {
-#if defined(TARGET_XARCH)
-            // For xarch we have count sequential bits to write
-            // depending on if the corresponding the input element
-            // has its most significant bit set
             mask |= static_cast<uint64_t>(1) << i;
+        }
 #elif defined(TARGET_ARM64)
-            // For Arm64 we have count total bits to write, but
-            // they are sizeof(TBase) bits apart. We set
-            // depending on if the corresponding input element
-            // has its least significant bit set
-
+        // For Arm64 we have count total bits to write, but they are sizeof(TBase)
+        // bits apart. We set depending on if the corresponding input element has
+        // any bit set (this matches the use of cmpne in outputted assembly).
+        if (input0 != 0)
+        {
             mask |= static_cast<uint64_t>(1) << (i * sizeof(TBase));
+        }
 #else
-            unreached();
+        unreached();
 #endif
-        }
     }

     memcpy(&result->u8[0], &mask, sizeof(uint64_t));
@@ -1906,7 +1904,7 @@ SveMaskPattern EvaluateSimdMaskToPattern(simdmask_t arg0)
     memcpy(&mask, &arg0.u8[0], sizeof(uint64_t));
     uint32_t finalOne = count;

-    // A mask pattern starts with zero of more 1s and then the rest of the mask is filled with 0s.
+    // A mask pattern starts with zero or more 1s and then the rest of the mask is filled with 0s.
     // Find an unbroken sequence of 1s.

     for (uint32_t i = 0; i < count; i++)
@@ -1993,6 +1991,101 @@ SveMaskPattern EvaluateSimdMaskToPattern(var_types baseType, simdmask_t arg0)
         }
     }
 }
+
+template <typename TSimd, typename TBase>
+SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0)
+{
+    uint32_t count    = sizeof(TSimd) / sizeof(TBase);
+    uint32_t finalOne = count;
+
+    // A mask pattern starts with zero or more 1s and then the rest of the mask is filled with 0s.
+    // This pattern is extracted using the least significant bits of the vector elements.
+
+    // For Arm64 we have count total bits to read, but they are sizeof(TBase) bits apart. We set
+    // depending on if the corresponding input element has any bit set (this matches the use
+    // of cmpne in outputted assembly)
+
+    // Find an unbroken sequence of 1s.
+    for (uint32_t i = 0; i < count; i++)
+    {
+        TBase input0;
+        memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase));
+
+        bool isSet = input0 != 0;
+        if (!isSet)
+        {
+            finalOne = i;
+            break;
+        }
+    }
+
+    // Find an unbroken sequence of 0s.
+    for (uint32_t i = finalOne; i < count; i++)
+    {
+        TBase input0;
+        memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase));
+
+        bool isSet = input0 != 0;
+        if (isSet)
+        {
+            // Invalid sequence
+            return SveMaskPatternNone;
+        }
+    }
+
+    if (finalOne == count)
+    {
+        return SveMaskPatternAll;
+    }
+    else if (finalOne >= SveMaskPatternVectorCount1 && finalOne <= SveMaskPatternVectorCount8)
+    {
+        return (SveMaskPattern)finalOne;
+    }
+    else
+    {
+        // TODO: Add other patterns as required. These probably won't be seen until we get
+        // to wider vector lengths.
+        return SveMaskPatternNone;
+    }
+}
+
+template <typename TSimd>
+SveMaskPattern EvaluateSimdVectorToPattern(var_types baseType, TSimd arg0)
+{
+    switch (baseType)
+    {
+        case TYP_FLOAT:
+        case TYP_INT:
+        case TYP_UINT:
+        {
+            return EvaluateSimdVectorToPattern<TSimd, uint32_t>(arg0);
+        }
+
+        case TYP_DOUBLE:
+        case TYP_LONG:
+        case TYP_ULONG:
+        {
+            return EvaluateSimdVectorToPattern<TSimd, uint64_t>(arg0);
+        }
+
+        case TYP_BYTE:
+        case TYP_UBYTE:
+        {
+            return EvaluateSimdVectorToPattern<TSimd, uint8_t>(arg0);
+        }
+
+        case TYP_SHORT:
+        case TYP_USHORT:
+        {
+            return EvaluateSimdVectorToPattern<TSimd, uint16_t>(arg0);
+        }
+
+        default:
+        {
+            unreached();
+        }
+    }
+}
 #endif // TARGET_ARM64

 #endif // FEATURE_MASKED_HW_INTRINSICS
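A worked example of the classification EvaluateSimdVectorToPattern performs, re-implemented over a plain array so it can run standalone; the enum here only loosely mirrors SveMaskPattern, and the input vectors are hypothetical 8 x uint16_t lanes of a Vector128<short> constant.

#include <cstdint>
#include <cstdio>

enum Pattern { None, VL1 = 1, VL2 = 2, /* ... up to VL8 = 8 ... */ All = 31 };

static Pattern classify(const uint16_t* lanes, uint32_t count)
{
    uint32_t finalOne = count;
    for (uint32_t i = 0; i < count; i++) // find the unbroken run of active lanes
    {
        if (lanes[i] == 0) { finalOne = i; break; }
    }
    for (uint32_t i = finalOne; i < count; i++) // the remainder must all be inactive
    {
        if (lanes[i] != 0) return None;
    }
    if (finalOne == count) return All;
    if (finalOne >= 1 && finalOne <= 8) return (Pattern)finalOne;
    return None;
}

int main()
{
    const uint16_t a[8] = {1, 1, 0, 0, 0, 0, 0, 0}; // two leading active lanes -> VL2
    const uint16_t b[8] = {1, 0, 1, 0, 0, 0, 0, 0}; // broken run -> None
    printf("a=%d b=%d\n", classify(a, 8), classify(b, 8));
    return 0;
}
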
diff --git a/src/tests/JIT/opt/SVE/PredicateInstructions.cs b/src/tests/JIT/opt/SVE/PredicateInstructions.cs
index b1336674f1638b..c5a1e75199c4a9 100644
--- a/src/tests/JIT/opt/SVE/PredicateInstructions.cs
+++ b/src/tests/JIT/opt/SVE/PredicateInstructions.cs
@@ -17,66 +17,98 @@ public static void TestPredicateInstructions()
     {
         if (Sve.IsSupported)
         {
+            Vector<short> vecs = Vector.Create<short>(2);
+            Vector<int> veci = Vector.Create<int>(3);
+            Vector<uint> vecui = Vector.Create<uint>(5);
+            Vector<long> vecl = Vector.Create<long>(7);
+
             ZipLow();
-            ZipHigh();
+            ZipHigh(vecui);
             UnzipOdd();
             UnzipEven();
             TransposeOdd();
-            TransposeEven();
+            TransposeEven(vecl);
             ReverseElement();
             And();
             BitwiseClear();
             Xor();
             Or();
-            ConditionalSelect();
+            ConditionalSelect(veci);
+
+            ZipLowMask();
+            ZipHighMask(vecui);
+            UnzipOddMask();
+            UnzipEvenMask();
+            TransposeOddMask();
+            TransposeEvenMask(vecl);
+            ReverseElementMask();
+            AndMask();
+            BitwiseClearMask();
+            XorMask();
+            OrMask();
+            ConditionalSelectMask(veci);
+
+            UnzipEvenZipLowMask();
+            TransposeEvenAndMask(vecs);
+
         }
     }

+    // These should not use the predicate variants
+
     [MethodImpl(MethodImplOptions.NoInlining)]
     static Vector<short> ZipLow()
     {
+        //ARM64-FULL-LINE: zip1 {{z[0-9]+}}.h, {{z[0-9]+}}.h, {{z[0-9]+}}.h
         return Sve.ZipLow(Vector<short>.Zero, Sve.CreateTrueMaskInt16());
     }

     [MethodImpl(MethodImplOptions.NoInlining)]
-    static Vector<uint> ZipHigh()
+    static Vector<uint> ZipHigh(Vector<uint> v)
     {
-        return Sve.ZipHigh(Sve.CreateTrueMaskUInt32(), Sve.CreateTrueMaskUInt32());
+        //ARM64-FULL-LINE: zip2 {{z[0-9]+}}.s, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        return Sve.ZipHigh(Sve.CreateTrueMaskUInt32(), v);
     }

     [MethodImpl(MethodImplOptions.NoInlining)]
     static Vector<sbyte> UnzipEven()
     {
+        //ARM64-FULL-LINE: uzp1 {{z[0-9]+}}.b, {{z[0-9]+}}.b, {{z[0-9]+}}.b
         return Sve.UnzipEven(Sve.CreateTrueMaskSByte(), Vector<sbyte>.Zero);
     }

     [MethodImpl(MethodImplOptions.NoInlining)]
     static Vector<short> UnzipOdd()
     {
+        //ARM64-FULL-LINE: uzp2 {{z[0-9]+}}.h, {{z[0-9]+}}.h, {{z[0-9]+}}.h
         return Sve.UnzipOdd(Sve.CreateTrueMaskInt16(), Sve.CreateFalseMaskInt16());
     }

     [MethodImpl(MethodImplOptions.NoInlining)]
-    static Vector<long> TransposeEven()
+    static Vector<long> TransposeEven(Vector<long> v)
     {
-        return Sve.TransposeEven(Sve.CreateFalseMaskInt64(), Sve.CreateTrueMaskInt64());
+        //ARM64-FULL-LINE: trn1 {{z[0-9]+}}.d, {{z[0-9]+}}.d, {{z[0-9]+}}.d
+        return Sve.TransposeEven(v, Sve.CreateTrueMaskInt64());
     }

     [MethodImpl(MethodImplOptions.NoInlining)]
     static Vector<short> TransposeOdd()
     {
+        //ARM64-FULL-LINE: trn2 {{z[0-9]+}}.h, {{z[0-9]+}}.h, {{z[0-9]+}}.h
         return Sve.TransposeOdd(Vector<short>.Zero, Sve.CreateTrueMaskInt16());
     }

     [MethodImpl(MethodImplOptions.NoInlining)]
     static Vector<short> ReverseElement()
     {
+        //ARM64-FULL-LINE: rev {{z[0-9]+}}.h, {{z[0-9]+}}.h
         return Sve.ReverseElement(Sve.CreateTrueMaskInt16());
     }

     [MethodImpl(MethodImplOptions.NoInlining)]
     static Vector<short> And()
     {
+        //ARM64-FULL-LINE: and {{z[0-9]+}}.h, {{z[0-9]+}}.h, {{z[0-9]+}}.h
         return Sve.ConditionalSelect(
             Sve.CreateTrueMaskInt16(),
             Sve.And(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()),
@@ -87,8 +119,9 @@ static Vector<short> And()
     [MethodImpl(MethodImplOptions.NoInlining)]
     static Vector<short> BitwiseClear()
     {
+        //ARM64-FULL-LINE: bic {{z[0-9]+}}.h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
         return Sve.ConditionalSelect(
-            Sve.CreateFalseMaskInt16(),
+            Sve.CreateTrueMaskInt16(SveMaskPattern.VectorCount1),
             Sve.BitwiseClear(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()),
             Vector<short>.Zero
         );
@@ -97,8 +130,9 @@ static Vector<short> BitwiseClear()
     [MethodImpl(MethodImplOptions.NoInlining)]
     static Vector<int> Xor()
     {
+        //ARM64-FULL-LINE: eor {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
         return Sve.ConditionalSelect(
-            Sve.CreateTrueMaskInt32(),
+            Sve.CreateTrueMaskInt32(SveMaskPattern.VectorCount3),
             Sve.Xor(Sve.CreateTrueMaskInt32(), Sve.CreateTrueMaskInt32()),
             Vector<int>.Zero
         );
@@ -107,20 +141,156 @@ static Vector<int> Xor()
     [MethodImpl(MethodImplOptions.NoInlining)]
     static Vector<short> Or()
     {
+        //ARM64-FULL-LINE: orr {{z[0-9]+}}.h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
         return Sve.ConditionalSelect(
-            Sve.CreateTrueMaskInt16(),
+            Sve.CreateTrueMaskInt16(SveMaskPattern.VectorCount1),
             Sve.Or(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()),
             Vector<short>.Zero
         );
     }

     [MethodImpl(MethodImplOptions.NoInlining)]
-    static Vector<int> ConditionalSelect()
+    static Vector<int> ConditionalSelect(Vector<int> v)
     {
+        // Use a passed in vector for the mask to prevent optimising away the select
+        //ARM64-FULL-LINE: sel {{z[0-9]+}}.s, {{p[0-9]+}}, {{z[0-9]+}}.s, {{z[0-9]+}}.s
         return Sve.ConditionalSelect(
-            Vector<int>.Zero,
+            v,
             Sve.CreateFalseMaskInt32(),
             Sve.CreateTrueMaskInt32()
         );
     }
-}
\ No newline at end of file
+
+
+    // These should use the predicate variants.
+    // CreateBreakAfterMask is used as the first argument is a predicate.
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static Vector<short> ZipLowMask()
+    {
+        //ARM64-FULL-LINE: zip1 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h
+        return Sve.CreateBreakAfterMask(Sve.ZipLow(Vector<short>.Zero, Sve.CreateTrueMaskInt16()), Sve.CreateTrueMaskInt16());
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static Vector<uint> ZipHighMask(Vector<uint> v)
+    {
+        //ARM64-FULL-LINE: zip2 {{p[0-9]+}}.s, {{p[0-9]+}}.s, {{p[0-9]+}}.s
+        return Sve.CreateBreakAfterMask(Sve.ZipHigh(Sve.CreateTrueMaskUInt32(), v), Sve.CreateTrueMaskUInt32());
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static Vector<sbyte> UnzipEvenMask()
+    {
+        //ARM64-FULL-LINE: uzp1 {{p[0-9]+}}.b, {{p[0-9]+}}.b, {{p[0-9]+}}.b
+        return Sve.CreateBreakAfterMask(Sve.UnzipEven(Sve.CreateTrueMaskSByte(), Vector<sbyte>.Zero), Sve.CreateTrueMaskSByte());
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static Vector<short> UnzipOddMask()
+    {
+        //ARM64-FULL-LINE: uzp2 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h
+        return Sve.CreateBreakAfterMask(Sve.UnzipOdd(Sve.CreateTrueMaskInt16(), Sve.CreateFalseMaskInt16()), Sve.CreateTrueMaskInt16());
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static Vector<long> TransposeEvenMask(Vector<long> v)
+    {
+        //ARM64-FULL-LINE: trn1 {{p[0-9]+}}.d, {{p[0-9]+}}.d, {{p[0-9]+}}.d
+        return Sve.CreateBreakAfterMask(Sve.TransposeEven(v, Sve.CreateTrueMaskInt64()), Sve.CreateFalseMaskInt64());
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static Vector<short> TransposeOddMask()
+    {
+        //ARM64-FULL-LINE: trn2 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h
+        return Sve.CreateBreakAfterMask(Sve.TransposeOdd(Vector<short>.Zero, Sve.CreateTrueMaskInt16()), Sve.CreateTrueMaskInt16());
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static Vector<short> ReverseElementMask()
+    {
+        //ARM64-FULL-LINE: rev {{p[0-9]+}}.h, {{p[0-9]+}}.h
+        return Sve.CreateBreakAfterMask(Sve.ReverseElement(Sve.CreateTrueMaskInt16()), Sve.CreateFalseMaskInt16());
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static Vector<short> AndMask()
+    {
+        //ARM64-FULL-LINE: and {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b
+        return Sve.CreateBreakAfterMask(Sve.ConditionalSelect(
+            Sve.CreateTrueMaskInt16(),
+            Sve.And(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()),
+            Vector<short>.Zero
+        ), Sve.CreateFalseMaskInt16());
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static Vector<short> BitwiseClearMask()
+    {
+        //ARM64-FULL-LINE: bic {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b
+        return Sve.CreateBreakAfterMask(Sve.ConditionalSelect(
+            Sve.CreateTrueMaskInt16(),
+            Sve.BitwiseClear(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()),
+            Vector<short>.Zero
+        ), Sve.CreateFalseMaskInt16());
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static Vector<int> XorMask()
+    {
+        //ARM64-FULL-LINE: eor {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b
+        return Sve.CreateBreakAfterMask(Sve.ConditionalSelect(
+            Sve.CreateTrueMaskInt32(),
+            Sve.Xor(Sve.CreateTrueMaskInt32(), Sve.CreateTrueMaskInt32()),
+            Vector<int>.Zero
+        ), Sve.CreateFalseMaskInt32());
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static Vector<short> OrMask()
+    {
+        //ARM64-FULL-LINE: orr {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b
+        return Sve.CreateBreakAfterMask(Sve.ConditionalSelect(
+            Sve.CreateTrueMaskInt16(),
+            Sve.Or(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()),
+            Vector<short>.Zero
+        ), Sve.CreateFalseMaskInt16());
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static Vector<int> ConditionalSelectMask(Vector<int> v)
+    {
+        // Use a passed in vector for the mask to prevent optimising away the select
+        //ARM64-FULL-LINE: sel {{p[0-9]+}}.b, {{p[0-9]+}}, {{p[0-9]+}}.b, {{p[0-9]+}}.b
+        return Sve.CreateBreakAfterMask(Sve.ConditionalSelect(
+            v,
+            Sve.CreateFalseMaskInt32(),
+            Sve.CreateTrueMaskInt32()
+        ), Sve.CreateFalseMaskInt32());
+    }
+
+    // These have multiple uses of the predicate variants
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static Vector<short> UnzipEvenZipLowMask()
+    {
+        //ARM64-FULL-LINE: zip1 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h
+        //ARM64-FULL-LINE: uzp1 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h
+        return Sve.CreateBreakAfterMask(Sve.UnzipEven(Sve.ZipLow(Vector<short>.Zero, Sve.CreateTrueMaskInt16()), Vector<short>.Zero), Sve.CreateTrueMaskInt16());
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static Vector<short> TransposeEvenAndMask(Vector<short> v)
+    {
+        //ARM64-FULL-LINE: and {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b
+        //ARM64-FULL-LINE: trn1 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h
+        return Sve.CreateBreakAfterMask(Sve.TransposeEven(Sve.CreateTrueMaskInt16(), Sve.ConditionalSelect(
+            Sve.CreateTrueMaskInt16(),
+            Sve.And(v, Sve.CreateTrueMaskInt16()),
+            Vector<short>.Zero
+        )), Sve.CreateFalseMaskInt16());
+    }
+}