
Commit d3ba163
Committed Jul 7, 2024
Decompose some bitwise operations in HIR to allow more overall optimizations to kick in
1 parent 4addcaa commit d3ba163

11 files changed: +380 −237 lines
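In short: and-not style operations are now built as a separate `GT_NOT` feeding a `GT_AND` while the IR is still in HIR form, and are only re-fused into a single and-not hardware intrinsic during lowering (LIR), so the generic folding paths touched below can see the individual operations. A minimal scalar sketch of the identity this relies on, using plain C++ stand-ins rather than the JIT's node constructors:

```cpp
#include <cassert>
#include <cstdint>

// Fused form: what lowering ultimately emits as a single andn/bic-style operation.
uint64_t AndNotFused(uint64_t op1, uint64_t op2) { return op1 & ~op2; }

// Decomposed form: what HIR now carries as a GT_AND over a GT_NOT.
uint64_t NotNode(uint64_t v) { return ~v; }
uint64_t AndNode(uint64_t a, uint64_t b) { return a & b; }

int main()
{
    uint64_t op1 = 0xF0F0F0F0, op2 = 0x00FF00FF;

    // The rewrite is exact, so it can be applied during import/HIR
    // and undone in lowering without changing results.
    assert(AndNotFused(op1, op2) == AndNode(op1, NotNode(op2)));
    return 0;
}
```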
 

src/coreclr/jit/gentree.cpp

+35 −58
@@ -20861,12 +20861,6 @@ GenTree* Compiler::gtNewSimdBinOpNode(
                 }
             }
         }
-
-        if (op == GT_AND_NOT)
-        {
-            // GT_AND_NOT expects `op1 & ~op2`, but xarch does `~op1 & op2`
-            needsReverseOps = true;
-        }
         break;
     }
 #endif // TARGET_XARCH
@@ -20897,11 +20891,34 @@ GenTree* Compiler::gtNewSimdBinOpNode(
 
     if (intrinsic != NI_Illegal)
     {
+        if (op == GT_AND_NOT)
+        {
+            assert(fgNodeThreading == NodeThreading::LIR);
+
+#if defined(TARGET_XARCH)
+            // GT_AND_NOT expects `op1 & ~op2`, but xarch does `~op1 & op2`
+            // We specially handle this here since we're only producing a
+            // native intrinsic node in LIR
+
+            std::swap(op1, op2);
+#endif // TARGET_XARCH
+        }
         return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize);
     }
 
     switch (op)
     {
+        case GT_AND_NOT:
+        {
+            // Prior to LIR, we want to explicitly decompose this operation so that downstream phases can
+            // appropriately optimize around the individual operations being performed, particularly ~op2,
+            // and produce overall better codegen.
+            assert(fgNodeThreading != NodeThreading::LIR);
+
+            op2 = gtNewSimdUnOpNode(GT_NOT, type, op2, simdBaseJitType, simdSize);
+            return gtNewSimdBinOpNode(GT_AND, type, op1, op2, simdBaseJitType, simdSize);
+        }
+
 #if defined(TARGET_XARCH)
         case GT_RSZ:
        {
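For context on the `std::swap` above: xarch's andn-style instructions compute `~src1 & src2`, while `GT_AND_NOT` means `op1 & ~op2`, so the operand order has to be reversed exactly once, at the point the native intrinsic node is created in LIR. A small standalone sketch of the mismatch (illustrative C++, not JIT code):

```cpp
#include <cassert>
#include <cstdint>

// GT_AND_NOT semantics in the IR: op1 & ~op2
uint64_t IrAndNot(uint64_t op1, uint64_t op2) { return op1 & ~op2; }

// xarch PANDN/VPANDN semantics: ~src1 & src2
uint64_t XarchAndn(uint64_t src1, uint64_t src2) { return ~src1 & src2; }

int main()
{
    uint64_t a = 0xF0F0, b = 0x00FF;

    // Passing the IR operands through unchanged would compute the wrong value...
    assert(IrAndNot(a, b) != XarchAndn(a, b));

    // ...so the operands are swapped when the hardware intrinsic node is built.
    assert(IrAndNot(a, b) == XarchAndn(b, a));
    return 0;
}
```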
@@ -21066,9 +21083,6 @@ GenTree* Compiler::gtNewSimdBinOpNode(
                     vecCon1->gtSimdVal.u64[i] = 0x00FF00FF00FF00FF;
                 }
 
-                // Validate we can't use AVX512F_VL_TernaryLogic here
-                assert(!canUseEvexEncodingDebugOnly());
-
                 // Vector256<short> maskedProduct = Avx2.And(widenedProduct, vecCon1).AsInt16()
                 GenTree* maskedProduct = gtNewSimdBinOpNode(GT_AND, widenedType, widenedProduct, vecCon1,
                                                             widenedSimdBaseJitType, widenedSimdSize);
@@ -22033,9 +22047,6 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
             v = gtNewSimdHWIntrinsicNode(type, v, gtNewIconNode(SHUFFLE_ZZXX, TYP_INT), NI_SSE2_Shuffle,
                                          CORINFO_TYPE_INT, simdSize);
 
-            // Validate we can't use AVX512F_VL_TernaryLogic here
-            assert(!canUseEvexEncodingDebugOnly());
-
             op2 = gtNewSimdBinOpNode(GT_AND, type, u, v, simdBaseJitType, simdSize);
             return gtNewSimdBinOpNode(GT_OR, type, op1, op2, simdBaseJitType, simdSize);
         }
@@ -24146,9 +24157,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(
 
                 GenTree* vecCon2 = gtCloneExpr(vecCon1);
 
-                // Validate we can't use AVX512F_VL_TernaryLogic here
-                assert(!canUseEvexEncodingDebugOnly());
-
                 tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
                 tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);
                 tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_AVX2_PackUnsignedSaturate, CORINFO_TYPE_UBYTE,
@@ -24187,9 +24195,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(
 
                 GenTree* vecCon2 = gtCloneExpr(vecCon1);
 
-                // Validate we can't use AVX512F_VL_TernaryLogic here
-                assert(!canUseEvexEncodingDebugOnly());
-
                 tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
                 tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);
                 tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_AVX2_PackUnsignedSaturate, CORINFO_TYPE_USHORT,
@@ -24291,9 +24296,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(
 
                 GenTree* vecCon2 = gtCloneExpr(vecCon1);
 
-                // Validate we can't use AVX512F_VL_TernaryLogic here
-                assert(!canUseEvexEncodingDebugOnly());
-
                 tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
                 tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);
 
@@ -24330,9 +24332,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(
 
                 GenTree* vecCon2 = gtCloneExpr(vecCon1);
 
-                // Validate we can't use AVX512F_VL_TernaryLogic here
-                assert(!canUseEvexEncodingDebugOnly());
-
                 tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
                 tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);
 
@@ -27821,6 +27820,14 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp,
             assert(!isScalar);
             assert(op2->TypeIs(simdType));
 
+            if (comp->fgNodeThreading != NodeThreading::LIR)
+            {
+                // We don't want to support creating AND_NOT nodes prior to LIR
+                // as it can break important optimizations. We'll produce this
+                // in lowering instead.
+                break;
+            }
+
 #if defined(TARGET_XARCH)
             if (simdSize == 64)
             {
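The reason for refusing to map to an AND_NOT node before LIR is that a fused node hides the individual `GT_AND`/`GT_NOT` operations from the tree-based folds. A contrived scalar example of a simplification that only fires while the pieces are separate (plain C++, not the JIT's folding code):

```cpp
#include <cassert>
#include <cstdint>

int main()
{
    uint64_t x = 0xABCD, y = 0x0F0F;

    // Kept decomposed, `x & ~(~y)` hits the generic double-negation fold
    // (~~y == y) and simplifies to a plain `x & y`.
    assert((x & ~(~y)) == (x & y));

    // Had the importer eagerly fused the outer AND/NOT pair into a single
    // AND_NOT node, the two NOTs would no longer be adjacent in the tree and
    // that fold would be missed; the fused node is therefore only formed in
    // lowering, once high-level folding is done.
    return 0;
}
```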
@@ -30155,13 +30162,8 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
     bool isScalar = false;
     genTreeOps oper = tree->GetOperForHWIntrinsicId(&isScalar);
 
-#if defined(TARGET_XARCH)
-    if (oper == GT_AND_NOT)
-    {
-        // xarch does: ~op1 & op2, we need op1 & ~op2
-        std::swap(op1, op2);
-    }
-#endif // TARGET_XARCH
+    // We shouldn't find AND_NOT nodes since they should only be produced in lowering
+    assert(oper != GT_AND_NOT);
 
     GenTree* cnsNode = nullptr;
     GenTree* otherNode = nullptr;
@@ -30674,31 +30676,6 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
                     break;
                 }
 
-                case GT_AND_NOT:
-                {
-                    // Handle `x & ~0 == x` and `0 & ~x == 0`
-                    if (cnsNode->IsVectorZero())
-                    {
-                        if (cnsNode == op1)
-                        {
-                            resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT);
-                            break;
-                        }
-                        else
-                        {
-                            resultNode = otherNode;
-                        }
-                        break;
-                    }
-
-                    // Handle `x & ~AllBitsSet == 0`
-                    if (cnsNode->IsVectorAllBitsSet() && (cnsNode == op2))
-                    {
-                        resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT);
-                    }
-                    break;
-                }
-
                 case GT_DIV:
                 {
                     if (varTypeIsFloating(simdBaseType))
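The constant folds deleted above are not lost: after decomposition, the same results fall out of the existing `GT_NOT` and `GT_AND` folds. A scalar sketch of the three identities the removed case handled (illustrative C++ only; the real code operates on vector constants and must also preserve side effects via `gtWrapWithSideEffects`):

```cpp
#include <cassert>
#include <cstdint>

int main()
{
    uint64_t       x          = 0x12345678;
    const uint64_t zero       = 0;
    const uint64_t allBitsSet = ~0ull;

    assert((x & ~zero) == x);          // x & ~0          == x
    assert((zero & ~x) == zero);       // 0 & ~x          == 0
    assert((x & ~allBitsSet) == zero); // x & ~AllBitsSet == 0
    return 0;
}
```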
@@ -31089,12 +31066,12 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
         {
             switch (ni)
             {
-                case NI_Vector128_ConditionalSelect:
 #if defined(TARGET_XARCH)
+                case NI_Vector128_ConditionalSelect:
                 case NI_Vector256_ConditionalSelect:
                 case NI_Vector512_ConditionalSelect:
 #elif defined(TARGET_ARM64)
-                case NI_Vector64_ConditionalSelect:
+                case NI_AdvSimd_BitwiseSelect:
                 case NI_Sve_ConditionalSelect:
 #endif
                 {

src/coreclr/jit/hwintrinsicarm64.cpp

+34
@@ -611,6 +611,40 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
             break;
         }
 
+        case NI_AdvSimd_BitwiseClear:
+        {
+            assert(sig->numArgs == 2);
+
+            // We don't want to support creating AND_NOT nodes prior to LIR
+            // as it can break important optimizations. We'll produce this
+            // in lowering instead, so decompose into the individual operations
+            // on import
+
+            op2 = impSIMDPopStack();
+            op1 = impSIMDPopStack();
+
+            op2 = gtNewSimdUnOpNode(GT_NOT, retType, op2, simdBaseJitType, simdSize);
+            retNode = gtNewSimdBinOpNode(GT_AND, retType, op1, op2, simdBaseJitType, simdSize);
+            break;
+        }
+
+        case NI_AdvSimd_OrNot:
+        {
+            assert(sig->numArgs == 2);
+
+            // We don't want to support creating OR_NOT nodes prior to LIR
+            // as it can break important optimizations. We'll produce this
+            // in lowering instead, so decompose into the individual operations
+            // on import
+
+            op2 = impSIMDPopStack();
+            op1 = impSIMDPopStack();
+
+            op2 = gtNewSimdUnOpNode(GT_NOT, retType, op2, simdBaseJitType, simdSize);
+            retNode = gtNewSimdBinOpNode(GT_OR, retType, op1, op2, simdBaseJitType, simdSize);
+            break;
+        }
+
         case NI_Vector64_AndNot:
         case NI_Vector128_AndNot:
         {
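For reference, the AdvSimd semantics being decomposed here are `BitwiseClear(value, mask) == value & ~mask` (the `bic` instruction) and `OrNot(value, mask) == value | ~mask` (`orn`). A scalar sketch of those definitions (illustrative only; the real intrinsics operate per element on vectors):

```cpp
#include <cassert>
#include <cstdint>

uint64_t BitwiseClear(uint64_t value, uint64_t mask) { return value & ~mask; } // bic
uint64_t OrNot(uint64_t value, uint64_t mask)        { return value | ~mask; } // orn

int main()
{
    uint64_t value = 0xF0F0, mask = 0x00FF;

    assert(BitwiseClear(value, mask) == 0xF000);
    assert(OrNot(value, mask) == 0xFFFFFFFFFFFFFFF0);
    return 0;
}
```

On import these become `GT_AND`/`GT_OR` over a `GT_NOT` of the mask, and lowering is expected to fuse them back into `bic`/`orn` where profitable.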

src/coreclr/jit/hwintrinsiccodegenxarch.cpp

+40
@@ -2856,6 +2856,46 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node, insOpts instOption
             break;
         }
 
+        case NI_EVEX_XnorMask:
+        {
+            assert(instOptions == INS_OPTS_NONE);
+
+            uint32_t simdSize = node->GetSimdSize();
+            uint32_t count = simdSize / genTypeSize(baseType);
+
+            if (count <= 8)
+            {
+                assert((count == 2) || (count == 4) || (count == 8));
+                ins = INS_kxnorb;
+            }
+            else if (count == 16)
+            {
+                ins = INS_kxnorw;
+            }
+            else if (count == 32)
+            {
+                ins = INS_kxnord;
+            }
+            else
+            {
+                assert(count == 64);
+                ins = INS_kxnorq;
+            }
+
+            op1Reg = op1->GetRegNum();
+
+            GenTree* op2 = node->Op(2);
+            regNumber op2Reg = op2->GetRegNum();
+
+            assert(emitter::isMaskReg(targetReg));
+            assert(emitter::isMaskReg(op1Reg));
+            assert(emitter::isMaskReg(op2Reg));
+
+            // Use EA_32BYTE to ensure the VEX.L bit gets set
+            emit->emitIns_R_R_R(ins, EA_32BYTE, targetReg, op1Reg, op2Reg);
+            break;
+        }
+
         case NI_AVX512F_ConvertToInt32:
         case NI_AVX512F_ConvertToUInt32:
         case NI_AVX512F_ConvertToUInt32WithTruncation:
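The `count`-based selection above picks the mask-register form whose width matches the number of vector elements. A standalone sketch mirroring that selection logic (plain C++ returning the mnemonic name instead of emitting an instruction; the helper and its values are illustrative, not a JIT API):

```cpp
#include <cassert>
#include <cstdint>
#include <string>

// Pick the kxnor variant whose mask width covers one bit per element.
std::string PickKxnor(uint32_t simdSize, uint32_t elementSize)
{
    uint32_t count = simdSize / elementSize;

    if (count <= 8)
    {
        assert((count == 2) || (count == 4) || (count == 8));
        return "kxnorb"; // 8-bit mask
    }
    if (count == 16)
    {
        return "kxnorw"; // 16-bit mask
    }
    if (count == 32)
    {
        return "kxnord"; // 32-bit mask
    }
    assert(count == 64);
    return "kxnorq"; // 64-bit mask
}

int main()
{
    assert(PickKxnor(64, 8) == "kxnorb"); // e.g. Vector512<double>: 8 elements
    assert(PickKxnor(64, 4) == "kxnorw"); // e.g. Vector512<int>: 16 elements
    assert(PickKxnor(64, 2) == "kxnord"); // e.g. Vector512<short>: 32 elements
    assert(PickKxnor(64, 1) == "kxnorq"); // e.g. Vector512<byte>: 64 elements
    return 0;
}
```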

src/coreclr/jit/hwintrinsiclistarm64.h

+2 −2
@@ -245,7 +245,7 @@ HARDWARE_INTRINSIC(AdvSimd, AddScalar,
 HARDWARE_INTRINSIC(AdvSimd, AddWideningLower, 8, 2, true, {INS_saddl, INS_uaddl, INS_saddl, INS_uaddl, INS_saddl, INS_uaddl, INS_saddw, INS_uaddw, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_BaseTypeFromSecondArg|HW_Flag_SpecialCodeGen)
 HARDWARE_INTRINSIC(AdvSimd, AddWideningUpper, 16, 2, true, {INS_saddl2, INS_uaddl2, INS_saddl2, INS_uaddl2, INS_saddl2, INS_uaddl2, INS_saddw2, INS_uaddw2, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_BaseTypeFromSecondArg|HW_Flag_SpecialCodeGen)
 HARDWARE_INTRINSIC(AdvSimd, And, -1, 2, true, {INS_and, INS_and, INS_and, INS_and, INS_and, INS_and, INS_and, INS_and, INS_and, INS_and}, HW_Category_SIMD, HW_Flag_Commutative)
-HARDWARE_INTRINSIC(AdvSimd, BitwiseClear, -1, 2, true, {INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic}, HW_Category_SIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AdvSimd, BitwiseClear, -1, 2, true, {INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic}, HW_Category_SIMD, HW_Flag_SpecialImport)
 HARDWARE_INTRINSIC(AdvSimd, BitwiseSelect, -1, 3, true, {INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl}, HW_Category_SIMD, HW_Flag_SpecialCodeGen)
 HARDWARE_INTRINSIC(AdvSimd, Ceiling, -1, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_frintp, INS_invalid}, HW_Category_SIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AdvSimd, CeilingScalar, 8, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_frintp, INS_frintp}, HW_Category_SIMD, HW_Flag_SIMDScalar)
@@ -383,7 +383,7 @@ HARDWARE_INTRINSIC(AdvSimd, NegateSaturate,
 HARDWARE_INTRINSIC(AdvSimd, NegateScalar, 8, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_fneg, INS_fneg}, HW_Category_SIMD, HW_Flag_SIMDScalar)
 HARDWARE_INTRINSIC(AdvSimd, Not, -1, 1, true, {INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn}, HW_Category_SIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AdvSimd, Or, -1, 2, true, {INS_orr, INS_orr, INS_orr, INS_orr, INS_orr, INS_orr, INS_orr, INS_orr, INS_orr, INS_orr}, HW_Category_SIMD, HW_Flag_Commutative)
-HARDWARE_INTRINSIC(AdvSimd, OrNot, -1, 2, true, {INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn}, HW_Category_SIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AdvSimd, OrNot, -1, 2, true, {INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn}, HW_Category_SIMD, HW_Flag_SpecialImport)
 HARDWARE_INTRINSIC(AdvSimd, PolynomialMultiply, -1, 2, true, {INS_pmul, INS_pmul, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AdvSimd, PolynomialMultiplyWideningLower, 8, 2, true, {INS_pmull, INS_pmull, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AdvSimd, PolynomialMultiplyWideningUpper, 16, 2, true, {INS_pmull2, INS_pmull2, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_Commutative)
