Skip to content

Commit 6d3cb53

Browse files
Decompose some bitwise operations in HIR to allow more overall optimizations to kick in (#104517)
* Decompose some bitwise operations in HIR to allow more overall optimizations to kick in
* Ensure that we actually remove the underlying op
* Ensure the AND_NOT decomposition is still folded during import for minopts
* Ensure we propagate AllBitsSet into simd GT_XOR on xarch
* Ensure that we prefer AndNot over TernaryLogic
* Cleanup the TernaryLogic lowering code
* Ensure that TernaryLogic picks the best operand for containment
* Ensure we swap the operands that are being checked for containment
* Ensure that TernaryLogic is simplified where possible
* Apply formatting patch
1 parent 5336e18 commit 6d3cb53

14 files changed

+1469
-566
lines changed

src/coreclr/jit/gentree.cpp

+63-142
Original file line numberDiff line numberDiff line change
@@ -20506,10 +20506,17 @@ GenTree* Compiler::gtNewSimdAbsNode(var_types type, GenTree* op1, CorInfoType si
2050620506

2050720507
GenTree* bitMask;
2050820508

20509-
bitMask = gtNewDconNode(-0.0, simdBaseType);
20510-
bitMask = gtNewSimdCreateBroadcastNode(type, bitMask, simdBaseJitType, simdSize);
20511-
20512-
return gtNewSimdBinOpNode(GT_AND_NOT, type, op1, bitMask, simdBaseJitType, simdSize);
20509+
if (simdBaseType == TYP_FLOAT)
20510+
{
20511+
bitMask = gtNewIconNode(0x7FFFFFFF);
20512+
bitMask = gtNewSimdCreateBroadcastNode(type, bitMask, CORINFO_TYPE_INT, simdSize);
20513+
}
20514+
else
20515+
{
20516+
bitMask = gtNewLconNode(0x7FFFFFFFFFFFFFFF);
20517+
bitMask = gtNewSimdCreateBroadcastNode(type, bitMask, CORINFO_TYPE_LONG, simdSize);
20518+
}
20519+
return gtNewSimdBinOpNode(GT_AND, type, op1, bitMask, simdBaseJitType, simdSize);
2051320520
}
2051420521

2051520522
NamedIntrinsic intrinsic = NI_Illegal;
@@ -20750,12 +20757,6 @@ GenTree* Compiler::gtNewSimdBinOpNode(
2075020757
}
2075120758
}
2075220759
}
20753-
20754-
if (op == GT_AND_NOT)
20755-
{
20756-
// GT_AND_NOT expects `op1 & ~op2`, but xarch does `~op1 & op2`
20757-
needsReverseOps = true;
20758-
}
2075920760
break;
2076020761
}
2076120762
#endif // TARGET_XARCH
@@ -20786,11 +20787,34 @@ GenTree* Compiler::gtNewSimdBinOpNode(
2078620787

2078720788
if (intrinsic != NI_Illegal)
2078820789
{
20790+
if (op == GT_AND_NOT)
20791+
{
20792+
assert(fgNodeThreading == NodeThreading::LIR);
20793+
20794+
#if defined(TARGET_XARCH)
20795+
// GT_AND_NOT expects `op1 & ~op2`, but xarch does `~op1 & op2`
20796+
// We specially handle this here since we're only producing a
20797+
// native intrinsic node in LIR
20798+
20799+
std::swap(op1, op2);
20800+
#endif // TARGET_XARCH
20801+
}
2078920802
return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize);
2079020803
}
2079120804

2079220805
switch (op)
2079320806
{
20807+
case GT_AND_NOT:
20808+
{
20809+
// Prior to LIR, we want to explicitly decompose this operation so that downstream phases can
20810+
// appropriately optimize around the individual operations being performed, particularly ~op2,
20811+
// and produce overall better codegen.
20812+
assert(fgNodeThreading != NodeThreading::LIR);
20813+
20814+
op2 = gtNewSimdUnOpNode(GT_NOT, type, op2, simdBaseJitType, simdSize);
20815+
return gtNewSimdBinOpNode(GT_AND, type, op1, op2, simdBaseJitType, simdSize);
20816+
}
20817+
2079420818
#if defined(TARGET_XARCH)
2079520819
case GT_RSZ:
2079620820
{
@@ -20955,9 +20979,6 @@ GenTree* Compiler::gtNewSimdBinOpNode(
2095520979
vecCon1->gtSimdVal.u64[i] = 0x00FF00FF00FF00FF;
2095620980
}
2095720981

20958-
// Validate we can't use AVX512F_VL_TernaryLogic here
20959-
assert(!canUseEvexEncodingDebugOnly());
20960-
2096120982
// Vector256<short> maskedProduct = Avx2.And(widenedProduct, vecCon1).AsInt16()
2096220983
GenTree* maskedProduct = gtNewSimdBinOpNode(GT_AND, widenedType, widenedProduct, vecCon1,
2096320984
widenedSimdBaseJitType, widenedSimdSize);
@@ -21922,9 +21943,6 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
2192221943
v = gtNewSimdHWIntrinsicNode(type, v, gtNewIconNode(SHUFFLE_ZZXX, TYP_INT), NI_SSE2_Shuffle,
2192321944
CORINFO_TYPE_INT, simdSize);
2192421945

21925-
// Validate we can't use AVX512F_VL_TernaryLogic here
21926-
assert(!canUseEvexEncodingDebugOnly());
21927-
2192821946
op2 = gtNewSimdBinOpNode(GT_AND, type, u, v, simdBaseJitType, simdSize);
2192921947
return gtNewSimdBinOpNode(GT_OR, type, op1, op2, simdBaseJitType, simdSize);
2193021948
}
@@ -24315,9 +24333,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(
2431524333

2431624334
GenTree* vecCon2 = gtCloneExpr(vecCon1);
2431724335

24318-
// Validate we can't use AVX512F_VL_TernaryLogic here
24319-
assert(!canUseEvexEncodingDebugOnly());
24320-
2432124336
tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
2432224337
tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);
2432324338
tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_AVX2_PackUnsignedSaturate, CORINFO_TYPE_UBYTE,
@@ -24356,9 +24371,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(
2435624371

2435724372
GenTree* vecCon2 = gtCloneExpr(vecCon1);
2435824373

24359-
// Validate we can't use AVX512F_VL_TernaryLogic here
24360-
assert(!canUseEvexEncodingDebugOnly());
24361-
2436224374
tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
2436324375
tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);
2436424376
tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_AVX2_PackUnsignedSaturate, CORINFO_TYPE_USHORT,
@@ -24460,9 +24472,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(
2446024472

2446124473
GenTree* vecCon2 = gtCloneExpr(vecCon1);
2446224474

24463-
// Validate we can't use AVX512F_VL_TernaryLogic here
24464-
assert(!canUseEvexEncodingDebugOnly());
24465-
2446624475
tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
2446724476
tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);
2446824477

@@ -24499,9 +24508,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(
2449924508

2450024509
GenTree* vecCon2 = gtCloneExpr(vecCon1);
2450124510

24502-
// Validate we can't use AVX512F_VL_TernaryLogic here
24503-
assert(!canUseEvexEncodingDebugOnly());
24504-
2450524511
tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
2450624512
tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);
2450724513

@@ -28120,6 +28126,14 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp,
2812028126
assert(!isScalar);
2812128127
assert(op2->TypeIs(simdType));
2812228128

28129+
if (comp->fgNodeThreading != NodeThreading::LIR)
28130+
{
28131+
// We don't want to support creating AND_NOT nodes prior to LIR
28132+
// as it can break important optimizations. We'll produce this
28133+
// in lowering instead.
28134+
break;
28135+
}
28136+
2812328137
#if defined(TARGET_XARCH)
2812428138
if (simdSize == 64)
2812528139
{
@@ -29187,6 +29201,21 @@ bool GenTreeHWIntrinsic::ShouldConstantProp(GenTree* operand, GenTreeVecCon* vec
2918729201
return IsUserCall() && (operand == Op(2));
2918829202
}
2918929203

29204+
#if defined(TARGET_XARCH)
29205+
case NI_SSE_Xor:
29206+
case NI_SSE2_Xor:
29207+
case NI_AVX_Xor:
29208+
case NI_AVX2_Xor:
29209+
case NI_AVX512F_Xor:
29210+
case NI_AVX512DQ_Xor:
29211+
case NI_AVX10v1_V512_Xor:
29212+
{
29213+
// We recognize this as GT_NOT which can enable other optimizations
29214+
assert(GetOperandCount() == 2);
29215+
return vecCon->IsVectorAllBitsSet();
29216+
}
29217+
#endif // TARGET_XARCH
29218+
2919029219
default:
2919129220
{
2919229221
break;
@@ -29936,7 +29965,8 @@ bool GenTreeLclVar::IsNeverNegative(Compiler* comp) const
2993629965
unsigned GenTreeHWIntrinsic::GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree* op1, GenTree* op2, GenTree* op3)
2993729966
{
2993829967
#if defined(TARGET_XARCH)
29939-
assert(HWIntrinsicInfo::IsFmaIntrinsic(gtHWIntrinsicId) || HWIntrinsicInfo::IsPermuteVar2x(gtHWIntrinsicId));
29968+
assert(HWIntrinsicInfo::IsFmaIntrinsic(gtHWIntrinsicId) || HWIntrinsicInfo::IsPermuteVar2x(gtHWIntrinsicId) ||
29969+
HWIntrinsicInfo::IsTernaryLogic(gtHWIntrinsicId));
2994029970
#elif defined(TARGET_ARM64)
2994129971
assert(HWIntrinsicInfo::IsFmaIntrinsic(gtHWIntrinsicId));
2994229972
#endif
@@ -29980,85 +30010,6 @@ unsigned GenTreeHWIntrinsic::GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree
2998030010

2998130011
return 0;
2998230012
}
29983-
29984-
//------------------------------------------------------------------------
29985-
// GetTernaryControlByte: calculate the value of the control byte for ternary node
29986-
// with given logic nodes on the input.
29987-
//
29988-
// Return value: the value of the ternary control byte.
29989-
uint8_t GenTreeHWIntrinsic::GetTernaryControlByte(GenTreeHWIntrinsic* second) const
29990-
{
29991-
// we assume we have a structure like:
29992-
/*
29993-
/- A
29994-
+- B
29995-
t1 = binary logical op1
29996-
29997-
/- C
29998-
+- t1
29999-
t2 = binary logical op2
30000-
*/
30001-
30002-
// To calculate the control byte value:
30003-
// The way the constants work is we have three keys:
30004-
// * A: 0xF0
30005-
// * B: 0xCC
30006-
// * C: 0xAA
30007-
//
30008-
// To compute the correct control byte, you simply perform the corresponding operation on these keys. So, if you
30009-
// wanted to do (A & B) ^ C, you would compute (0xF0 & 0xCC) ^ 0xAA or 0x6A.
30010-
assert(second->Op(1) == this || second->Op(2) == this);
30011-
const uint8_t A = 0xF0;
30012-
const uint8_t B = 0xCC;
30013-
const uint8_t C = 0xAA;
30014-
30015-
bool isScalar = false;
30016-
30017-
genTreeOps firstOper = GetOperForHWIntrinsicId(&isScalar);
30018-
assert(!isScalar);
30019-
30020-
genTreeOps secondOper = second->GetOperForHWIntrinsicId(&isScalar);
30021-
assert(!isScalar);
30022-
30023-
uint8_t AB = 0;
30024-
uint8_t ABC = 0;
30025-
30026-
if (firstOper == GT_AND)
30027-
{
30028-
AB = A & B;
30029-
}
30030-
else if (firstOper == GT_OR)
30031-
{
30032-
AB = A | B;
30033-
}
30034-
else if (firstOper == GT_XOR)
30035-
{
30036-
AB = A ^ B;
30037-
}
30038-
else
30039-
{
30040-
unreached();
30041-
}
30042-
30043-
if (secondOper == GT_AND)
30044-
{
30045-
ABC = AB & C;
30046-
}
30047-
else if (secondOper == GT_OR)
30048-
{
30049-
ABC = AB | C;
30050-
}
30051-
else if (secondOper == GT_XOR)
30052-
{
30053-
ABC = AB ^ C;
30054-
}
30055-
else
30056-
{
30057-
unreached();
30058-
}
30059-
30060-
return ABC;
30061-
}
3006230013
#endif // TARGET_XARCH && FEATURE_HW_INTRINSICS
3006330014

3006430015
unsigned GenTreeLclFld::GetSize() const
@@ -30454,13 +30405,8 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
3045430405
bool isScalar = false;
3045530406
genTreeOps oper = tree->GetOperForHWIntrinsicId(&isScalar);
3045630407

30457-
#if defined(TARGET_XARCH)
30458-
if (oper == GT_AND_NOT)
30459-
{
30460-
// xarch does: ~op1 & op2, we need op1 & ~op2
30461-
std::swap(op1, op2);
30462-
}
30463-
#endif // TARGET_XARCH
30408+
// We shouldn't find AND_NOT nodes since it should only be produced in lowering
30409+
assert(oper != GT_AND_NOT);
3046430410

3046530411
GenTree* cnsNode = nullptr;
3046630412
GenTree* otherNode = nullptr;
@@ -30973,31 +30919,6 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
3097330919
break;
3097430920
}
3097530921

30976-
case GT_AND_NOT:
30977-
{
30978-
// Handle `x & ~0 == x` and `0 & ~x == 0`
30979-
if (cnsNode->IsVectorZero())
30980-
{
30981-
if (cnsNode == op1)
30982-
{
30983-
resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT);
30984-
break;
30985-
}
30986-
else
30987-
{
30988-
resultNode = otherNode;
30989-
}
30990-
break;
30991-
}
30992-
30993-
// Handle `x & ~AllBitsSet == 0`
30994-
if (cnsNode->IsVectorAllBitsSet() && (cnsNode == op2))
30995-
{
30996-
resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT);
30997-
}
30998-
break;
30999-
}
31000-
3100130922
case GT_DIV:
3100230923
{
3100330924
if (varTypeIsFloating(simdBaseType))
@@ -31388,12 +31309,12 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
3138831309
{
3138931310
switch (ni)
3139031311
{
31391-
case NI_Vector128_ConditionalSelect:
3139231312
#if defined(TARGET_XARCH)
31313+
case NI_Vector128_ConditionalSelect:
3139331314
case NI_Vector256_ConditionalSelect:
3139431315
case NI_Vector512_ConditionalSelect:
3139531316
#elif defined(TARGET_ARM64)
31396-
case NI_Vector64_ConditionalSelect:
31317+
case NI_AdvSimd_BitwiseSelect:
3139731318
case NI_Sve_ConditionalSelect:
3139831319
#endif
3139931320
{

src/coreclr/jit/gentree.h

-1
Original file line numberDiff line numberDiff line change
@@ -6527,7 +6527,6 @@ struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic
65276527
bool OperRequiresGlobRefFlag() const;
65286528

65296529
unsigned GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree* op1, GenTree* op2, GenTree* op3);
6530-
uint8_t GetTernaryControlByte(GenTreeHWIntrinsic* second) const;
65316530

65326531
ClassLayout* GetLayout(Compiler* compiler) const;
65336532

0 commit comments

Comments (0)