@@ -20506,10 +20506,17 @@ GenTree* Compiler::gtNewSimdAbsNode(var_types type, GenTree* op1, CorInfoType si

        GenTree* bitMask;

-       bitMask = gtNewDconNode(-0.0, simdBaseType);
-       bitMask = gtNewSimdCreateBroadcastNode(type, bitMask, simdBaseJitType, simdSize);
-
-       return gtNewSimdBinOpNode(GT_AND_NOT, type, op1, bitMask, simdBaseJitType, simdSize);
+       if (simdBaseType == TYP_FLOAT)
+       {
+           bitMask = gtNewIconNode(0x7FFFFFFF);
+           bitMask = gtNewSimdCreateBroadcastNode(type, bitMask, CORINFO_TYPE_INT, simdSize);
+       }
+       else
+       {
+           bitMask = gtNewLconNode(0x7FFFFFFFFFFFFFFF);
+           bitMask = gtNewSimdCreateBroadcastNode(type, bitMask, CORINFO_TYPE_LONG, simdSize);
+       }
+       return gtNewSimdBinOpNode(GT_AND, type, op1, bitMask, simdBaseJitType, simdSize);
    }

    NamedIntrinsic intrinsic = NI_Illegal;
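A minimal standalone sketch (not JIT code, just the bit trick the new expansion relies on): masking with 0x7FFFFFFF or 0x7FFFFFFFFFFFFFFF clears only the IEEE-754 sign bit, which is exactly what the old `v & ~(-0.0)` AND_NOT form computed, so a plain AND with the broadcast integer mask yields the same Abs result.

// Sketch only: scalar equivalent of the vector Abs expansion above.
#include <cstdint>
#include <cstring>
#include <cstdio>

static float AbsViaMask(float value)
{
    uint32_t bits;
    std::memcpy(&bits, &value, sizeof(bits));
    bits &= 0x7FFFFFFFu; // clear the sign bit; same constant broadcast for TYP_FLOAT
    std::memcpy(&value, &bits, sizeof(value));
    return value;
}

int main()
{
    std::printf("%f\n", AbsViaMask(-3.5f)); // prints 3.500000
}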
@@ -20750,12 +20757,6 @@ GenTree* Compiler::gtNewSimdBinOpNode(
                    }
                }
            }
-
-           if (op == GT_AND_NOT)
-           {
-               // GT_AND_NOT expects `op1 & ~op2`, but xarch does `~op1 & op2`
-               needsReverseOps = true;
-           }
            break;
        }
#endif // TARGET_XARCH
@@ -20786,11 +20787,34 @@ GenTree* Compiler::gtNewSimdBinOpNode(

    if (intrinsic != NI_Illegal)
    {
+       if (op == GT_AND_NOT)
+       {
+           assert(fgNodeThreading == NodeThreading::LIR);
+
+#if defined(TARGET_XARCH)
+           // GT_AND_NOT expects `op1 & ~op2`, but xarch does `~op1 & op2`
+           // We specially handle this here since we're only producing a
+           // native intrinsic node in LIR
+
+           std::swap(op1, op2);
+#endif // TARGET_XARCH
+       }
        return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize);
    }

    switch (op)
    {
+       case GT_AND_NOT:
+       {
+           // Prior to LIR, we want to explicitly decompose this operation so that downstream phases can
+           // appropriately optimize around the individual operations being performed, particularly ~op2,
+           // and produce overall better codegen.
+           assert(fgNodeThreading != NodeThreading::LIR);
+
+           op2 = gtNewSimdUnOpNode(GT_NOT, type, op2, simdBaseJitType, simdSize);
+           return gtNewSimdBinOpNode(GT_AND, type, op1, op2, simdBaseJitType, simdSize);
+       }
+
#if defined(TARGET_XARCH)
        case GT_RSZ:
        {
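The operand swap and the pre-LIR decomposition above encode the same identity; a hedged scalar sketch (illustrative only, not the JIT's code) of the two shapes: `op1 & ~op2` decomposed into an explicit NOT plus AND, versus the xarch andn-style form that computes `~a & b` and therefore needs its operands reversed.

// Sketch only: x & ~y written both ways.
#include <cstdint>
#include <cassert>

static uint64_t AndNotDecomposed(uint64_t x, uint64_t y)
{
    uint64_t notY = ~y; // explicit NOT node, visible to pre-LIR optimizations
    return x & notY;    // plain AND
}

static uint64_t AndNotHardwareForm(uint64_t x, uint64_t y)
{
    // andn-style semantics are ~first & second, so the operands are swapped.
    return ~y & x;
}

int main()
{
    assert(AndNotDecomposed(0xF0F0u, 0x00FFu) == AndNotHardwareForm(0xF0F0u, 0x00FFu));
    return 0;
}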
@@ -20955,9 +20979,6 @@ GenTree* Compiler::gtNewSimdBinOpNode(
                vecCon1->gtSimdVal.u64[i] = 0x00FF00FF00FF00FF;
            }

-           // Validate we can't use AVX512F_VL_TernaryLogic here
-           assert(!canUseEvexEncodingDebugOnly());
-
            // Vector256<short> maskedProduct = Avx2.And(widenedProduct, vecCon1).AsInt16()
            GenTree* maskedProduct = gtNewSimdBinOpNode(GT_AND, widenedType, widenedProduct, vecCon1,
                                                        widenedSimdBaseJitType, widenedSimdSize);
@@ -21922,9 +21943,6 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
                v = gtNewSimdHWIntrinsicNode(type, v, gtNewIconNode(SHUFFLE_ZZXX, TYP_INT), NI_SSE2_Shuffle,
                                             CORINFO_TYPE_INT, simdSize);

-               // Validate we can't use AVX512F_VL_TernaryLogic here
-               assert(!canUseEvexEncodingDebugOnly());
-
                op2 = gtNewSimdBinOpNode(GT_AND, type, u, v, simdBaseJitType, simdSize);
                return gtNewSimdBinOpNode(GT_OR, type, op1, op2, simdBaseJitType, simdSize);
            }
@@ -24315,9 +24333,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(

                GenTree* vecCon2 = gtCloneExpr(vecCon1);

-               // Validate we can't use AVX512F_VL_TernaryLogic here
-               assert(!canUseEvexEncodingDebugOnly());
-
                tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
                tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);
                tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_AVX2_PackUnsignedSaturate, CORINFO_TYPE_UBYTE,
@@ -24356,9 +24371,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(

                GenTree* vecCon2 = gtCloneExpr(vecCon1);

-               // Validate we can't use AVX512F_VL_TernaryLogic here
-               assert(!canUseEvexEncodingDebugOnly());
-
                tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
                tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);
                tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_AVX2_PackUnsignedSaturate, CORINFO_TYPE_USHORT,
@@ -24460,9 +24472,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(

                GenTree* vecCon2 = gtCloneExpr(vecCon1);

-               // Validate we can't use AVX512F_VL_TernaryLogic here
-               assert(!canUseEvexEncodingDebugOnly());
-
                tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
                tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);

@@ -24499,9 +24508,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(

                GenTree* vecCon2 = gtCloneExpr(vecCon1);

-               // Validate we can't use AVX512F_VL_TernaryLogic here
-               assert(!canUseEvexEncodingDebugOnly());
-
                tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
                tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);

@@ -28120,6 +28126,14 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp,
            assert(!isScalar);
            assert(op2->TypeIs(simdType));

+           if (comp->fgNodeThreading != NodeThreading::LIR)
+           {
+               // We don't want to support creating AND_NOT nodes prior to LIR
+               // as it can break important optimizations. We'll produce this
+               // in lowering instead.
+               break;
+           }
+
#if defined(TARGET_XARCH)
            if (simdSize == 64)
            {
@@ -29187,6 +29201,21 @@ bool GenTreeHWIntrinsic::ShouldConstantProp(GenTree* operand, GenTreeVecCon* vec
            return IsUserCall() && (operand == Op(2));
        }

+#if defined(TARGET_XARCH)
+       case NI_SSE_Xor:
+       case NI_SSE2_Xor:
+       case NI_AVX_Xor:
+       case NI_AVX2_Xor:
+       case NI_AVX512F_Xor:
+       case NI_AVX512DQ_Xor:
+       case NI_AVX10v1_V512_Xor:
+       {
+           // We recognize this as GT_NOT which can enable other optimizations
+           assert(GetOperandCount() == 2);
+           return vecCon->IsVectorAllBitsSet();
+       }
+#endif // TARGET_XARCH
+
        default:
        {
            break;
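The rationale for the new Xor cases is the identity `x ^ AllBitsSet == ~x`: once an all-bits-set constant is propagated into the XOR, later phases can treat the node as a NOT. A one-line scalar sketch of the identity (illustrative only):

#include <cstdint>
#include <cassert>

int main()
{
    uint32_t x = 0x12345678u;
    assert((x ^ 0xFFFFFFFFu) == ~x); // XOR with all-bits-set is bitwise NOT
    return 0;
}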
@@ -29936,7 +29965,8 @@ bool GenTreeLclVar::IsNeverNegative(Compiler* comp) const
unsigned GenTreeHWIntrinsic::GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree* op1, GenTree* op2, GenTree* op3)
{
#if defined(TARGET_XARCH)
-   assert(HWIntrinsicInfo::IsFmaIntrinsic(gtHWIntrinsicId) || HWIntrinsicInfo::IsPermuteVar2x(gtHWIntrinsicId));
+   assert(HWIntrinsicInfo::IsFmaIntrinsic(gtHWIntrinsicId) || HWIntrinsicInfo::IsPermuteVar2x(gtHWIntrinsicId) ||
+          HWIntrinsicInfo::IsTernaryLogic(gtHWIntrinsicId));
#elif defined(TARGET_ARM64)
    assert(HWIntrinsicInfo::IsFmaIntrinsic(gtHWIntrinsicId));
#endif
@@ -29980,85 +30010,6 @@ unsigned GenTreeHWIntrinsic::GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree

    return 0;
}
-
-//------------------------------------------------------------------------
-// GetTernaryControlByte: calculate the value of the control byte for ternary node
-// with given logic nodes on the input.
-//
-// Return value: the value of the ternary control byte.
-uint8_t GenTreeHWIntrinsic::GetTernaryControlByte(GenTreeHWIntrinsic* second) const
-{
-    // we assume we have a structure like:
-    /*
-               /- A
-            +- B
-        t1 = binary logical op1
-
-            /- C
-         +- t1
-        t2 = binary logical op2
-    */
-
-    // To calculate the control byte value:
-    // The way the constants work is we have three keys:
-    //   * A: 0xF0
-    //   * B: 0xCC
-    //   * C: 0xAA
-    //
-    // To compute the correct control byte, you simply perform the corresponding operation on these keys. So, if you
-    // wanted to do (A & B) ^ C, you would compute (0xF0 & 0xCC) ^ 0xAA or 0x6A.
-    assert(second->Op(1) == this || second->Op(2) == this);
-    const uint8_t A = 0xF0;
-    const uint8_t B = 0xCC;
-    const uint8_t C = 0xAA;
-
-    bool isScalar = false;
-
-    genTreeOps firstOper = GetOperForHWIntrinsicId(&isScalar);
-    assert(!isScalar);
-
-    genTreeOps secondOper = second->GetOperForHWIntrinsicId(&isScalar);
-    assert(!isScalar);
-
-    uint8_t AB  = 0;
-    uint8_t ABC = 0;
-
-    if (firstOper == GT_AND)
-    {
-        AB = A & B;
-    }
-    else if (firstOper == GT_OR)
-    {
-        AB = A | B;
-    }
-    else if (firstOper == GT_XOR)
-    {
-        AB = A ^ B;
-    }
-    else
-    {
-        unreached();
-    }
-
-    if (secondOper == GT_AND)
-    {
-        ABC = AB & C;
-    }
-    else if (secondOper == GT_OR)
-    {
-        ABC = AB | C;
-    }
-    else if (secondOper == GT_XOR)
-    {
-        ABC = AB ^ C;
-    }
-    else
-    {
-        unreached();
-    }
-
-    return ABC;
-}
#endif // TARGET_XARCH && FEATURE_HW_INTRINSICS

unsigned GenTreeLclFld::GetSize() const
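Although GetTernaryControlByte is deleted here, the control-byte scheme its comment described is still how vpternlog-style immediates are formed: apply the desired boolean expression to the fixed keys A = 0xF0, B = 0xCC, C = 0xAA. A small standalone sketch of that calculation (illustrative, not JIT code):

#include <cstdint>
#include <cstdio>

int main()
{
    // Each bit of the byte corresponds to one combination of the three inputs.
    const uint8_t A = 0xF0;
    const uint8_t B = 0xCC;
    const uint8_t C = 0xAA;

    // Evaluate the target expression over the keys to get the immediate,
    // e.g. (A & B) ^ C == 0x6A, matching the example in the removed comment.
    const uint8_t imm = static_cast<uint8_t>((A & B) ^ C);
    std::printf("0x%02X\n", imm); // prints 0x6A
}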
@@ -30454,13 +30405,8 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
    bool isScalar = false;
    genTreeOps oper = tree->GetOperForHWIntrinsicId(&isScalar);

-#if defined(TARGET_XARCH)
-   if (oper == GT_AND_NOT)
-   {
-       // xarch does: ~op1 & op2, we need op1 & ~op2
-       std::swap(op1, op2);
-   }
-#endif // TARGET_XARCH
+   // We shouldn't find AND_NOT nodes since they should only be produced in lowering
+   assert(oper != GT_AND_NOT);

    GenTree* cnsNode = nullptr;
    GenTree* otherNode = nullptr;
@@ -30973,31 +30919,6 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
                break;
            }

-           case GT_AND_NOT:
-           {
-               // Handle `x & ~0 == x` and `0 & ~x == 0`
-               if (cnsNode->IsVectorZero())
-               {
-                   if (cnsNode == op1)
-                   {
-                       resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT);
-                       break;
-                   }
-                   else
-                   {
-                       resultNode = otherNode;
-                   }
-                   break;
-               }
-
-               // Handle `x & ~AllBitsSet == 0`
-               if (cnsNode->IsVectorAllBitsSet() && (cnsNode == op2))
-               {
-                   resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT);
-               }
-               break;
-           }
-
            case GT_DIV:
            {
                if (varTypeIsFloating(simdBaseType))
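The deleted case implemented the AND_NOT folding identities its comments name; with AND_NOT no longer reaching this folder before lowering, those rules become dead code. For reference, a scalar sketch of the identities themselves (illustrative only):

#include <cstdint>
#include <cassert>

int main()
{
    uint32_t x = 0xDEADBEEFu;
    assert((x & ~0u) == x);           // x & ~0 == x
    assert((0u & ~x) == 0u);          // 0 & ~x == 0
    assert((x & ~0xFFFFFFFFu) == 0u); // x & ~AllBitsSet == 0
    return 0;
}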
@@ -31388,12 +31309,12 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
    {
        switch (ni)
        {
-           case NI_Vector128_ConditionalSelect:
#if defined(TARGET_XARCH)
+           case NI_Vector128_ConditionalSelect:
            case NI_Vector256_ConditionalSelect:
            case NI_Vector512_ConditionalSelect:
#elif defined(TARGET_ARM64)
-           case NI_Vector64_ConditionalSelect:
+           case NI_AdvSimd_BitwiseSelect:
            case NI_Sve_ConditionalSelect:
#endif
            {