Skip to content

Commit 33d95dd

Browse files
committed
Optimize comparisons against AllBitsSet on pre-AVX512 hardware
1 parent cb75246 commit 33d95dd

File tree

1 file changed

+97
-88
lines changed

1 file changed

+97
-88
lines changed

src/coreclr/jit/lowerxarch.cpp

+97 -88
Original file line number | Diff line number | Diff line change
@@ -2488,7 +2488,7 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
24882488
CorInfoType maskBaseJitType = simdBaseJitType;
24892489
var_types maskBaseType = simdBaseType;
24902490

2491-
if (op1Msk->OperIsHWIntrinsic(NI_EVEX_ConvertMaskToVector))
2491+
if (op1Msk->OperIsConvertMaskToVector())
24922492
{
24932493
GenTreeHWIntrinsic* cvtMaskToVector = op1Msk->AsHWIntrinsic();
24942494

@@ -2499,122 +2499,131 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
24992499
maskBaseType = cvtMaskToVector->GetSimdBaseType();
25002500
}
25012501

2502-
if (!varTypeIsFloating(simdBaseType) && (simdSize != 64) && op2->IsVectorZero() &&
2503-
comp->compOpportunisticallyDependsOn(InstructionSet_SSE41) && !varTypeIsMask(op1Msk))
2502+
if (!varTypeIsFloating(simdBaseType) && (simdSize != 64) && !varTypeIsMask(op1Msk))
25042503
{
2505-
// On SSE4.1 or higher we can optimize comparisons against zero to
2506-
// just use PTEST. We can't support it for floating-point, however,
2507-
// as it has both +0.0 and -0.0 where +0.0 == -0.0
2504+
bool isOp2VectorZero = op2->IsVectorZero();
25082505

2509-
bool skipReplaceOperands = false;
2510-
2511-
if (op1->OperIsHWIntrinsic())
2506+
if ((isOp2VectorZero || op2->IsVectorAllBitsSet()) &&
2507+
comp->compOpportunisticallyDependsOn(InstructionSet_SSE41))
25122508
{
2513-
GenTreeHWIntrinsic* op1Intrinsic = op1->AsHWIntrinsic();
2514-
NamedIntrinsic op1IntrinsicId = op1Intrinsic->GetHWIntrinsicId();
2509+
// On SSE4.1 or higher we can optimize comparisons against Zero or AllBitsSet to
2510+
// just use PTEST. We can't support it for floating-point, however, as it has
2511+
// both +0.0 and -0.0 where +0.0 == -0.0
25152512

2516-
GenTree* nestedOp1 = nullptr;
2517-
GenTree* nestedOp2 = nullptr;
2518-
bool isEmbeddedBroadcast = false;
2513+
bool skipReplaceOperands = false;
25192514

2520-
if (op1Intrinsic->GetOperandCount() == 2)
2515+
if (!isOp2VectorZero)
25212516
{
2522-
nestedOp1 = op1Intrinsic->Op(1);
2523-
nestedOp2 = op1Intrinsic->Op(2);
2517+
// We can optimize to TestC(op1, allbitsset)
2518+
//
2519+
// This works out because TestC sets CF if (~x & y) == 0, so:
2520+
// ~00 & 11 = 11; 11 & 11 = 11; NC
2521+
// ~01 & 11 = 10; 10 & 11 = 10; NC
2522+
// ~10 & 11 = 01; 01 & 11 = 01; NC
2523+
// ~11 & 11 = 00; 00 & 11 = 00; C
25242524

2525-
assert(!nestedOp1->isContained());
2526-
isEmbeddedBroadcast = nestedOp2->isContained() && nestedOp2->OperIsHWIntrinsic();
2527-
}
2525+
assert(op2->IsVectorAllBitsSet());
2526+
cmpCnd = (cmpOp == GT_EQ) ? GenCondition::C : GenCondition::NC;
25282527

2529-
switch (op1IntrinsicId)
2528+
skipReplaceOperands = true;
2529+
}
2530+
else if (op1->OperIsHWIntrinsic())
25302531
{
2531-
case NI_SSE_And:
2532-
case NI_SSE2_And:
2533-
case NI_AVX_And:
2534-
case NI_AVX2_And:
2532+
assert(op2->IsVectorZero());
2533+
2534+
GenTreeHWIntrinsic* op1Intrinsic = op1->AsHWIntrinsic();
2535+
2536+
if (op1Intrinsic->GetOperandCount() == 2)
25352537
{
2536-
// We can optimize to TestZ(op1.op1, op1.op2)
2538+
GenTree* nestedOp1 = op1Intrinsic->Op(1);
2539+
GenTree* nestedOp2 = op1Intrinsic->Op(2);
2540+
2541+
assert(!nestedOp1->isContained());
2542+
bool isEmbeddedBroadcast = nestedOp2->isContained() && nestedOp2->OperIsHWIntrinsic();
25372543

2538-
if (isEmbeddedBroadcast)
2544+
bool isScalar = false;
2545+
genTreeOps oper = op1Intrinsic->GetOperForHWIntrinsicId(&isScalar);
2546+
2547+
switch (oper)
25392548
{
2540-
// PTEST doesn't support embedded broadcast
2541-
break;
2542-
}
2549+
case GT_AND:
2550+
{
2551+
// We can optimize to TestZ(op1.op1, op1.op2)
25432552

2544-
node->Op(1) = nestedOp1;
2545-
node->Op(2) = nestedOp2;
2553+
if (isEmbeddedBroadcast)
2554+
{
2555+
// PTEST doesn't support embedded broadcast
2556+
break;
2557+
}
25462558

2547-
BlockRange().Remove(op1);
2548-
BlockRange().Remove(op2);
2559+
node->Op(1) = nestedOp1;
2560+
node->Op(2) = nestedOp2;
25492561

2550-
skipReplaceOperands = true;
2551-
break;
2552-
}
2562+
BlockRange().Remove(op1);
2563+
BlockRange().Remove(op2);
25532564

2554-
case NI_SSE_AndNot:
2555-
case NI_SSE2_AndNot:
2556-
case NI_AVX_AndNot:
2557-
case NI_AVX2_AndNot:
2558-
{
2559-
// We can optimize to TestC(op1.op1, op1.op2)
2565+
skipReplaceOperands = true;
2566+
break;
2567+
}
25602568

2561-
if (isEmbeddedBroadcast)
2562-
{
2563-
// PTEST doesn't support embedded broadcast
2564-
break;
2565-
}
2569+
case GT_AND_NOT:
2570+
{
2571+
// We can optimize to TestC(op1.op1, op1.op2)
2572+
2573+
if (isEmbeddedBroadcast)
2574+
{
2575+
// PTEST doesn't support embedded broadcast
2576+
break;
2577+
}
25662578

2567-
cmpCnd = (cmpOp == GT_EQ) ? GenCondition::C : GenCondition::NC;
2579+
cmpCnd = (cmpOp == GT_EQ) ? GenCondition::C : GenCondition::NC;
25682580

2569-
node->Op(1) = nestedOp1;
2570-
node->Op(2) = nestedOp2;
2581+
node->Op(1) = nestedOp1;
2582+
node->Op(2) = nestedOp2;
25712583

2572-
BlockRange().Remove(op1);
2573-
BlockRange().Remove(op2);
2584+
BlockRange().Remove(op1);
2585+
BlockRange().Remove(op2);
25742586

2575-
skipReplaceOperands = true;
2576-
break;
2577-
}
2587+
skipReplaceOperands = true;
2588+
break;
2589+
}
25782590

2579-
default:
2580-
{
2581-
break;
2591+
default:
2592+
{
2593+
break;
2594+
}
2595+
}
25822596
}
25832597
}
2584-
}
2585-
2586-
if (!skipReplaceOperands)
2587-
{
2588-
// Default handler, emit a TestZ(op1, op1)
25892598

2590-
node->Op(1) = op1;
2591-
BlockRange().Remove(op2);
2599+
if (!skipReplaceOperands)
2600+
{
2601+
// Default handler, emit a TestZ(op1, op1)
2602+
assert(op2->IsVectorZero());
25922603

2593-
LIR::Use op1Use(BlockRange(), &node->Op(1), node);
2594-
ReplaceWithLclVar(op1Use);
2595-
op1 = node->Op(1);
2604+
node->Op(1) = op1;
2605+
BlockRange().Remove(op2);
25962606

2597-
op2 = comp->gtClone(op1);
2598-
BlockRange().InsertAfter(op1, op2);
2599-
node->Op(2) = op2;
2600-
}
2607+
LIR::Use op1Use(BlockRange(), &node->Op(1), node);
2608+
ReplaceWithLclVar(op1Use);
2609+
op1 = node->Op(1);
26012610

2602-
if (simdSize == 32)
2603-
{
2604-
// TODO-Review: LowerHWIntrinsicCC resets the id again, so why is this needed?
2605-
node->ChangeHWIntrinsicId(NI_AVX_TestZ);
2606-
LowerHWIntrinsicCC(node, NI_AVX_PTEST, cmpCnd);
2607-
}
2608-
else
2609-
{
2610-
assert(simdSize == 16);
2611+
op2 = comp->gtClone(op1);
2612+
BlockRange().InsertAfter(op1, op2);
2613+
node->Op(2) = op2;
2614+
}
26112615

2612-
// TODO-Review: LowerHWIntrinsicCC resets the id again, so why is this needed?
2613-
node->ChangeHWIntrinsicId(NI_SSE41_TestZ);
2614-
LowerHWIntrinsicCC(node, NI_SSE41_PTEST, cmpCnd);
2616+
if (simdSize == 32)
2617+
{
2618+
LowerHWIntrinsicCC(node, NI_AVX_PTEST, cmpCnd);
2619+
}
2620+
else
2621+
{
2622+
assert(simdSize == 16);
2623+
LowerHWIntrinsicCC(node, NI_SSE41_PTEST, cmpCnd);
2624+
}
2625+
return LowerNode(node);
26152626
}
2616-
2617-
return LowerNode(node);
26182627
}
26192628

26202629
// TODO-XARCH-AVX512: We should handle TYP_SIMD12 here under the EVEX path, but doing
@@ -3490,7 +3499,7 @@ GenTree* Lowering::LowerHWIntrinsicTernaryLogic(GenTreeHWIntrinsic* node)
34903499
}
34913500
}
34923501

3493-
if (condition->OperIsHWIntrinsic(NI_EVEX_ConvertMaskToVector))
3502+
if (condition->OperIsConvertMaskToVector())
34943503
{
34953504
GenTree* tmp = condition->AsHWIntrinsic()->Op(1);
34963505
BlockRange().Remove(condition);

0 commit comments

Comments
 (0)