@@ -2488,7 +2488,7 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
     CorInfoType maskBaseJitType = simdBaseJitType;
     var_types   maskBaseType    = simdBaseType;
 
-    if (op1Msk->OperIsHWIntrinsic(NI_EVEX_ConvertMaskToVector))
+    if (op1Msk->OperIsConvertMaskToVector())
     {
         GenTreeHWIntrinsic* cvtMaskToVector = op1Msk->AsHWIntrinsic();
 
@@ -2499,122 +2499,131 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
         maskBaseType    = cvtMaskToVector->GetSimdBaseType();
     }
 
-    if (!varTypeIsFloating(simdBaseType) && (simdSize != 64) && op2->IsVectorZero() &&
-        comp->compOpportunisticallyDependsOn(InstructionSet_SSE41) && !varTypeIsMask(op1Msk))
+    if (!varTypeIsFloating(simdBaseType) && (simdSize != 64) && !varTypeIsMask(op1Msk))
     {
-        // On SSE4.1 or higher we can optimize comparisons against zero to
-        // just use PTEST. We can't support it for floating-point, however,
-        // as it has both +0.0 and -0.0 where +0.0 == -0.0
+        bool isOp2VectorZero = op2->IsVectorZero();
 
-        bool skipReplaceOperands = false;
-
-        if (op1->OperIsHWIntrinsic())
+        if ((isOp2VectorZero || op2->IsVectorAllBitsSet()) &&
+            comp->compOpportunisticallyDependsOn(InstructionSet_SSE41))
         {
-            GenTreeHWIntrinsic* op1Intrinsic   = op1->AsHWIntrinsic();
-            NamedIntrinsic      op1IntrinsicId = op1Intrinsic->GetHWIntrinsicId();
+            // On SSE4.1 or higher we can optimize comparisons against Zero or AllBitsSet to
+            // just use PTEST. We can't support it for floating-point, however, as it has
+            // both +0.0 and -0.0 where +0.0 == -0.0
 
-            GenTree* nestedOp1           = nullptr;
-            GenTree* nestedOp2           = nullptr;
-            bool     isEmbeddedBroadcast = false;
+            bool skipReplaceOperands = false;
 
-            if (op1Intrinsic->GetOperandCount() == 2)
+            if (!isOp2VectorZero)
             {
-                nestedOp1 = op1Intrinsic->Op(1);
-                nestedOp2 = op1Intrinsic->Op(2);
+                // We can optimize to TestC(op1, allbitsset)
+                //
+                // This works out because TestC sets CF if (~x & y) == 0, so:
+                // ~00 & 11 = 11; 11 & 11 = 11; NC
+                // ~01 & 11 = 01; 10 & 11 = 10; NC
+                // ~10 & 11 = 10; 01 & 11 = 01; NC
+                // ~11 & 11 = 11; 00 & 11 = 00; C
 
-                assert(!nestedOp1->isContained());
-                isEmbeddedBroadcast = nestedOp2->isContained() && nestedOp2->OperIsHWIntrinsic();
-            }
+                assert(op2->IsVectorAllBitsSet());
+                cmpCnd = (cmpOp == GT_EQ) ? GenCondition::C : GenCondition::NC;
 
-            switch (op1IntrinsicId)
+                skipReplaceOperands = true;
+            }
+            else if (op1->OperIsHWIntrinsic())
             {
-                case NI_SSE_And:
-                case NI_SSE2_And:
-                case NI_AVX_And:
-                case NI_AVX2_And:
+                assert(op2->IsVectorZero());
+
+                GenTreeHWIntrinsic* op1Intrinsic = op1->AsHWIntrinsic();
+
+                if (op1Intrinsic->GetOperandCount() == 2)
                 {
-                    // We can optimize to TestZ(op1.op1, op1.op2)
+                    GenTree* nestedOp1 = op1Intrinsic->Op(1);
+                    GenTree* nestedOp2 = op1Intrinsic->Op(2);
+
+                    assert(!nestedOp1->isContained());
+                    bool isEmbeddedBroadcast = nestedOp2->isContained() && nestedOp2->OperIsHWIntrinsic();
 
-                    if (isEmbeddedBroadcast)
+                    bool       isScalar = false;
+                    genTreeOps oper     = op1Intrinsic->GetOperForHWIntrinsicId(&isScalar);
+
+                    switch (oper)
                     {
-                        // PTEST doesn't support embedded broadcast
-                        break;
-                    }
+                        case GT_AND:
+                        {
+                            // We can optimize to TestZ(op1.op1, op1.op2)
 
-                    node->Op(1) = nestedOp1;
-                    node->Op(2) = nestedOp2;
+                            if (isEmbeddedBroadcast)
+                            {
+                                // PTEST doesn't support embedded broadcast
+                                break;
+                            }
 
-                    BlockRange().Remove(op1);
-                    BlockRange().Remove(op2);
+                            node->Op(1) = nestedOp1;
+                            node->Op(2) = nestedOp2;
 
-                    skipReplaceOperands = true;
-                    break;
-                }
+                            BlockRange().Remove(op1);
+                            BlockRange().Remove(op2);
 
-                case NI_SSE_AndNot:
-                case NI_SSE2_AndNot:
-                case NI_AVX_AndNot:
-                case NI_AVX2_AndNot:
-                {
-                    // We can optimize to TestC(op1.op1, op1.op2)
+                            skipReplaceOperands = true;
+                            break;
+                        }
 
-                    if (isEmbeddedBroadcast)
-                    {
-                        // PTEST doesn't support embedded broadcast
-                        break;
-                    }
+                        case GT_AND_NOT:
+                        {
+                            // We can optimize to TestC(op1.op1, op1.op2)
+
+                            if (isEmbeddedBroadcast)
+                            {
+                                // PTEST doesn't support embedded broadcast
+                                break;
+                            }
 
-                    cmpCnd = (cmpOp == GT_EQ) ? GenCondition::C : GenCondition::NC;
+                            cmpCnd = (cmpOp == GT_EQ) ? GenCondition::C : GenCondition::NC;
 
-                    node->Op(1) = nestedOp1;
-                    node->Op(2) = nestedOp2;
+                            node->Op(1) = nestedOp1;
+                            node->Op(2) = nestedOp2;
 
-                    BlockRange().Remove(op1);
-                    BlockRange().Remove(op2);
+                            BlockRange().Remove(op1);
+                            BlockRange().Remove(op2);
 
-                    skipReplaceOperands = true;
-                    break;
-                }
+                            skipReplaceOperands = true;
+                            break;
+                        }
 
-                default:
-                {
-                    break;
+                        default:
+                        {
+                            break;
+                        }
+                    }
                 }
             }
-        }
-
-        if (!skipReplaceOperands)
-        {
-            // Default handler, emit a TestZ(op1, op1)
 
-            node->Op(1) = op1;
-            BlockRange().Remove(op2);
+            if (!skipReplaceOperands)
+            {
+                // Default handler, emit a TestZ(op1, op1)
+                assert(op2->IsVectorZero());
 
-            LIR::Use op1Use(BlockRange(), &node->Op(1), node);
-            ReplaceWithLclVar(op1Use);
-            op1 = node->Op(1);
+                node->Op(1) = op1;
+                BlockRange().Remove(op2);
 
-            op2 = comp->gtClone(op1);
-            BlockRange().InsertAfter(op1, op2);
-            node->Op(2) = op2;
-        }
+                LIR::Use op1Use(BlockRange(), &node->Op(1), node);
+                ReplaceWithLclVar(op1Use);
+                op1 = node->Op(1);
 
-        if (simdSize == 32)
-        {
-            // TODO-Review: LowerHWIntrinsicCC resets the id again, so why is this needed?
-            node->ChangeHWIntrinsicId(NI_AVX_TestZ);
-            LowerHWIntrinsicCC(node, NI_AVX_PTEST, cmpCnd);
-        }
-        else
-        {
-            assert(simdSize == 16);
+                op2 = comp->gtClone(op1);
+                BlockRange().InsertAfter(op1, op2);
+                node->Op(2) = op2;
+            }
 
-            // TODO-Review: LowerHWIntrinsicCC resets the id again, so why is this needed?
-            node->ChangeHWIntrinsicId(NI_SSE41_TestZ);
-            LowerHWIntrinsicCC(node, NI_SSE41_PTEST, cmpCnd);
+            if (simdSize == 32)
+            {
+                LowerHWIntrinsicCC(node, NI_AVX_PTEST, cmpCnd);
+            }
+            else
+            {
+                assert(simdSize == 16);
+                LowerHWIntrinsicCC(node, NI_SSE41_PTEST, cmpCnd);
+            }
+            return LowerNode(node);
         }
-
-        return LowerNode(node);
     }
 
     // TODO-XARCH-AVX512: We should handle TYP_SIMD12 here under the EVEX path, but doing
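Note on the PTEST-based lowering above (illustration only, not part of the diff): PTEST sets ZF iff `(x & y) == 0`, which backs TestZ, and sets CF iff `(~x & y) == 0`, which backs TestC. A minimal standalone C++ sketch of those flag semantics over one 64-bit lane — `PTestFlags` and `PTest` are hypothetical names for this sketch, not JIT or hardware APIs:

```cpp
#include <cassert>
#include <cstdint>

struct PTestFlags
{
    bool zf; // ZF = 1 iff (x & y) == 0   -> TestZ
    bool cf; // CF = 1 iff (~x & y) == 0  -> TestC
};

// Scalar model of the flag outputs of PTEST.
PTestFlags PTest(uint64_t x, uint64_t y)
{
    return {(x & y) == 0, (~x & y) == 0};
}

int main()
{
    // v == Zero lowers to TestZ(v, v): ZF is set iff every bit of v is clear.
    assert(PTest(0, 0).zf && !PTest(0b10, 0b10).zf);

    // v == AllBitsSet lowers to TestC(v, allbitsset): CF is set iff every bit
    // of v is set, matching the truth table in the comment above, so GT_EQ
    // maps to GenCondition::C and GT_NE to GenCondition::NC.
    assert(PTest(~0ull, ~0ull).cf && !PTest(0b01, ~0ull).cf);

    // (x & y) == Zero folds to TestZ(x, y), and AndNot(x, y) == Zero folds to
    // TestC(x, y), skipping the intermediate bitwise node entirely.
    assert(PTest(0b1100, 0b0011).zf && PTest(~0ull, 0b1010).cf);
    return 0;
}
```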
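The `!varTypeIsFloating(simdBaseType)` guard stays because PTEST is a bitwise test while IEEE 754 equality is not: +0.0 == -0.0 even though the two encodings differ (NaN is the dual case, equal bits but unequal values). A quick standalone sketch of the +0.0/-0.0 pitfall the comment cites:

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

int main()
{
    float pz = +0.0f;
    float nz = -0.0f;

    uint32_t pzBits;
    uint32_t nzBits;
    std::memcpy(&pzBits, &pz, sizeof(pz));
    std::memcpy(&nzBits, &nz, sizeof(nz));

    assert(pz == nz);         // IEEE 754: +0.0 == -0.0 ...
    assert(pzBits != nzBits); // ... but the bits differ (0x00000000 vs 0x80000000),
                              // so a bitwise test cannot stand in for the compare.
    return 0;
}
```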
@@ -3490,7 +3499,7 @@ GenTree* Lowering::LowerHWIntrinsicTernaryLogic(GenTreeHWIntrinsic* node)
         }
     }
 
-    if (condition->OperIsHWIntrinsic(NI_EVEX_ConvertMaskToVector))
+    if (condition->OperIsConvertMaskToVector())
     {
         GenTree* tmp = condition->AsHWIntrinsic()->Op(1);
         BlockRange().Remove(condition);