Optimize "X == Y" to "(X ^ Y) == 0 for SIMD #93818

Closed
wants to merge 3 commits
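For context, here is the identity this PR exploits, sketched with SSE4.1 intrinsics rather than JIT IR (the helper names equals_cmpeq and equals_xor_ptest are illustrative, not dotnet/runtime APIs): rewriting a vector equality "X == Y" into "(X ^ Y) == 0" turns it into a comparison against zero, which the backend can answer with a single PTEST instead of a per-byte compare plus a mask extraction.

#include <immintrin.h>

// Generic shape: compare byte lanes, then check that every lane matched.
bool equals_cmpeq(__m128i x, __m128i y)
{
    __m128i eq = _mm_cmpeq_epi8(x, y);       // 0xFF in each equal byte lane
    return _mm_movemask_epi8(eq) == 0xFFFF;  // all 16 lanes equal
}

// Shape after the rewrite: "x == y" becomes "(x ^ y) == 0", which PTEST
// answers directly (ZF is set when the AND of its operands is all zero).
bool equals_xor_ptest(__m128i x, __m128i y)
{
    __m128i diff = _mm_xor_si128(x, y);      // all zero iff x == y
    return _mm_testz_si128(diff, diff) != 0; // PTEST: (diff & diff) == 0 ?
}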
9 changes: 6 additions & 3 deletions src/coreclr/jit/importervectorization.cpp
@@ -199,9 +199,12 @@ GenTree* Compiler::impExpandHalfConstEqualsSIMD(

     // Optimization: use a single load when byteLen equals simdSize.
     // For code simplicity we always create nodes for two vectors case.
-    const bool useSingleVector = simdSize == byteLen;
-    return gtNewSimdCmpOpAllNode(GT_EQ, TYP_UBYTE, useSingleVector ? xor1 : orr, gtNewZeroConNode(simdType), baseType,
-                                 simdSize);
+    if (simdSize == byteLen)
+    {
+        return gtNewSimdCmpOpAllNode(GT_EQ, TYP_UBYTE, vec1, cnsVec1, baseType, simdSize);
+    }
+
+    return gtNewSimdCmpOpAllNode(GT_EQ, TYP_UBYTE, orr, gtNewZeroConNode(simdType), baseType, simdSize);

     // Codegen example for byteLen=40 and OrdinalIgnoreCase mode with AVX:
     //
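Roughly what the importer change above does, sketched with SSE2/SSE4.1 intrinsics under the assumption of a 16-byte SIMD width (equals16 and equals32 are hypothetical helpers, not runtime code): when the constant data fits in a single vector, the importer now emits a direct vector equality and lets lowering fold it into xor + ptest; only the two-vector case keeps the xor/or-accumulate-then-test-zero shape.

#include <immintrin.h>
#include <cstdint>

// Single-vector case (byteLen == simdSize): one load, one direct equality.
bool equals16(const uint8_t* data, __m128i cns)
{
    __m128i vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
    return _mm_movemask_epi8(_mm_cmpeq_epi8(vec, cns)) == 0xFFFF;
}

// Two-vector case: xor each half with its constant, OR the differences,
// then test the accumulated difference for all-zero with PTEST.
bool equals32(const uint8_t* data, __m128i cns1, __m128i cns2)
{
    __m128i xor1 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(data)), cns1);
    __m128i xor2 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 16)), cns2);
    __m128i orr  = _mm_or_si128(xor1, xor2);
    return _mm_testz_si128(orr, orr) != 0;
}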
184 changes: 101 additions & 83 deletions src/coreclr/jit/lowerxarch.cpp
@@ -1780,123 +1780,141 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp
     GenTree* op2 = node->Op(2);
     GenCondition cmpCnd = (cmpOp == GT_EQ) ? GenCondition::EQ : GenCondition::NE;

-    if (!varTypeIsFloating(simdBaseType) && (simdSize != 64) && op2->IsVectorZero() &&
+    if (!varTypeIsFloating(simdBaseType) && (simdSize != 64) &&
         comp->compOpportunisticallyDependsOn(InstructionSet_SSE41) &&
         !op1->OperIsHWIntrinsic(NI_AVX512F_ConvertMaskToVector))
     {
-        // On SSE4.1 or higher we can optimize comparisons against zero to
-        // just use PTEST. We can't support it for floating-point, however,
-        // as it has both +0.0 and -0.0 where +0.0 == -0.0
+        if (!op2->IsVectorZero() && comp->IsBaselineVector512IsaSupportedOpportunistically() &&
+            comp->opts.OptimizationEnabled())
+        {
+            // Optimize "X == Y" to "(X ^ Y) == 0" when opts are enabled.
+            // Disable for pre-AVX512 hardware as we hit regressions: https://github.com/dotnet/runtime/pull/67902
+            GenTree* zeroVec = comp->gtNewZeroConNode(simdType);
+            GenTree* xorVec = comp->gtNewSimdBinOpNode(GT_XOR, simdType, op1, op2, simdBaseJitType, simdSize);
+            node->Op(1) = xorVec;
+            node->Op(2) = zeroVec;
+            BlockRange().InsertBefore(node, xorVec, zeroVec);

-        bool skipReplaceOperands = false;
+            // We'll re-visit the comparison node again
+            return xorVec;
+        }

-        if (op1->OperIsHWIntrinsic())
+        if (op2->IsVectorZero())
         {
-            GenTreeHWIntrinsic* op1Intrinsic = op1->AsHWIntrinsic();
-            NamedIntrinsic op1IntrinsicId = op1Intrinsic->GetHWIntrinsicId();
+            // On SSE4.1 or higher we can optimize comparisons against zero to
+            // just use PTEST. We can't support it for floating-point, however,
+            // as it has both +0.0 and -0.0 where +0.0 == -0.0

-            GenTree* nestedOp1 = nullptr;
-            GenTree* nestedOp2 = nullptr;
-            bool isEmbeddedBroadcast = false;
+            bool skipReplaceOperands = false;

-            if (op1Intrinsic->GetOperandCount() == 2)
+            if (op1->OperIsHWIntrinsic())
             {
-                nestedOp1 = op1Intrinsic->Op(1);
-                nestedOp2 = op1Intrinsic->Op(2);
+                GenTreeHWIntrinsic* op1Intrinsic = op1->AsHWIntrinsic();
+                NamedIntrinsic op1IntrinsicId = op1Intrinsic->GetHWIntrinsicId();

-                assert(!nestedOp1->isContained());
-                isEmbeddedBroadcast = nestedOp2->isContained() && nestedOp2->OperIsHWIntrinsic();
-            }
+                GenTree* nestedOp1 = nullptr;
+                GenTree* nestedOp2 = nullptr;
+                bool isEmbeddedBroadcast = false;

-            switch (op1IntrinsicId)
-            {
-                case NI_SSE_And:
-                case NI_SSE2_And:
-                case NI_AVX_And:
-                case NI_AVX2_And:
+                if (op1Intrinsic->GetOperandCount() == 2)
                 {
-                    // We can optimize to TestZ(op1.op1, op1.op2)
+                    nestedOp1 = op1Intrinsic->Op(1);
+                    nestedOp2 = op1Intrinsic->Op(2);

-                    if (isEmbeddedBroadcast)
-                    {
-                        // PTEST doesn't support embedded broadcast
-                        break;
-                    }
+                    assert(!nestedOp1->isContained());
+                    isEmbeddedBroadcast = nestedOp2->isContained() && nestedOp2->OperIsHWIntrinsic();
+                }

-                    node->Op(1) = nestedOp1;
-                    node->Op(2) = nestedOp2;
+                switch (op1IntrinsicId)
+                {
+                    case NI_SSE_And:
+                    case NI_SSE2_And:
+                    case NI_AVX_And:
+                    case NI_AVX2_And:
+                    {
+                        // We can optimize to TestZ(op1.op1, op1.op2)

-                    BlockRange().Remove(op1);
-                    BlockRange().Remove(op2);
+                        if (isEmbeddedBroadcast)
+                        {
+                            // PTEST doesn't support embedded broadcast
+                            break;
+                        }

-                    skipReplaceOperands = true;
-                    break;
-                }
+                        node->Op(1) = nestedOp1;
+                        node->Op(2) = nestedOp2;

-                case NI_SSE_AndNot:
-                case NI_SSE2_AndNot:
-                case NI_AVX_AndNot:
-                case NI_AVX2_AndNot:
-                {
-                    // We can optimize to TestC(op1.op1, op1.op2)
+                        BlockRange().Remove(op1);
+                        BlockRange().Remove(op2);

-                    if (isEmbeddedBroadcast)
-                    {
-                        // PTEST doesn't support embedded broadcast
+                        skipReplaceOperands = true;
                         break;
                     }

-                    cmpCnd = (cmpOp == GT_EQ) ? GenCondition::C : GenCondition::NC;
+                    case NI_SSE_AndNot:
+                    case NI_SSE2_AndNot:
+                    case NI_AVX_AndNot:
+                    case NI_AVX2_AndNot:
+                    {
+                        // We can optimize to TestC(op1.op1, op1.op2)

-                    node->Op(1) = nestedOp1;
-                    node->Op(2) = nestedOp2;
+                        if (isEmbeddedBroadcast)
+                        {
+                            // PTEST doesn't support embedded broadcast
+                            break;
+                        }

-                    BlockRange().Remove(op1);
-                    BlockRange().Remove(op2);
+                        cmpCnd = (cmpOp == GT_EQ) ? GenCondition::C : GenCondition::NC;

-                    skipReplaceOperands = true;
-                    break;
-                }
+                        node->Op(1) = nestedOp1;
+                        node->Op(2) = nestedOp2;

-                default:
-                {
-                    break;
+                        BlockRange().Remove(op1);
+                        BlockRange().Remove(op2);
+
+                        skipReplaceOperands = true;
+                        break;
+                    }
+
+                    default:
+                    {
+                        break;
+                    }
                 }
             }
-        }

-        if (!skipReplaceOperands)
-        {
-            // Default handler, emit a TestZ(op1, op1)
+            if (!skipReplaceOperands)
+            {
+                // Default handler, emit a TestZ(op1, op1)

-            node->Op(1) = op1;
-            BlockRange().Remove(op2);
+                node->Op(1) = op1;
+                BlockRange().Remove(op2);

-            LIR::Use op1Use(BlockRange(), &node->Op(1), node);
-            ReplaceWithLclVar(op1Use);
-            op1 = node->Op(1);
+                LIR::Use op1Use(BlockRange(), &node->Op(1), node);
+                ReplaceWithLclVar(op1Use);
+                op1 = node->Op(1);

-            op2 = comp->gtClone(op1);
-            BlockRange().InsertAfter(op1, op2);
-            node->Op(2) = op2;
-        }
+                op2 = comp->gtClone(op1);
+                BlockRange().InsertAfter(op1, op2);
+                node->Op(2) = op2;
+            }

-        if (simdSize == 32)
-        {
-            // TODO-Review: LowerHWIntrinsicCC resets the id again, so why is this needed?
-            node->ChangeHWIntrinsicId(NI_AVX_TestZ);
-            LowerHWIntrinsicCC(node, NI_AVX_PTEST, cmpCnd);
-        }
-        else
-        {
-            assert(simdSize == 16);
+            if (simdSize == 32)
+            {
+                // TODO-Review: LowerHWIntrinsicCC resets the id again, so why is this needed?
+                node->ChangeHWIntrinsicId(NI_AVX_TestZ);
+                LowerHWIntrinsicCC(node, NI_AVX_PTEST, cmpCnd);
+            }
+            else
+            {
+                assert(simdSize == 16);

-            // TODO-Review: LowerHWIntrinsicCC resets the id again, so why is this needed?
-            node->ChangeHWIntrinsicId(NI_SSE41_TestZ);
-            LowerHWIntrinsicCC(node, NI_SSE41_PTEST, cmpCnd);
-        }
+                // TODO-Review: LowerHWIntrinsicCC resets the id again, so why is this needed?
+                node->ChangeHWIntrinsicId(NI_SSE41_TestZ);
+                LowerHWIntrinsicCC(node, NI_SSE41_PTEST, cmpCnd);
+            }

-        return LowerNode(node);
+            return LowerNode(node);
+        }
     }

     // TODO-XARCH-AVX512: We should handle TYP_SIMD12 here under the EVEX path, but doing
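A note on the And/AndNot folding visible in the lowering hunk above: PTEST sets ZF when (a & b) == 0 and CF when (~a & b) == 0, which is why a zero-comparison whose operand is an And can become TestZ(op1.op1, op1.op2), and one whose operand is an AndNot can become TestC with the condition switched to C/NC. A minimal sketch with SSE4.1 intrinsics (helper names are illustrative only):

#include <immintrin.h>

// "(x & y) == 0" maps onto PTEST's ZF flag, i.e. TestZ(x, y).
bool and_is_zero(__m128i x, __m128i y)
{
    return _mm_testz_si128(x, y) != 0; // ZF := ((x & y) == 0)
}

// "(~x & y) == 0" (AndNot) maps onto PTEST's CF flag, i.e. TestC(x, y).
bool andnot_is_zero(__m128i x, __m128i y)
{
    return _mm_testc_si128(x, y) != 0; // CF := ((~x & y) == 0)
}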