Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize Vector128/256.Equals via TestZ #55875

Merged
merged 11 commits into from
Oct 4, 2021
29 changes: 27 additions & 2 deletions src/coreclr/jit/importer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20219,8 +20219,33 @@ GenTree* Compiler::impInlineFetchArg(unsigned lclNum, InlArgInfo* inlArgInfo, In
// TODO-1stClassStructs: We currently do not reuse an existing lclVar
// if it is a struct, because it requires some additional handling.

if (!varTypeIsStruct(lclTyp) && !argInfo.argHasSideEff && !argInfo.argHasGlobRef &&
!argInfo.argHasCallerLocalRef)
bool substitute = false;
switch (argNode->OperGet())
{
#ifdef FEATURE_HW_INTRINSICS
case GT_HWINTRINSIC:
{
// Enable for all parameterless (=invariant) hw intrinsics such as
// Vector128<>.Empty and Vector256<>.AllBitSets. We might consider
EgorBo marked this conversation as resolved.
Show resolved Hide resolved
// doing that for Vector.Create(cns) as well.
if ((argNode->gtGetOp1() == nullptr) && (argNode->gtGetOp2() == nullptr))
{
substitute = true;
}
break;
}
#endif

// TODO: Enable substitution for CORINFO_HELP_TYPEHANDLE_TO_RUNTIMETYPE (typeof(T))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have an existing issue for this TODO?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's why I want to merge it - to start experimenting with it 🙂 the issue is #40381

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. I just wanted to make sure we had some github issue corresponding to the TODO, so its not just a comment we'll forget about

// but in order to benefit from that, we need to move various "typeof + IsValueType"
// optimizations from importer to morph.

default:
break;
}

if (substitute || (!varTypeIsStruct(lclTyp) && !argInfo.argHasSideEff && !argInfo.argHasGlobRef &&
!argInfo.argHasCallerLocalRef))
{
/* Get a *LARGE* LCL_VAR node */
op1 = gtNewLclLNode(tmpNum, genActualType(lclTyp) DEBUGARG(lclNum));
Expand Down
49 changes: 49 additions & 0 deletions src/coreclr/jit/morph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14290,6 +14290,55 @@ GenTree* Compiler::fgMorphSmpOpOptional(GenTreeOp* tree)
}
break;

#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
case GT_HWINTRINSIC:
{
GenTreeHWIntrinsic* hw = tree->AsHWIntrinsic();
switch (hw->gtHWIntrinsicId)
{
case NI_SSE_Xor:
case NI_SSE2_Xor:
case NI_AVX_Xor:
case NI_AVX2_Xor:
{
EgorBo marked this conversation as resolved.
Show resolved Hide resolved
// Optimize Xor(x, Vector_.Zero) to just x
EgorBo marked this conversation as resolved.
Show resolved Hide resolved
GenTree* op1 = hw->gtGetOp1();
GenTree* op2 = hw->gtGetOp2();

// Is node - Vector_.Zero ?
auto isHwZero = [](GenTree* node) -> bool {
EgorBo marked this conversation as resolved.
Show resolved Hide resolved
if (node->OperIs(GT_HWINTRINSIC))
{
NamedIntrinsic ni = node->AsHWIntrinsic()->gtHWIntrinsicId;
return (ni == NI_Vector128_get_Zero) || (ni == NI_Vector256_get_Zero);
}
return false;
};

if (isHwZero(op1))
{
DEBUG_DESTROY_NODE(tree);
DEBUG_DESTROY_NODE(op1);
INDEBUG(op2->gtDebugFlags |= GTF_DEBUG_NODE_MORPHED);
return op2;
}
if (isHwZero(op2))
{
DEBUG_DESTROY_NODE(tree);
DEBUG_DESTROY_NODE(op2);
INDEBUG(op1->gtDebugFlags |= GTF_DEBUG_NODE_MORPHED);
return op1;
}
break;
}

default:
break;
}
break;
}
#endif // defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)

default:
break;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,15 @@ public bool Equals(Vector128<T> other)
return Sse.MoveMask(result) == 0b1111; // We have one bit per element
}

if (Sse41.IsSupported && (typeof(T) != typeof(double)))
EgorBo marked this conversation as resolved.
Show resolved Hide resolved
{
Debug.Assert(typeof(T) != typeof(float));

// xor + testz is slightly better for integer types
Vector128<byte> xored = Sse2.Xor(this.AsByte(), other.AsByte());
return Sse41.TestZ(xored, xored);
}

if (Sse2.IsSupported)
{
if (typeof(T) == typeof(double))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,9 @@ public bool Equals(Vector256<T> other)
// bytes are exactly the same.

Debug.Assert((typeof(T) != typeof(float)) && (typeof(T) != typeof(double)));
Vector256<byte> result = Avx2.CompareEqual(this.AsByte(), other.AsByte());
return Avx2.MoveMask(result) == unchecked((int)(0b1111_1111_1111_1111_1111_1111_1111_1111)); // We have one bit per element

Vector256<byte> xored = Avx2.Xor(this.AsByte(), other.AsByte());
return Avx.TestZ(xored, xored);
}

return SoftwareFallback(in this, other);
Expand Down