JIT: Optimize SequenceEqual to use ccmp on ARM64 #92810

Merged · 2 commits · Sep 29, 2023
src/coreclr/jit/lower.cpp (145 changes: 83 additions & 62 deletions)
@@ -1798,11 +1798,12 @@ GenTree* Lowering::AddrGen(void* addr)
//
// Arguments:
// tree - GenTreeCall node to replace with STORE_BLK
+ //   next - [out] Next node to lower if this function returns true
//
// Return Value:
- //    nullptr if no changes were made
+ //    false if no changes were made
//
- GenTree* Lowering::LowerCallMemmove(GenTreeCall* call)
+ bool Lowering::LowerCallMemmove(GenTreeCall* call, GenTree** next)
{
JITDUMP("Considering Memmove [%06d] for unrolling.. ", comp->dspTreeID(call))
assert(comp->lookupNamedIntrinsic(call->gtCallMethHnd) == NI_System_Buffer_Memmove);
@@ -1812,7 +1813,7 @@ GenTree* Lowering::LowerCallMemmove(GenTreeCall* call)
if (comp->info.compHasNextCallRetAddr)
{
JITDUMP("compHasNextCallRetAddr=true so we won't be able to remove the call - bail out.\n")
- return nullptr;
+ return false;
}

GenTree* lengthArg = call->gtArgs.GetUserArgByIndex(2)->GetNode();
@@ -1857,7 +1858,9 @@ GenTree* Lowering::LowerCallMemmove(GenTreeCall* call)

JITDUMP("\nNew tree:\n")
DISPTREE(storeBlk);
- return storeBlk;
+ // TODO: This skips lowering srcBlk and storeBlk.
+ *next = storeBlk->gtNext;
+ return true;
}
else
{
@@ -1868,7 +1871,7 @@ GenTree* Lowering::LowerCallMemmove(GenTreeCall* call)
{
JITDUMP("size is not a constant.\n")
}
- return nullptr;
+ return false;
}

//------------------------------------------------------------------------
@@ -1877,11 +1880,12 @@ GenTree* Lowering::LowerCallMemcmp(GenTreeCall* call)
//
// Arguments:
// tree - GenTreeCall node to unroll as memcmp
+ //   next - [out] Next node to lower if this function returns true
//
// Return Value:
- //    nullptr if no changes were made
+ //    false if no changes were made
//
- GenTree* Lowering::LowerCallMemcmp(GenTreeCall* call)
+ bool Lowering::LowerCallMemcmp(GenTreeCall* call, GenTree** next)
{
JITDUMP("Considering Memcmp [%06d] for unrolling.. ", comp->dspTreeID(call))
assert(comp->lookupNamedIntrinsic(call->gtCallMethHnd) == NI_System_SpanHelpers_SequenceEqual);
@@ -1891,13 +1895,13 @@ GenTree* Lowering::LowerCallMemcmp(GenTreeCall* call)
if (!comp->opts.OptimizationEnabled())
{
JITDUMP("Optimizations aren't allowed - bail out.\n")
- return nullptr;
+ return false;
}

if (comp->info.compHasNextCallRetAddr)
{
JITDUMP("compHasNextCallRetAddr=true so we won't be able to remove the call - bail out.\n")
- return nullptr;
+ return false;
}

GenTree* lengthArg = call->gtArgs.GetUserArgByIndex(2)->GetNode();
@@ -2004,9 +2008,8 @@ GenTree* Lowering::LowerCallMemcmp(GenTreeCall* call)
GenTree* rIndir = comp->gtNewIndir(loadType, rArg);
result = newBinaryOp(comp, GT_EQ, TYP_INT, lIndir, rIndir);

- BlockRange().InsertAfter(lArg, lIndir);
- BlockRange().InsertAfter(rArg, rIndir);
- BlockRange().InsertBefore(call, result);
+ BlockRange().InsertBefore(call, lIndir, rIndir, result);
+ *next = lIndir;
}
else
{
@@ -2020,51 +2023,77 @@ GenTree* Lowering::LowerCallMemcmp(GenTreeCall* call)
GenTree* rArgClone = comp->gtNewLclvNode(rArgUse.ReplaceWithLclVar(comp), genActualType(rArg));
BlockRange().InsertBefore(call, lArgClone, rArgClone);

- // We're going to emit something like the following:
- //
- // bool result = ((*(int*)leftArg ^ *(int*)rightArg) |
- //                (*(int*)(leftArg + 1) ^ *((int*)(rightArg + 1)))) == 0;
- //
- // ^ in the given example we unroll for length=5
- //
- // In IR:
- //
- //  *  EQ        int
- //  +--*  OR        int
- //  |  +--*  XOR       int
- //  |  |  +--*  IND       int
- //  |  |  |  \--*  LCL_VAR   byref  V1
- //  |  |  \--*  IND       int
- //  |  |     \--*  LCL_VAR   byref  V2
- //  |  \--*  XOR       int
- //  |     +--*  IND       int
- //  |     |  \--*  ADD       byref
- //  |     |     +--*  LCL_VAR   byref  V1
- //  |     |     \--*  CNS_INT   int    1
- //  |     \--*  IND       int
- //  |        \--*  ADD       byref
- //  |           +--*  LCL_VAR   byref  V2
- //  |           \--*  CNS_INT   int    1
- //  \--*  CNS_INT   int    0
- //
+ *next = lArgClone;

GenTree* l1Indir = comp->gtNewIndir(loadType, lArgUse.Def());
GenTree* r1Indir = comp->gtNewIndir(loadType, rArgUse.Def());
- GenTree* lXor = newBinaryOp(comp, GT_XOR, actualLoadType, l1Indir, r1Indir);
GenTree* l2Offs = comp->gtNewIconNode(cnsSize - loadWidth, TYP_I_IMPL);
GenTree* l2AddOffs = newBinaryOp(comp, GT_ADD, lArg->TypeGet(), lArgClone, l2Offs);
GenTree* l2Indir = comp->gtNewIndir(loadType, l2AddOffs);
- GenTree* r2Offs = comp->gtCloneExpr(l2Offs); // offset is the same
+ GenTree* r2Offs = comp->gtNewIconNode(cnsSize - loadWidth, TYP_I_IMPL);
[Inline review thread]
Reviewer (Member): motivation? 🙂
jakobbotsch (Member, author) on Sep 29, 2023: No strong motivation... just that we were already creating all the other nodes from scratch, so it looked a bit out of place to me to clone one lone node. I can revert it if you prefer.
Reviewer (Member): I am fine with this; I was just wondering if you were improving TP impact or something like that.
GenTree* r2AddOffs = newBinaryOp(comp, GT_ADD, rArg->TypeGet(), rArgClone, r2Offs);
GenTree* r2Indir = comp->gtNewIndir(loadType, r2AddOffs);
- GenTree* rXor = newBinaryOp(comp, GT_XOR, actualLoadType, l2Indir, r2Indir);
- GenTree* resultOr = newBinaryOp(comp, GT_OR, actualLoadType, lXor, rXor);
- GenTree* zeroCns = comp->gtNewZeroConNode(actualLoadType);
- result = newBinaryOp(comp, GT_EQ, TYP_INT, resultOr, zeroCns);

BlockRange().InsertAfter(rArgClone, l1Indir, l2Offs, l2AddOffs, l2Indir);
BlockRange().InsertAfter(l2Indir, r1Indir, r2Offs, r2AddOffs, r2Indir);
- BlockRange().InsertAfter(r2Indir, lXor, rXor, resultOr, zeroCns);
- BlockRange().InsertAfter(zeroCns, result);

+ #ifdef TARGET_ARM64
+ if (!varTypeIsSIMD(loadType))
+ {
+     // ARM64 will get efficient ccmp codegen if we emit the normal thing:
[Inline review thread]
Reviewer (Member): I remember we discussed this and it didn't want to emit ccmp due to some side effects or something like that (probably my PR that re-ordered INDirs fixed that?)
jakobbotsch (Member, author): #92710 fixed it.
+     //
+     // bool result = (*(int*)leftArg == *(int*)rightArg) & (*(int*)(leftArg + 1) == *(int*)(rightArg + 1))

+     GenTree* eq1 = newBinaryOp(comp, GT_EQ, TYP_INT, l1Indir, r1Indir);
+     GenTree* eq2 = newBinaryOp(comp, GT_EQ, TYP_INT, l2Indir, r2Indir);
+     result       = newBinaryOp(comp, GT_AND, TYP_INT, eq1, eq2);
+
+     BlockRange().InsertAfter(r2Indir, eq1, eq2, result);
[Inline review thread]
Reviewer (Member): Ideally, it should work for SIMD as well and then be optimized: #89722
jakobbotsch (Member, author): I left a TODO-CQ about it below; presumably both the int and SIMD paths can benefit on xarch, while only SIMD can on arm64. I was thinking we could model it after, or combine it with, TryLowerAndOrToCCMP, although I can see your prototype did it in tree form, which probably makes it simpler.
+ }
+ #endif
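For context, a minimal sketch of the shape this ARM64 path produces, written as standalone C++. This is an editor's illustration, not code from the PR; the function name and the exact registers are assumptions. The point is that the non-short-circuiting `(a == b) & (c == d)` form lowers to cmp/ccmp/cset with no branches:

```cpp
// Editor's sketch: branch-free 16-byte equality in the (a == b) & (c == d)
// shape. AArch64 compilers emit roughly:
//   cmp  x8, x9
//   ccmp x10, x11, #0, eq   // second compare only matters if the first matched
//   cset w0, eq
#include <cstdint>
#include <cstring>

bool SequenceEqual16(const unsigned char* l, const unsigned char* r)
{
    std::uint64_t l0, l1, r0, r1;
    std::memcpy(&l0, l, 8);     // first 8 bytes of the left operand
    std::memcpy(&l1, l + 8, 8); // second 8 bytes
    std::memcpy(&r0, r, 8);
    std::memcpy(&r1, r + 8, 8);
    // Bitwise & (not &&) keeps both compares in straight-line code.
    return (l0 == r0) & (l1 == r1);
}
```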

+ if (result == nullptr)
+ {
+     // We're going to emit something like the following:
+     //
+     // bool result = ((*(int*)leftArg ^ *(int*)rightArg) |
+     //                (*(int*)(leftArg + 1) ^ *((int*)(rightArg + 1)))) == 0;
+     //
+     // ^ in the given example we unroll for length=5
+     //
+     // In IR:
+     //
+     //  *  EQ        int
+     //  +--*  OR        int
+     //  |  +--*  XOR       int
+     //  |  |  +--*  IND       int
+     //  |  |  |  \--*  LCL_VAR   byref  V1
+     //  |  |  \--*  IND       int
+     //  |  |     \--*  LCL_VAR   byref  V2
+     //  |  \--*  XOR       int
+     //  |     +--*  IND       int
+     //  |     |  \--*  ADD       byref
+     //  |     |     +--*  LCL_VAR   byref  V1
+     //  |     |     \--*  CNS_INT   int    1
+     //  |     \--*  IND       int
+     //  |        \--*  ADD       byref
+     //  |           +--*  LCL_VAR   byref  V2
+     //  |           \--*  CNS_INT   int    1
+     //  \--*  CNS_INT   int    0
+     //
+     // TODO-CQ: Do this as a general optimization similar to TryLowerAndOrToCCMP.
+
+     GenTree* lXor     = newBinaryOp(comp, GT_XOR, actualLoadType, l1Indir, r1Indir);
+     GenTree* rXor     = newBinaryOp(comp, GT_XOR, actualLoadType, l2Indir, r2Indir);
+     GenTree* resultOr = newBinaryOp(comp, GT_OR, actualLoadType, lXor, rXor);
+     GenTree* zeroCns  = comp->gtNewZeroConNode(actualLoadType);
+     result            = newBinaryOp(comp, GT_EQ, TYP_INT, resultOr, zeroCns);
+
+     BlockRange().InsertAfter(r2Indir, lXor, rXor, resultOr, zeroCns);
+     BlockRange().InsertAfter(zeroCns, result);
+ }
}

JITDUMP("\nUnrolled to:\n");
@@ -2090,7 +2119,7 @@ GenTree* Lowering::LowerCallMemcmp(GenTreeCall* call)
arg.GetNode()->SetUnusedValue();
}
}
- return lArg;
+ return true;
}
}
else
@@ -2102,7 +2131,7 @@ GenTree* Lowering::LowerCallMemcmp(GenTreeCall* call)
{
JITDUMP("size is not a constant.\n")
}
- return nullptr;
+ return false;
}

// do lowering steps for a call
@@ -2133,20 +2162,12 @@ GenTree* Lowering::LowerCall(GenTree* node)
#if defined(TARGET_AMD64) || defined(TARGET_ARM64)
if (call->gtCallMoreFlags & GTF_CALL_M_SPECIAL_INTRINSIC)
{
- GenTree* newNode = nullptr;
- NamedIntrinsic ni = comp->lookupNamedIntrinsic(call->gtCallMethHnd);
- if (ni == NI_System_Buffer_Memmove)
- {
-     newNode = LowerCallMemmove(call);
- }
- else if (ni == NI_System_SpanHelpers_SequenceEqual)
+ GenTree* nextNode = nullptr;
+ NamedIntrinsic ni = comp->lookupNamedIntrinsic(call->gtCallMethHnd);
+ if (((ni == NI_System_Buffer_Memmove) && LowerCallMemmove(call, &nextNode)) ||
+     ((ni == NI_System_SpanHelpers_SequenceEqual) && LowerCallMemcmp(call, &nextNode)))
{
-     newNode = LowerCallMemcmp(call);
- }
-
- if (newNode != nullptr)
- {
-     return newNode->gtNext;
+     return nextNode;
}
}
#endif
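An aside on the comment block above ("we unroll for length=5"): the second load is placed at offset cnsSize - loadWidth, so two loads cover the whole buffer by overlapping rather than requiring a third, smaller load. A hedged standalone C++ illustration (the function name is the editor's, not the JIT's):

```cpp
// Editor's sketch of the overlapping-load trick for cnsSize = 5, loadWidth = 4:
// loads at offsets 0 and 5 - 4 = 1 cover byte ranges [0..3] and [1..4],
// i.e. all five bytes; the middle bytes are compared twice, which is harmless.
#include <cstdint>
#include <cstring>

bool SequenceEqual5(const unsigned char* l, const unsigned char* r)
{
    std::uint32_t l0, l1, r0, r1;
    std::memcpy(&l0, l, 4);
    std::memcpy(&l1, l + 1, 4); // overlapping second load
    std::memcpy(&r0, r, 4);
    std::memcpy(&r1, r + 1, 4);
    // The XOR/OR fallback shape from the comment block above.
    return ((l0 ^ r0) | (l1 ^ r1)) == 0;
}
```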
src/coreclr/jit/lower.h (4 changes: 2 additions & 2 deletions)
@@ -133,8 +133,8 @@ class Lowering final : public Phase
// Call Lowering
// ------------------------------
GenTree* LowerCall(GenTree* call);
- GenTree* LowerCallMemmove(GenTreeCall* call);
- GenTree* LowerCallMemcmp(GenTreeCall* call);
+ bool LowerCallMemmove(GenTreeCall* call, GenTree** next);
+ bool LowerCallMemcmp(GenTreeCall* call, GenTree** next);
void LowerCFGCall(GenTreeCall* call);
void MoveCFGCallArg(GenTreeCall* call, GenTree* node);
#ifndef TARGET_64BIT
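Finally, since the PR title centers on ccmp: a hedged model of the AArch64 conditional-compare semantics that make the `(a == b) & (c == d)` shape profitable. Everything below is illustrative; none of these names exist in the JIT or in any library:

```cpp
// ccmp a, b, #nzcv, cond: if cond holds on the incoming flags, set flags by
// comparing a and b; otherwise load the immediate nzcv. This chains compares
// without branches: cmp + ccmp(..., #0, eq) leaves Z=1 iff both pairs match.
#include <cstdint>

struct Flags { bool n, z, c, v; };

Flags ccmp(Flags in, std::uint64_t a, std::uint64_t b, Flags fallbackNzcv,
           bool (*cond)(Flags))
{
    if (!cond(in))
        return fallbackNzcv;                  // condition failed: use immediate nzcv
    std::uint64_t d = a - b;
    Flags out{};
    out.n = static_cast<std::int64_t>(d) < 0; // negative result
    out.z = (a == b);                         // zero: operands equal
    out.c = a >= b;                           // carry: no unsigned borrow
    out.v = static_cast<std::int64_t>((a ^ b) & (a ^ d)) < 0; // signed overflow
    return out;
}
```

With cond reading Z (i.e. `eq`) and a fallback nzcv of all zeros, a failed first compare forces Z=0, so a final cset-on-eq reads false, matching the AND semantics of the unrolled comparison.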