Slightly improve MinOpts JIT TP #105250

Merged Jul 24, 2024 · 10 commits · Changes from 8 commits
2 changes: 1 addition & 1 deletion src/coreclr/jit/compiler.cpp
@@ -5273,7 +5273,7 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl

#ifdef DEBUG
// Stash the current estimate of the function's size if necessary.
if (verbose)
if (verbose && opts.OptimizationEnabled())
{
compSizeEstimate = 0;
compCycleEstimate = 0;
19 changes: 17 additions & 2 deletions src/coreclr/jit/compiler.h
@@ -3655,6 +3655,7 @@ class Compiler
bool gtMarkAddrMode(GenTree* addr, int* costEx, int* costSz, var_types type);

unsigned gtSetEvalOrder(GenTree* tree);
unsigned gtSetEvalOrderMinOpts(GenTree* tree);
bool gtMayHaveStoreInterference(GenTree* treeWithStores, GenTree* tree);
bool gtTreeHasLocalRead(GenTree* tree, unsigned lclNum);

@@ -9985,6 +9986,8 @@ class Compiler
// Maximum number of locals before turning off the inlining
#define MAX_LV_NUM_COUNT_FOR_INLINING 512

bool canUseTier0Opts;
bool canUseAllOpts;
bool compMinOpts;
bool compMinOptsIsSet;
#ifdef DEBUG
@@ -10011,13 +10014,22 @@
}
#endif // !DEBUG

// TODO: we should convert these into a single OptimizationLevel

bool OptimizationDisabled() const
{
return MinOpts() || compDbgCode;
assert(compMinOptsIsSet);
return !canUseAllOpts;
}
bool OptimizationEnabled() const
{
return !OptimizationDisabled();
assert(compMinOptsIsSet);
return canUseAllOpts;
}
bool Tier0OptimizationEnabled() const
Member:
A minor nit is that OptimizationEnabled and Tier0OptimizationEnabled both existing may be a bit confusing. Is there any idea what we would call the levels if this were some kind of OptimizationLevel enum, just for ideas on what we could call it as an alternative?

I think this is fine for now, because getting better names is a more involved change; just wanted to check if there were any other ideas/suggestions for the interim

Member Author:
Yep, I agree that it doesn't look pretty, but a proper refactoring will be quite big and hard to review

Member Author:
> any idea what we would call the levels if this were some kind of OptimizationLevel enum, just for ideas on what we could call it as an alternative?

Not yet. Andy suggested using OptLevel: NoOpts, Size, Speed, where Size implies "light-weight". The problem is that we already have optLevel in the JIT, but we never use it (it's always Blended), so it all needs some re-thinking.

{
assert(compMinOptsIsSet);
return canUseTier0Opts;
}

void SetMinOpts(bool val)
@@ -10026,6 +10038,9 @@
assert(!compMinOptsIsSet || (compMinOpts == val));
compMinOpts = val;
compMinOptsIsSet = true;

canUseTier0Opts = !compDbgCode && !jitFlags->IsSet(JitFlags::JIT_FLAG_MIN_OPT);
canUseAllOpts = canUseTier0Opts && !val;
}

// true if the CLFLG_* for an optimization is set.
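Picking up the OptimizationLevel naming discussion above: a minimal, purely illustrative sketch of what such an enum could look like, assuming the NoOpts/Size/Speed names floated in the review thread. None of this is part of the PR; the mapping to the new flags is only a guess at intent.

```cpp
// Hypothetical sketch only; names follow the review discussion, not a committed design.
enum class OptimizationLevel
{
    NoOpts, // compDbgCode or JIT_FLAG_MIN_OPT: neither Tier0 nor full opts (canUseTier0Opts == false)
    Size,   // typical Tier0: only light-weight opts (canUseTier0Opts && !canUseAllOpts)
    Speed   // full optimization (canUseAllOpts == true)
};
```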
217 changes: 198 additions & 19 deletions src/coreclr/jit/gentree.cpp
@@ -3931,8 +3931,10 @@ unsigned Compiler::gtSetMultiOpOrder(GenTreeMultiOp* multiOp)
int costSz = 1;
unsigned level = 0;

bool optsEnabled = opts.OptimizationEnabled();

#if defined(FEATURE_HW_INTRINSICS)
if (multiOp->OperIs(GT_HWINTRINSIC))
if (multiOp->OperIs(GT_HWINTRINSIC) && optsEnabled)
{
GenTreeHWIntrinsic* hwTree = multiOp->AsHWIntrinsic();
#if defined(TARGET_XARCH)
@@ -4052,8 +4054,12 @@ unsigned Compiler::gtSetMultiOpOrder(GenTreeMultiOp* multiOp)
level += 1;
}

costEx += (multiOp->Op(1)->GetCostEx() + multiOp->Op(2)->GetCostEx());
costSz += (multiOp->Op(1)->GetCostSz() + multiOp->Op(2)->GetCostSz());
if (optsEnabled)
{
// We don't need/have costs in MinOpts
costEx += (multiOp->Op(1)->GetCostEx() + multiOp->Op(2)->GetCostEx());
costSz += (multiOp->Op(1)->GetCostSz() + multiOp->Op(2)->GetCostSz());
}
}
else
{
@@ -4064,12 +4070,19 @@

level = max(lvl, level + 1);

costEx += op->GetCostEx();
costSz += op->GetCostSz();
if (optsEnabled)
{
// We don't need/have costs in MinOpts
costEx += op->GetCostEx();
costSz += op->GetCostSz();
}
}
}

multiOp->SetCosts(costEx, costSz);
if (optsEnabled)
{
multiOp->SetCosts(costEx, costSz);
}
return level;
}
#endif
@@ -4848,6 +4861,11 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree)
{
assert(tree);

if (opts.OptimizationDisabled())
{
return gtSetEvalOrderMinOpts(tree);
}

#ifdef DEBUG
/* Clear the GTF_DEBUG_NODE_MORPHED flag as well */
tree->gtDebugFlags &= ~GTF_DEBUG_NODE_MORPHED;
@@ -6212,6 +6230,176 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree)
#pragma warning(pop)
#endif

//------------------------------------------------------------------------
// gtSetEvalOrderMinOpts: A MinOpts specific version of gtSetEvalOrder. We don't
// need to set costs, but we're looking for opportunities to swap operands.
//
// Arguments:
// tree - The tree for which we are setting the evaluation order.
//
// Return Value:
// the Sethi 'complexity' estimate for this tree (the higher
// the number, the higher is the tree's resources requirement)
//
unsigned Compiler::gtSetEvalOrderMinOpts(GenTree* tree)
{
assert(tree);
if (fgOrder == FGOrderLinear)
{
// We don't re-order operands in LIR anyway.
return 0;
Member (on lines +6261 to +6262):
Does it ever get called?

Member Author:
> Does it ever get called?

Not sure, but the original function gtSetEvalOrder contained this check

}

if (tree->OperIsLeaf())
{
// Nothing to do for leaves, report as having Sethi 'complexity' of 0
return 0;
}

unsigned level = 1;
if (tree->OperIsSimple())
Member:
nit: Might be worth a small comment saying that we are intentionally not handling hwintrinsics (they otherwise normally support GTF_REVERSE_OPS and other considerations).

Member Author:
I'll add a comment, but we skip a lot of trees here; it was driven by asmdiff + tpdiff results.

Member:
> but we skip a lot of trees here

👍, had mostly thought it might be worth a callout because it normally mirrors a lot of the SimpleOp handling (unlike any other node), so figured it might be worth a special note/callout

{
GenTree* op1 = tree->AsOp()->gtOp1;
GenTree* op2 = tree->gtGetOp2IfPresent();

// Only GT_LEA may have a nullptr op1 and a non-nullptr op2
if (tree->OperIs(GT_LEA) && (op1 == nullptr))
{
std::swap(op1, op2);
}

// Check for a nilary operator
if (op1 == nullptr)
{
assert(op2 == nullptr);
Member:
In what cases do we get here?

Member Author (@EgorBo, Jul 23, 2024):
It's also copied from the original function. Looks like a void GT_RETURN is such a case (and RETFILT).

return level;
}

if (op2 == nullptr)
{
gtSetEvalOrderMinOpts(op1);
return level;
}

level = gtSetEvalOrderMinOpts(op1);
unsigned levelOp2 = gtSetEvalOrderMinOpts(op2);

bool allowSwap = true;
// TODO: Introduce a function to check whether we can swap the order of its operands or not.
Member:
Should this TODO be removed? Not sure I understand what it refers to. If it refers to extracting the below logic into a function, why not just do that in this PR?

Member Author:
I'd better remove the comment then for now, I don't mind. The idea is to share quirks with the Tier1 version (esp for STOREIND/STORE_BLK)

Member:
Might be useful enough to leave a comment that it should be kept in sync with the quirks in the tier 1 version (and likewise in the tier 1 version)

Member Author:
Moved both to a single helper, to keep the quirks in one place

switch (tree->OperGet())
{
case GT_COMMA:
case GT_BOUNDS_CHECK:
case GT_INTRINSIC:
case GT_QMARK:
case GT_COLON:
// We're not going to swap operands in these
allowSwap = false;
break;

case GT_STORE_BLK:
case GT_STOREIND:
{
if (op1->IsInvariant())
{
allowSwap = false;
tree->SetReverseOp();
break;
}
if ((op1->gtFlags & GTF_ALL_EFFECT) != 0)
{
break;
}

// In case op2 assigns to a local var that is used in op1, we have to evaluate op1 first.
if (gtMayHaveStoreInterference(op2, op1))
Member:
You probably don't need this expensive version in MinOpts. It can be the conservative (op2->gtFlags & GTF_ASG) != 0 that existed before #97409.

Member Author:
> You probably don't need this expensive version in MinOpts. It can be the conservative (op2->gtFlags & GTF_ASG) != 0 that existed before #97409.

Just tried it - it adds a +28k bytes size regression for the benchmarks.run_pgo collection.

Member:
Ah ok, fine to leave it then. (Actually looks like I called out explicitly doing it in MinOpts in that PR as well)

{
// TODO-ASG-Cleanup: move this guard to "gtCanSwapOrder".
allowSwap = false;
break;
}

// If op2 is simple then evaluate op1 first
if (op2->OperIsLeaf())
{
break;
}

allowSwap = false;
tree->SetReverseOp();
break;
}

default:
break;
}

const bool shouldSwap = tree->IsReverseOp() ? level > levelOp2 : level < levelOp2;
if (shouldSwap && allowSwap)
{
// Can we swap the order by commuting the operands?
const bool canSwap = tree->IsReverseOp() ? gtCanSwapOrder(op2, op1) : gtCanSwapOrder(op1, op2);
if (canSwap)
{
if (tree->OperIsCmpCompare())
{
genTreeOps oper = tree->OperGet();
if (GenTree::SwapRelop(oper) != oper)
{
tree->SetOper(GenTree::SwapRelop(oper));
}
std::swap(tree->AsOp()->gtOp1, tree->AsOp()->gtOp2);
}
else if (tree->OperIsCommutative())
{
std::swap(tree->AsOp()->gtOp1, tree->AsOp()->gtOp2);
}
else
{
// Mark the operand's evaluation order to be swapped.
tree->gtFlags ^= GTF_REVERSE_OPS;
}
}
}

// Swap the level counts
if (tree->IsReverseOp())
{
std::swap(level, levelOp2);
}

// Compute the sethi number for this binary operator
if (level < 1)
{
level = levelOp2;
}
else if (level == levelOp2)
{
level++;
}
}
else if (tree->IsCall())
{
// We ignore late args - they don't bring any noticeable benefits
// according to asmdiffs/tpdiff
for (CallArg& arg : tree->AsCall()->gtArgs.EarlyArgs())
{
gtSetEvalOrderMinOpts(arg.GetEarlyNode());
}
level = 3;
}
#if defined(FEATURE_HW_INTRINSICS)
else if (tree->OperIsHWIntrinsic())
{
return gtSetMultiOpOrder(tree->AsMultiOp());
}
#endif // FEATURE_HW_INTRINSICS

// NOTE: we skip many operators here in order to maintain a good trade-off between CQ and TP.

return level;
}

//------------------------------------------------------------------------
// gtMayHaveStoreInterference: Check if two trees may interfere because of a
// store in one of the trees.
@@ -13340,10 +13528,7 @@ GenTree* Compiler::gtFoldExpr(GenTree* tree)
return tree;
}

// NOTE: MinOpts() is always true for Tier0 so we have to check explicit flags instead.
// To be fixed in https://github.com/dotnet/runtime/pull/77465
const bool tier0opts = !opts.compDbgCode && !opts.jitFlags->IsSet(JitFlags::JIT_FLAG_MIN_OPT);
if (!tier0opts)
if (!opts.Tier0OptimizationEnabled())
tannergooding marked this conversation as resolved.
{
return tree;
}
@@ -13406,7 +13591,7 @@ GenTree* Compiler::gtFoldExpr(GenTree* tree)
// special operator that can use only one constant
// to fold - e.g. booleans

if (tier0opts && opts.OptimizationDisabled())
if (opts.OptimizationDisabled())
{
// Too heavy for tier0
return tree;
@@ -15197,10 +15382,7 @@ GenTree* Compiler::gtFoldExprConst(GenTree* tree)
GenTree* op1 = tree->gtGetOp1();
GenTree* op2 = tree->gtGetOp2IfPresent();

// NOTE: MinOpts() is always true for Tier0 so we have to check explicit flags instead.
// To be fixed in https://github.com/dotnet/runtime/pull/77465
const bool tier0opts = !opts.compDbgCode && !opts.jitFlags->IsSet(JitFlags::JIT_FLAG_MIN_OPT);
if (!tier0opts)
if (!opts.Tier0OptimizationEnabled())
{
return tree;
}
@@ -30267,10 +30449,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
{
assert(tree->OperIsHWIntrinsic());

// NOTE: MinOpts() is always true for Tier0 so we have to check explicit flags instead.
// To be fixed in https://github.com/dotnet/runtime/pull/77465
const bool tier0opts = !opts.compDbgCode && !opts.jitFlags->IsSet(JitFlags::JIT_FLAG_MIN_OPT);
if (!tier0opts)
if (!opts.Tier0OptimizationEnabled())
{
return tree;
}
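As a brief aside, the "Sethi 'complexity'" that gtSetEvalOrderMinOpts returns follows the classic Sethi-Ullman idea: a node needs one more unit of evaluation resources than its operands only when both operands are equally complex. Below is a self-contained sketch of that numbering on a toy tree; the types and the helper are hypothetical (not JIT code), and the real function layers the swap and STOREIND/STORE_BLK quirk handling shown in the diff on top of this.

```cpp
#include <algorithm>
#include <cstdio>

// Toy tree type for illustration only (not the JIT's GenTree).
struct Node
{
    Node* op1 = nullptr;
    Node* op2 = nullptr;
};

// Classic Sethi-Ullman style numbering: leaves are 0 (as in the PR), a unary
// node is 1, and a binary node costs one extra only when both operands tie.
unsigned sethiNumber(const Node* n)
{
    if ((n->op1 == nullptr) && (n->op2 == nullptr))
    {
        return 0; // leaf
    }
    if (n->op2 == nullptr)
    {
        sethiNumber(n->op1);
        return 1; // unary
    }
    unsigned l = sethiNumber(n->op1);
    unsigned r = sethiNumber(n->op2);
    return (l == r) ? (l + 1) : std::max(l, r);
}

int main()
{
    // ((a + b) + (c + d)): leaves are 0, each inner node is 1, the root is 2.
    // A second operand that is at least as complex as the first is what makes
    // swapping the evaluation order attractive.
    Node a, b, c, d;
    Node ab{&a, &b};
    Node cd{&c, &d};
    Node root{&ab, &cd};
    printf("%u\n", sethiNumber(&root)); // prints 2
    return 0;
}
```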
8 changes: 2 additions & 6 deletions src/coreclr/jit/importercalls.cpp
@@ -1553,7 +1553,7 @@ GenTree* Compiler::impThrowIfNull(GenTreeCall* call)
assert(call->gtArgs.CountUserArgs() == 2);
assert(call->TypeIs(TYP_VOID));

if (opts.compDbgCode || opts.jitFlags->IsSet(JitFlags::JIT_FLAG_MIN_OPT))
if (!opts.Tier0OptimizationEnabled())
{
// Don't fold it for debug code or forced MinOpts
return call;
@@ -3302,11 +3302,7 @@ GenTree* Compiler::impIntrinsic(CORINFO_CLASS_HANDLE clsHnd,

// Allow some lighweight intrinsics in Tier0 which can improve throughput
// we're fine if intrinsic decides to not expand itself in this case unlike mustExpand.
// NOTE: MinOpts() is always true for Tier0 so we have to check explicit flags instead.
// To be fixed in https://github.com/dotnet/runtime/pull/77465
const bool tier0opts = !opts.compDbgCode && !opts.jitFlags->IsSet(JitFlags::JIT_FLAG_MIN_OPT);

if (!mustExpand && tier0opts)
if (!mustExpand && opts.Tier0OptimizationEnabled())
{
switch (ni)
{
10 changes: 8 additions & 2 deletions src/coreclr/jit/morph.cpp
@@ -1103,7 +1103,7 @@ void CallArgs::ArgsComplete(Compiler* comp, GenTreeCall* call)
// TODO-CQ: handle HWI/SIMD/COMMA nodes in multi-reg morphing.
SetNeedsTemp(&arg);
}
else
else if (comp->opts.OptimizationEnabled())
{
// Finally, we call gtPrepareCost to measure the cost of evaluating this tree.
comp->gtPrepareCost(argx);
@@ -1476,7 +1476,7 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
assert(begTab == endTab);
break;
}
else
else if (comp->opts.OptimizationEnabled())
{
if (!costsPrepared)
{
@@ -1492,6 +1492,12 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
expensiveArg = arg;
}
}
else
{
// We don't have cost information in MinOpts
expensiveArgIndex = curInx;
expensiveArg = arg;
}
}
}

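Finally, a rough stand-alone sketch of the SortArgs fallback this hunk adds: with optimization enabled, the most expensive remaining argument is chosen by its prepared cost, while under MinOpts no costs were computed, so the last remaining argument is simply treated as the expensive one. All names and types here are hypothetical stand-ins, not the real CallArgs code.

```cpp
#include <cstddef>
#include <vector>

// Illustration only: a stand-in for one pass of the "find the most expensive
// remaining argument" selection, with the MinOpts fallback added by this PR.
struct ArgInfo
{
    unsigned costEx = 0;    // only meaningful when costs were prepared
    bool     placed = false;
};

std::size_t pickExpensiveArg(const std::vector<ArgInfo>& args, bool optimizationsEnabled)
{
    std::size_t pick = args.size(); // "none found" sentinel
    unsigned    best = 0;

    for (std::size_t i = 0; i < args.size(); i++)
    {
        if (args[i].placed)
        {
            continue;
        }
        if (!optimizationsEnabled)
        {
            // MinOpts: no cost information, just remember the current arg.
            pick = i;
        }
        else if ((pick == args.size()) || (args[i].costEx > best))
        {
            best = args[i].costEx;
            pick = i;
        }
    }
    return pick;
}
```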