Skip to content

Commit

Permalink
JIT: Check for potential store-to-load forwarding before reordering ldr -> ldp
Browse files Browse the repository at this point in the history

Very targeted fix for dotnet#93401 and dotnet#101437: before reordering two
indirections, check if there is a potential store in the same loop that
looks like it could end up being a candidate for store-to-load
forwarding into one of those indirections. Some hardware does not handle
store-to-load forwarding with the same fidelity when `stp`/`ldp` is
involved compared to multiple `str`/`ldr`.

If we detect the situation then avoid doing the reordering.
  • Loading branch information
jakobbotsch committed Jul 30, 2024
1 parent 5d6f36f commit 2b1a2b8
Show file tree
Hide file tree
Showing 4 changed files with 257 additions and 47 deletions.
1 change: 1 addition & 0 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -6238,6 +6238,7 @@ class Compiler
PhaseStatus fgUpdateFlowGraphPhase();

PhaseStatus fgDfsBlocksAndRemove();
bool fgRemoveBlocksOutsideDfsTree();

PhaseStatus fgFindOperOrder();

Expand Down
98 changes: 54 additions & 44 deletions src/coreclr/jit/fgopt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5742,72 +5742,82 @@ PhaseStatus Compiler::fgDfsBlocksAndRemove()
fgInvalidateDfsTree();
m_dfsTree = fgComputeDfs();

PhaseStatus status = PhaseStatus::MODIFIED_NOTHING;
if (m_dfsTree->GetPostOrderCount() != fgBBcount)
return fgRemoveBlocksOutsideDfsTree() ? PhaseStatus::MODIFIED_EVERYTHING : PhaseStatus::MODIFIED_NOTHING;
}

//-------------------------------------------------------------
// fgRemoveBlocksOutsideDfsTree: Remove the blocks that are not in the current DFS tree.
//
// Returns:
// True if any block was removed.
//
bool Compiler::fgRemoveBlocksOutsideDfsTree()
{
if (m_dfsTree->GetPostOrderCount() == fgBBcount)
{
return false;
}

#ifdef DEBUG
if (verbose)
if (verbose)
{
printf("%u/%u blocks are unreachable and will be removed:\n", fgBBcount - m_dfsTree->GetPostOrderCount(),
fgBBcount);
for (BasicBlock* block : Blocks())
{
printf("%u/%u blocks are unreachable and will be removed:\n", fgBBcount - m_dfsTree->GetPostOrderCount(),
fgBBcount);
for (BasicBlock* block : Blocks())
if (!m_dfsTree->Contains(block))
{
if (!m_dfsTree->Contains(block))
{
printf(" " FMT_BB "\n", block->bbNum);
}
printf(" " FMT_BB "\n", block->bbNum);
}
}
}
#endif // DEBUG

// The DFS we run is not precise around call-finally, so
// `fgRemoveUnreachableBlocks` can expose newly unreachable blocks
// that we did not uncover during the DFS. If we did remove any
// call-finally blocks then iterate to closure. This is a very rare
// case.
while (true)
{
bool anyCallFinallyPairs = false;
fgRemoveUnreachableBlocks([=, &anyCallFinallyPairs](BasicBlock* block) {
if (!m_dfsTree->Contains(block))
{
anyCallFinallyPairs |= block->isBBCallFinallyPair();
return true;
}

return false;
});

if (!anyCallFinallyPairs)
// The DFS we run is not precise around call-finally, so
// `fgRemoveUnreachableBlocks` can expose newly unreachable blocks
// that we did not uncover during the DFS. If we did remove any
// call-finally blocks then iterate to closure. This is a very rare
// case.
while (true)
{
bool anyCallFinallyPairs = false;
fgRemoveUnreachableBlocks([=, &anyCallFinallyPairs](BasicBlock* block) {
if (!m_dfsTree->Contains(block))
{
break;
anyCallFinallyPairs |= block->isBBCallFinallyPair();
return true;
}

m_dfsTree = fgComputeDfs();
return false;
});

if (!anyCallFinallyPairs)
{
break;
}

m_dfsTree = fgComputeDfs();
}

#ifdef DEBUG
// Did we actually remove all the blocks we said we were going to?
if (verbose)
// Did we actually remove all the blocks we said we were going to?
if (verbose)
{
if (m_dfsTree->GetPostOrderCount() != fgBBcount)
{
if (m_dfsTree->GetPostOrderCount() != fgBBcount)
printf("%u unreachable blocks were not removed:\n", fgBBcount - m_dfsTree->GetPostOrderCount());
for (BasicBlock* block : Blocks())
{
printf("%u unreachable blocks were not removed:\n", fgBBcount - m_dfsTree->GetPostOrderCount());
for (BasicBlock* block : Blocks())
if (!m_dfsTree->Contains(block))
{
if (!m_dfsTree->Contains(block))
{
printf(" " FMT_BB "\n", block->bbNum);
}
printf(" " FMT_BB "\n", block->bbNum);
}
}
}
#endif // DEBUG

status = PhaseStatus::MODIFIED_EVERYTHING;
}
#endif // DEBUG

return status;
return true;
}

//-------------------------------------------------------------
Expand Down
204 changes: 201 additions & 3 deletions src/coreclr/jit/lower.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1295,6 +1295,8 @@ GenTree* Lowering::LowerSwitch(GenTree* node)
switchBBRange.Remove(node->AsOp()->gtOp1);
switchBBRange.Remove(node);

comp->fgInvalidateDfsTree();

return next;
}

Expand Down Expand Up @@ -7695,9 +7697,17 @@ PhaseStatus Lowering::DoPhase()
const bool setSlotNumbers = false;
comp->lvaComputeRefCounts(isRecompute, setSlotNumbers);

// Remove dead blocks and compute DFS (we want to remove unreachable blocks
// even in MinOpts).
comp->fgDfsBlocksAndRemove();
if (comp->m_dfsTree == nullptr)
{
// Compute DFS tree. We want to remove dead blocks even in MinOpts, so we
// do this everywhere. The dead blocks are removed below, however, some of
// lowering may use the DFS tree, so we compute that here.
comp->m_dfsTree = comp->fgComputeDfs();
}

// Remove dead blocks. We want to remove unreachable blocks even in
// MinOpts.
comp->fgRemoveBlocksOutsideDfsTree();

if (comp->backendRequiresLocalVarLifetimes())
{
Expand Down Expand Up @@ -9519,6 +9529,17 @@ bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indi
}
}

// For some hardware the ldr -> ldp transformation can result in missed
// store-to-load forwarding opportunities, which can seriously harm
// performance. Here we do a best effort check to see if one of the loads
// we are combining may be loading from a store that reaches the load
// without redefining the address.
if (prevIndir->OperIsLoad() && indir->OperIsLoad() && IsStoreToLoadForwardingCandidateInLoop(prevIndir, indir))
{
JITDUMP("Avoiding making indirs adjacent; this may be the target of a store-to-load forwarding candidate\n");
return false;
}

JITDUMP("Moving nodes that are not part of data flow of [%06u]\n\n", Compiler::dspTreeID(indir));

GenTree* previous = prevIndir;
Expand Down Expand Up @@ -9565,6 +9586,183 @@ bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indi
return true;
}

//------------------------------------------------------------------------
// IsStoreToLoadForwardingCandidateInLoop: Check if one of the specified
// indirections may be the target of store-to-load forwarding from an indirect
// store that reaches one of the loads and that happens within the same loop.
// In those cases the transformation to 'ldp' can break this hardware
// optimization for some hardware.
//
// Arguments:
//   prevIndir - First indirection
//   indir     - Second indirection
//
// Returns:
//   True if so.
//
bool Lowering::IsStoreToLoadForwardingCandidateInLoop(GenTreeIndir* prevIndir, GenTreeIndir* indir)
{
    // The analysis below needs the DFS tree and natural loops; compute them
    // lazily since most calls bail out before needing the loop walk.
    if (comp->m_dfsTree == nullptr)
    {
        comp->m_dfsTree = comp->fgComputeDfs();
    }

    if (!comp->m_dfsTree->HasCycle())
    {
        // No cycles in the flow graph means no loop-carried store-to-load
        // forwarding is possible.
        return false;
    }

    if (comp->m_loops == nullptr)
    {
        comp->m_loops       = FlowGraphNaturalLoops::Find(comp->m_dfsTree);
        comp->m_blockToLoop = BlockToNaturalLoopMap::Build(comp->m_loops);
    }

    FlowGraphNaturalLoop* loop = comp->m_blockToLoop->GetLoop(m_block);
    if (loop == nullptr)
    {
        // The indirs are not inside a loop; a store before them in the same
        // method runs at most once, so forwarding is not a recurring concern.
        return false;
    }

    // Decompose each load address into (base local, constant offset) so we can
    // match candidate stores against either load.
    GenTree*       addr1 = prevIndir->Addr();
    target_ssize_t offs1;
    comp->gtPeelOffsets(&addr1, &offs1);
    unsigned lcl1 = addr1->OperIs(GT_LCL_VAR) ? addr1->AsLclVarCommon()->GetLclNum() : BAD_VAR_NUM;

    GenTree*       addr2 = indir->Addr();
    target_ssize_t offs2;
    comp->gtPeelOffsets(&addr2, &offs2);
    // Note: must check addr2 (not addr1) before treating addr2 as a local.
    unsigned lcl2 = addr2->OperIs(GT_LCL_VAR) ? addr2->AsLclVarCommon()->GetLclNum() : BAD_VAR_NUM;

    // Best-effort: bound the number of nodes we are willing to inspect.
    unsigned budget = 100;

    // Starting at an end node, go backwards until the specified first node and look for
    // 1) Definitions of the base address local, which invalidates future store-to-load forwarding
    // 2) Stores to the same address as is being loaded later, which allows store-to-load forwarding
    //
    // Returns false when the budget is exhausted, true otherwise (with
    // *hasStore/*hasDef describing what was found first).
    auto checkNodes = [=, &budget](GenTree* lastNode, GenTree* firstNode, bool* hasStore, bool* hasDef) {
        *hasStore = false;
        *hasDef   = false;

        for (GenTree* curNode = lastNode;; curNode = curNode->gtPrev)
        {
            if (curNode->OperIs(GT_STORE_LCL_VAR))
            {
                unsigned lclNum = curNode->AsLclVarCommon()->GetLclNum();
                if ((lclNum == lcl1) || (lclNum == lcl2))
                {
                    // Base address is redefined between the store and the
                    // loads; forwarding cannot happen across this point.
                    *hasDef = true;
                    return true;
                }
            }
            else if (curNode->OperIs(GT_STOREIND))
            {
                GenTreeIndir*  storeInd      = curNode->AsIndir();
                GenTree*       storeIndirAddr = storeInd->Addr();
                target_ssize_t storeIndirOffs;
                comp->gtPeelOffsets(&storeIndirAddr, &storeIndirOffs);

                if (storeIndirAddr->OperIs(GT_LCL_VAR) && ((storeIndirOffs == offs1) || (storeIndirOffs == offs2)))
                {
                    unsigned storeIndirAddrLcl = storeIndirAddr->AsLclVarCommon()->GetLclNum();
                    if ((storeIndirAddrLcl == lcl1) || (storeIndirAddrLcl == lcl2))
                    {
                        JITDUMP("Store at [%06u] may allow store-to-load forwarding of indir [%06u]\n",
                                Compiler::dspTreeID(curNode),
                                Compiler::dspTreeID(storeIndirAddrLcl == lcl1 ? prevIndir : indir));

                        *hasStore = true;
                        return true;
                    }
                }
            }

            if (curNode == firstNode)
            {
                break;
            }

            if (--budget == 0)
            {
                // Out of budget; give up on the analysis.
                return false;
            }
        }

        return true;
    };

    bool hasStore;
    bool hasDef;
    if (!checkNodes(prevIndir, LIR::AsRange(m_block).FirstNode(), &hasStore, &hasDef))
    {
        // Out of budget
        return false;
    }

    if (hasStore)
    {
        // Have a store before the indir; it could be store-to-load forwarded.
        return true;
    }

    if (hasDef)
    {
        // Have a def before the indir; it would break store-to-load
        // forwarding. No preds to push then, so we are done.
        return false;
    }

    // Now we've checked range before the indirs; continue with its preds
    // inside the loop. We will check the range after the indirs once we get to
    // it.
    BitVecTraits traits = comp->m_dfsTree->PostOrderTraits();
    BitVec       visited(BitVecOps::MakeEmpty(&traits));

    ArrayStack<BasicBlock*> stack(comp->getAllocator(CMK_ArrayStack));

    auto pushPreds = [=, &traits, &visited, &stack](BasicBlock* block) {
        for (BasicBlock* pred : block->PredBlocks())
        {
            if (loop->ContainsBlock(pred) && BitVecOps::TryAddElemD(&traits, visited, pred->bbPostorderNum))
            {
                stack.Push(pred);
            }
        }
    };

    pushPreds(m_block);

    while (stack.Height() > 0)
    {
        BasicBlock* block = stack.Pop();

        LIR::Range& range = LIR::AsRange(block);

        // When we reach m_block again (via a loop backedge) only the range
        // after the indirs remains to be checked.
        GenTree* firstNode = block == m_block ? prevIndir : range.FirstNode();

        if (firstNode == nullptr)
        {
            // Empty block: nothing to inspect here, but do not reuse the
            // stale hasStore/hasDef values from the previous iteration --
            // just keep walking the preds.
            pushPreds(block);
            continue;
        }

        if (!checkNodes(range.LastNode(), firstNode, &hasStore, &hasDef))
        {
            // Out of budget
            return false;
        }

        if (hasStore)
        {
            // This would be store-to-load forwardable.
            return true;
        }

        if (hasDef)
        {
            // Redefinition of base local; skip pushing preds
            continue;
        }

        pushPreds(block);
    }

    return false;
}

//------------------------------------------------------------------------
// MarkTree: Mark trees involved in the computation of 'node' recursively.
//
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/lower.h
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,7 @@ class Lowering final : public Phase
GenTree* LowerIndir(GenTreeIndir* ind);
bool OptimizeForLdpStp(GenTreeIndir* ind);
bool TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indir);
bool IsStoreToLoadForwardingCandidateInLoop(GenTreeIndir* prevIndir, GenTreeIndir* indir);
bool TryMoveAddSubRMWAfterIndir(GenTreeLclVarCommon* store);
bool TryMakeIndirAndStoreAdjacent(GenTreeIndir* prevIndir, GenTreeLclVarCommon* store);
void MarkTree(GenTree* root);
Expand Down

0 comments on commit 2b1a2b8

Please sign in to comment.