From 69c9bf8aa33c63478ec9f4e7903d449474add468 Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Thu, 1 Aug 2024 14:32:48 +0200 Subject: [PATCH] JIT: Check for potential store-to-load forwarding before reordering ldr -> ldp (#105695) Very targeted fix for #93401 and #101437: before reordering two indirections, check if there is a potential store in the same loop that looks like it could end up being a candidate for store-to-load forwarding into one of those indirections. Some hardware does not handle store-to-load forwarding with the same fidelity when `stp`/`ldp` is involved compared to multiple `str`/`ldr`. The detection is done by a graph walk that starts at the indirection and then walks backwards until it finds a store that would reach the indirection. The walk is limited to stay within the same loop as the indirections, and also limited by a budget of 100 nodes visited (for a large loop we expect this to not be as important). If we detect the situation then avoid doing the reordering. Fix #93401 Fix #101437 --- src/coreclr/jit/compiler.h | 1 + src/coreclr/jit/fgopt.cpp | 98 ++++++++++-------- src/coreclr/jit/lower.cpp | 204 ++++++++++++++++++++++++++++++++++++- src/coreclr/jit/lower.h | 1 + 4 files changed, 257 insertions(+), 47 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 24fd48b2ef3b7..a26aabda27926 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6238,6 +6238,7 @@ class Compiler PhaseStatus fgUpdateFlowGraphPhase(); PhaseStatus fgDfsBlocksAndRemove(); + bool fgRemoveBlocksOutsideDfsTree(); PhaseStatus fgFindOperOrder(); diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 9e11b12c091f3..22b1e7127a36e 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -5742,72 +5742,82 @@ PhaseStatus Compiler::fgDfsBlocksAndRemove() fgInvalidateDfsTree(); m_dfsTree = fgComputeDfs(); - PhaseStatus status = PhaseStatus::MODIFIED_NOTHING; - if (m_dfsTree->GetPostOrderCount() != fgBBcount) + return fgRemoveBlocksOutsideDfsTree() ? PhaseStatus::MODIFIED_EVERYTHING : PhaseStatus::MODIFIED_NOTHING; +} + +//------------------------------------------------------------- +// fgRemoveBlocksOutsideDfsTree: Remove the blocks that are not in the current DFS tree. +// +// Returns: +// True if any block was removed. +// +bool Compiler::fgRemoveBlocksOutsideDfsTree() +{ + if (m_dfsTree->GetPostOrderCount() == fgBBcount) { + return false; + } + #ifdef DEBUG - if (verbose) + if (verbose) + { + printf("%u/%u blocks are unreachable and will be removed:\n", fgBBcount - m_dfsTree->GetPostOrderCount(), + fgBBcount); + for (BasicBlock* block : Blocks()) { - printf("%u/%u blocks are unreachable and will be removed:\n", fgBBcount - m_dfsTree->GetPostOrderCount(), - fgBBcount); - for (BasicBlock* block : Blocks()) + if (!m_dfsTree->Contains(block)) { - if (!m_dfsTree->Contains(block)) - { - printf(" " FMT_BB "\n", block->bbNum); - } + printf(" " FMT_BB "\n", block->bbNum); } } + } #endif // DEBUG - // The DFS we run is not precise around call-finally, so - // `fgRemoveUnreachableBlocks` can expose newly unreachable blocks - // that we did not uncover during the DFS. If we did remove any - // call-finally blocks then iterate to closure. This is a very rare - // case. - while (true) - { - bool anyCallFinallyPairs = false; - fgRemoveUnreachableBlocks([=, &anyCallFinallyPairs](BasicBlock* block) { - if (!m_dfsTree->Contains(block)) - { - anyCallFinallyPairs |= block->isBBCallFinallyPair(); - return true; - } - - return false; - }); - - if (!anyCallFinallyPairs) + // The DFS we run is not precise around call-finally, so + // `fgRemoveUnreachableBlocks` can expose newly unreachable blocks + // that we did not uncover during the DFS. If we did remove any + // call-finally blocks then iterate to closure. This is a very rare + // case. + while (true) + { + bool anyCallFinallyPairs = false; + fgRemoveUnreachableBlocks([=, &anyCallFinallyPairs](BasicBlock* block) { + if (!m_dfsTree->Contains(block)) { - break; + anyCallFinallyPairs |= block->isBBCallFinallyPair(); + return true; } - m_dfsTree = fgComputeDfs(); + return false; + }); + + if (!anyCallFinallyPairs) + { + break; } + m_dfsTree = fgComputeDfs(); + } + #ifdef DEBUG - // Did we actually remove all the blocks we said we were going to? - if (verbose) + // Did we actually remove all the blocks we said we were going to? + if (verbose) + { + if (m_dfsTree->GetPostOrderCount() != fgBBcount) { - if (m_dfsTree->GetPostOrderCount() != fgBBcount) + printf("%u unreachable blocks were not removed:\n", fgBBcount - m_dfsTree->GetPostOrderCount()); + for (BasicBlock* block : Blocks()) { - printf("%u unreachable blocks were not removed:\n", fgBBcount - m_dfsTree->GetPostOrderCount()); - for (BasicBlock* block : Blocks()) + if (!m_dfsTree->Contains(block)) { - if (!m_dfsTree->Contains(block)) - { - printf(" " FMT_BB "\n", block->bbNum); - } + printf(" " FMT_BB "\n", block->bbNum); } } } -#endif // DEBUG - - status = PhaseStatus::MODIFIED_EVERYTHING; } +#endif // DEBUG - return status; + return true; } //------------------------------------------------------------- diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 2f155fad0a59b..3fd18d3e8ae38 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -1295,6 +1295,8 @@ GenTree* Lowering::LowerSwitch(GenTree* node) switchBBRange.Remove(node->AsOp()->gtOp1); switchBBRange.Remove(node); + comp->fgInvalidateDfsTree(); + return next; } @@ -7699,9 +7701,17 @@ PhaseStatus Lowering::DoPhase() const bool setSlotNumbers = false; comp->lvaComputeRefCounts(isRecompute, setSlotNumbers); - // Remove dead blocks and compute DFS (we want to remove unreachable blocks - // even in MinOpts). - comp->fgDfsBlocksAndRemove(); + if (comp->m_dfsTree == nullptr) + { + // Compute DFS tree. We want to remove dead blocks even in MinOpts, so we + // do this everywhere. The dead blocks are removed below, however, some of + // lowering may use the DFS tree, so we compute that here. + comp->m_dfsTree = comp->fgComputeDfs(); + } + + // Remove dead blocks. We want to remove unreachable blocks even in + // MinOpts. + comp->fgRemoveBlocksOutsideDfsTree(); if (comp->backendRequiresLocalVarLifetimes()) { @@ -9523,6 +9533,17 @@ bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indi } } + // For some hardware the ldr -> ldp transformation can result in missed + // store-to-load forwarding opportunities, which can seriously harm + // performance. Here we do a best effort check to see if one of the loads + // we are combining may be loading from a store that reaches the load + // without redefining the address. + if (prevIndir->OperIsLoad() && indir->OperIsLoad() && IsStoreToLoadForwardingCandidateInLoop(prevIndir, indir)) + { + JITDUMP("Avoiding making indirs adjacent; this may be the target of a store-to-load forwarding candidate\n"); + return false; + } + JITDUMP("Moving nodes that are not part of data flow of [%06u]\n\n", Compiler::dspTreeID(indir)); GenTree* previous = prevIndir; @@ -9569,6 +9590,183 @@ bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indi return true; } +//------------------------------------------------------------------------ +// IsStoreToLoadForwardingCandidateInLoop: Check if one of the specified +// indirections may be the target of store-to-load forwarding from an indirect +// store that reaches one of the loads and that happens within the same loop. +// In those cases the transformation to 'ldp' can break this hardware +// optimization for some hardware. +// +// Arguments: +// prevIndir - First indirection +// indir - Second indirection +// +// Returns: +// True if so. +// +bool Lowering::IsStoreToLoadForwardingCandidateInLoop(GenTreeIndir* prevIndir, GenTreeIndir* indir) +{ + if (comp->m_dfsTree == nullptr) + { + comp->m_dfsTree = comp->fgComputeDfs(); + } + + if (!comp->m_dfsTree->HasCycle()) + { + return false; + } + + if (comp->m_loops == nullptr) + { + comp->m_loops = FlowGraphNaturalLoops::Find(comp->m_dfsTree); + comp->m_blockToLoop = BlockToNaturalLoopMap::Build(comp->m_loops); + } + + FlowGraphNaturalLoop* loop = comp->m_blockToLoop->GetLoop(m_block); + if (loop == nullptr) + { + return false; + } + + GenTree* addr1 = prevIndir->Addr(); + target_ssize_t offs1; + comp->gtPeelOffsets(&addr1, &offs1); + unsigned lcl1 = addr1->OperIs(GT_LCL_VAR) ? addr1->AsLclVarCommon()->GetLclNum() : BAD_VAR_NUM; + + GenTree* addr2 = indir->Addr(); + target_ssize_t offs2; + comp->gtPeelOffsets(&addr2, &offs2); + unsigned lcl2 = addr1->OperIs(GT_LCL_VAR) ? addr2->AsLclVarCommon()->GetLclNum() : BAD_VAR_NUM; + + unsigned budget = 100; + + // Starting at an end node, go backwards until the specified first node and look for + // 1) Definitions of the base address local, which invalidates future store-to-load forwarding + // 2) Stores to the same address as is being loaded later, which allows store-to-load forwarding + auto checkNodes = [=, &budget](GenTree* lastNode, GenTree* firstNode, bool* hasStore, bool* hasDef) { + *hasStore = false; + *hasDef = false; + + for (GenTree* curNode = lastNode;; curNode = curNode->gtPrev) + { + if (curNode->OperIs(GT_STORE_LCL_VAR)) + { + unsigned lclNum = curNode->AsLclVarCommon()->GetLclNum(); + if ((lclNum == lcl1) || (lclNum == lcl2)) + { + *hasDef = true; + return true; + } + } + else if (curNode->OperIs(GT_STOREIND)) + { + GenTreeIndir* storeInd = curNode->AsIndir(); + GenTree* storeIndirAddr = storeInd->Addr(); + target_ssize_t storeIndirOffs; + comp->gtPeelOffsets(&storeIndirAddr, &storeIndirOffs); + + if (storeIndirAddr->OperIs(GT_LCL_VAR) && ((storeIndirOffs == offs1) || (storeIndirOffs == offs2))) + { + unsigned storeIndirAddrLcl = storeIndirAddr->AsLclVarCommon()->GetLclNum(); + if ((storeIndirAddrLcl == lcl1) || (storeIndirAddrLcl == lcl2)) + { + JITDUMP("Store at [%06u] may allow store-to-load forwarding of indir [%06u]\n", + Compiler::dspTreeID(curNode), + Compiler::dspTreeID(storeIndirAddrLcl == lcl1 ? prevIndir : indir)); + + *hasStore = true; + return true; + } + } + } + + if (curNode == firstNode) + { + break; + } + + if (--budget == 0) + { + return false; + } + } + + return true; + }; + + bool hasStore; + bool hasDef; + if (!checkNodes(prevIndir, LIR::AsRange(m_block).FirstNode(), &hasStore, &hasDef)) + { + // Out of budget + return false; + } + + if (hasStore) + { + // Have a store before the indir; it could be store-to-load forwarded. + return true; + } + + if (hasDef) + { + // Have a def before the indir; it would break store-to-load + // forwarding. No preds to push then, so we are done. + return false; + } + + // Now we've checked range before the indirs; continue with its preds + // inside the loop. We will check the range after the indirs once we get to + // it. + BitVecTraits traits = comp->m_dfsTree->PostOrderTraits(); + BitVec visited(BitVecOps::MakeEmpty(&traits)); + + ArrayStack stack(comp->getAllocator(CMK_ArrayStack)); + + auto pushPreds = [=, &traits, &visited, &stack](BasicBlock* block) { + for (BasicBlock* pred : block->PredBlocks()) + { + if (loop->ContainsBlock(pred) && BitVecOps::TryAddElemD(&traits, visited, pred->bbPostorderNum)) + { + stack.Push(pred); + } + } + }; + + pushPreds(m_block); + + while (stack.Height() > 0) + { + BasicBlock* block = stack.Pop(); + + LIR::Range& range = LIR::AsRange(block); + + GenTree* firstNode = block == m_block ? prevIndir : range.FirstNode(); + + if ((firstNode != nullptr) && !checkNodes(range.LastNode(), firstNode, &hasStore, &hasDef)) + { + // Out of budget + return false; + } + + if (hasStore) + { + // This would be store-to-load forwardable. + return true; + } + + if (hasDef) + { + // Redefinition of base local; skip pushing preds + continue; + } + + pushPreds(block); + } + + return false; +} + //------------------------------------------------------------------------ // MarkTree: Mark trees involved in the computation of 'node' recursively. // diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index ccfe965ed6d88..bff33917f28da 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -353,6 +353,7 @@ class Lowering final : public Phase GenTree* LowerIndir(GenTreeIndir* ind); bool OptimizeForLdpStp(GenTreeIndir* ind); bool TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indir); + bool IsStoreToLoadForwardingCandidateInLoop(GenTreeIndir* prevIndir, GenTreeIndir* indir); bool TryMoveAddSubRMWAfterIndir(GenTreeLclVarCommon* store); bool TryMakeIndirAndStoreAdjacent(GenTreeIndir* prevIndir, GenTreeLclVarCommon* store); void MarkTree(GenTree* root);