From a0e3e4df3746fda4d4955f056a94236538518707 Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Tue, 29 Oct 2024 17:09:46 +0100 Subject: [PATCH 1/7] JIT: Make loop inversion graph based Rewrite loop inversion to be graph based and to use the new loop representation. --- src/coreclr/jit/compiler.cpp | 8 +- src/coreclr/jit/compiler.h | 2 +- src/coreclr/jit/fgdiagnostic.cpp | 7 +- src/coreclr/jit/optimizer.cpp | 524 +++++++++---------------------- 4 files changed, 166 insertions(+), 375 deletions(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index bdb8b686a176f..3059cf00145c5 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -4869,10 +4869,6 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl if (opts.OptimizationEnabled()) { - // Invert loops - // - DoPhase(this, PHASE_INVERT_LOOPS, &Compiler::optInvertLoops); - // Run some flow graph optimizations (but don't reorder) // DoPhase(this, PHASE_OPTIMIZE_FLOW, &Compiler::optOptimizeFlow); @@ -4900,6 +4896,10 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl // DoPhase(this, PHASE_SET_BLOCK_WEIGHTS, &Compiler::optSetBlockWeights); + // Invert loops + // + DoPhase(this, PHASE_INVERT_LOOPS, &Compiler::optInvertLoops); + // Clone loops with optimization opportunities, and choose one based on dynamic condition evaluation. // DoPhase(this, PHASE_CLONE_LOOPS, &Compiler::optCloneLoops); diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 56410dfd3035a..7989ef632bdef 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -7145,7 +7145,7 @@ class Compiler OptInvertCountTreeInfoType optInvertCountTreeInfo(GenTree* tree); - bool optInvertWhileLoop(BasicBlock* block); + bool optTryInvertWhileLoop(FlowGraphNaturalLoop* loop); bool optIfConvert(BasicBlock* block); private: diff --git a/src/coreclr/jit/fgdiagnostic.cpp b/src/coreclr/jit/fgdiagnostic.cpp index c355d02c558c3..745c7d987780c 100644 --- a/src/coreclr/jit/fgdiagnostic.cpp +++ b/src/coreclr/jit/fgdiagnostic.cpp @@ -4714,7 +4714,12 @@ void Compiler::fgDebugCheckLoops() loop->VisitRegularExitBlocks([=](BasicBlock* exit) { for (BasicBlock* pred : exit->PredBlocks()) { - assert(loop->ContainsBlock(pred)); + if (!loop->ContainsBlock(pred)) + { + JITDUMP("Loop " FMT_LP " exit " FMT_BB " has non-loop predecessor " FMT_BB "\n", + loop->GetIndex(), exit->bbNum, pred->bbNum); + assert(!"Loop exit has non-loop predecessor"); + } } return BasicBlockVisit::Continue; }); diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index 161478d63f0c7..4d625eb300637 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -1850,13 +1850,11 @@ Compiler::OptInvertCountTreeInfoType Compiler::optInvertCountTreeInfo(GenTree* t } //----------------------------------------------------------------------------- -// optInvertWhileLoop: modify flow and duplicate code so that for/while loops are +// optTryInvertWhileLoop: modify flow and duplicate code so that for/while loops are // entered at top and tested at bottom (aka loop rotation or bottom testing). // Creates a "zero trip test" condition which guards entry to the loop. // Enables loop invariant hoisting and loop cloning, which depend on -// `do {} while` format loops. Enables creation of a pre-header block after the -// zero trip test to place code that only runs if the loop is guaranteed to -// run at least once. +// `do {} while` format loops. 
// // Arguments: // block -- block that may be the predecessor of the un-rotated loop's test block. @@ -1864,156 +1862,95 @@ Compiler::OptInvertCountTreeInfoType Compiler::optInvertCountTreeInfo(GenTree* t // Returns: // true if any IR changes possibly made (used to determine phase return status) // -// Notes: -// Uses a simple lexical screen to detect likely loops. -// -// Specifically, we're looking for the following case: -// -// block: -// ... -// jmp test // `block` argument -// top: -// ... -// ... -// test: -// ..stmts.. -// cond -// jtrue top -// -// If we find this, and the condition is simple enough, we change -// the loop to the following: -// -// block: -// ... -// jmp bNewCond -// bNewCond: -// ..stmts.. // duplicated cond block statements -// cond // duplicated cond -// jfalse join -// // else fall-through -// top: -// ... -// ... -// test: -// ..stmts.. -// cond -// jtrue top -// join: -// -// Makes no changes if the flow pattern match fails. -// -// May not modify a loop if profile is unfavorable, if the cost of duplicating -// code is large (factoring in potential CSEs). -// -bool Compiler::optInvertWhileLoop(BasicBlock* block) +bool Compiler::optTryInvertWhileLoop(FlowGraphNaturalLoop* loop) { - assert(opts.OptimizationEnabled()); - assert(compCodeOpt() != SMALL_CODE); - - // Does the BB end with an unconditional jump? + // Should have preheaders at this point + assert(loop->EntryEdges().size() == 1); + BasicBlock* preheader = loop->EntryEdge(0)->getSourceBlock(); - if (!block->KindIs(BBJ_ALWAYS) || block->JumpsToNext()) - { - return false; - } + ArrayStack duplicatedBlocks(getAllocator(CMK_LoopOpt)); - if (block->HasFlag(BBF_KEEP_BBJ_ALWAYS)) + BasicBlock* condBlock = loop->GetHeader(); + while (true) { - // It can't be one of the ones we use for our exception magic - return false; - } + if (!BasicBlock::sameEHRegion(preheader, condBlock)) + { + JITDUMP("No loop-inversion for " FMT_LP + " since we could not find a condition block in the same EH region as the preheader\n", + loop->GetIndex()); + return false; + } - // Get hold of the jump target - BasicBlock* const bTest = block->GetTarget(); + duplicatedBlocks.Push(condBlock); - // Does the bTest consist of 'jtrue(cond) block' ? - if (!bTest->KindIs(BBJ_COND)) - { - return false; - } + if (condBlock->KindIs(BBJ_ALWAYS)) + { + condBlock = condBlock->GetTarget(); - // bTest must be a backwards jump to block->bbNext - // This will be the top of the loop. - // - BasicBlock* const bTop = bTest->GetTrueTarget(); + if (!loop->ContainsBlock(condBlock) || (condBlock == loop->GetHeader())) + { + JITDUMP("No loop-inversion for " FMT_LP "; ran out of blocks following BBJ_ALWAYS blocks\n", + loop->GetIndex()); + return false; + } - if (!block->NextIs(bTop)) - { - return false; - } + continue; + } - // Since bTest is a BBJ_COND it will have a false target - // - BasicBlock* const bJoin = bTest->GetFalseTarget(); - noway_assert(bJoin != nullptr); + if (!condBlock->KindIs(BBJ_COND)) + { + JITDUMP("No loop-inversion for " FMT_LP " since we could not find any BBJ_COND block\n", loop->GetIndex()); + return false; + } - // 'block' must be in the same try region as the condition, since we're going to insert a duplicated condition - // in a new block after 'block', and the condition might include exception throwing code. - // On non-funclet platforms (x86), the catch exit is a BBJ_ALWAYS, but we don't want that to - // be considered as the head of a loop, so also disallow different handler regions. 
- if (!BasicBlock::sameEHRegion(block, bTest)) - { - return false; + break; } - // The duplicated condition block will branch to bTest->GetFalseTarget(), so that also better be in the - // same try region (or no try region) to avoid generating illegal flow. - if (bJoin->hasTryIndex() && !BasicBlock::sameTryRegion(block, bJoin)) - { - return false; - } + bool trueExits = !loop->ContainsBlock(condBlock->GetTrueTarget()); + bool falseExits = !loop->ContainsBlock(condBlock->GetFalseTarget()); - // It has to be a forward jump. Defer this check until after all the cheap checks - // are done, since it iterates forward in the block list looking for block's target. - // TODO-CQ: Check if we can also optimize the backwards jump as well. - // - if (!fgIsForwardBranch(block, block->GetTarget())) + if (trueExits == falseExits) { + JITDUMP("No loop-inversion for " FMT_LP " since we could not find any exiting BBJ_COND block\n", + loop->GetIndex()); return false; } - // Find the loop termination test at the bottom of the loop. - Statement* const condStmt = bTest->lastStmt(); + BasicBlock* exit = trueExits ? condBlock->GetTrueTarget() : condBlock->GetFalseTarget(); - // Verify the test block ends with a conditional that we can manipulate. - GenTree* const condTree = condStmt->GetRootNode(); - noway_assert(condTree->gtOper == GT_JTRUE); - if (!condTree->AsOp()->gtOp1->OperIsCompare()) + // Exiting the loop may enter a new try-region. However, to keep exits canonical, we will + // have to split the exit such that old loop edges exit to one half, while the duplicated condition + // exits to the other half. This will result in jump into the middle of a try-region, which is illegal. + // TODO: We can fix this by placing the first half of the split (which will be an empty block) outside + // the try region. + if (!BasicBlock::sameEHRegion(preheader, exit)) { + JITDUMP("No loop-inversion for " FMT_LP " since the preheader " FMT_BB " and exit " FMT_BB + " are in different EH regions\n", + loop->GetIndex(), preheader->bbNum, exit->bbNum); return false; } - JITDUMP("Matched flow pattern for loop inversion: block " FMT_BB " bTop " FMT_BB " bTest " FMT_BB "\n", - block->bbNum, bTop->bbNum, bTest->bbNum); - - // Estimate the cost of cloning the entire test block. - // - // Note: it would help throughput to compute the maximum cost - // first and early out for large bTest blocks, as we are doing two - // tree walks per tree. But because of this helper call scan, the - // maximum cost depends on the trees in the block. - // - // We might consider flagging blocks with hoistable helper calls - // during importation, so we can avoid the helper search and - // implement an early bail out for large blocks with no helper calls. - // - // Note that gtPrepareCost can cause operand swapping, so we must - // return `true` (possible IR change) from here on. 
+ JITDUMP("Condition in block " FMT_BB " of loop " FMT_LP " is a candidate for duplication to invert the loop\n", + condBlock->bbNum, loop->GetIndex()); unsigned estDupCostSz = 0; - for (Statement* const stmt : bTest->Statements()) + for (int i = 0; i < duplicatedBlocks.Height(); i++) { - GenTree* tree = stmt->GetRootNode(); - gtPrepareCost(tree); - estDupCostSz += tree->GetCostSz(); + BasicBlock* block = duplicatedBlocks.Bottom(i); + for (Statement* stmt : block->Statements()) + { + GenTree* tree = stmt->GetRootNode(); + gtPrepareCost(tree); + estDupCostSz += tree->GetCostSz(); + } } weight_t loopIterations = BB_LOOP_WEIGHT_SCALE; bool allProfileWeightsAreValid = false; - weight_t const weightBlock = block->bbWeight; - weight_t const weightTest = bTest->bbWeight; - weight_t const weightTop = bTop->bbWeight; + weight_t const weightPreheader = preheader->bbWeight; + weight_t const weightCond = condBlock->bbWeight; // If we have profile data then we calculate the number of times // the loop will iterate into loopIterations @@ -2021,46 +1958,19 @@ bool Compiler::optInvertWhileLoop(BasicBlock* block) { // Only rely upon the profile weight when all three of these blocks // have good profile weights - if (block->hasProfileWeight() && bTest->hasProfileWeight() && bTop->hasProfileWeight()) + if (preheader->hasProfileWeight() && condBlock->hasProfileWeight()) { // If this while loop never iterates then don't bother transforming // - if (weightTop == BB_ZERO_WEIGHT) + if (weightPreheader == BB_ZERO_WEIGHT) { - return true; - } - - // We generally expect weightTest > weightTop - // - // Tolerate small inconsistencies... - // - if (!fgProfileWeightsConsistent(weightBlock + weightTop, weightTest)) - { - JITDUMP("Profile weights locally inconsistent: block " FMT_WT ", next " FMT_WT ", test " FMT_WT "\n", - weightBlock, weightTop, weightTest); + JITDUMP("No loop-inversion for " FMT_LP " since the the preheader " FMT_BB " has 0 weight\n", + loop->GetIndex(), preheader->bbNum); + return false; } - else - { - allProfileWeightsAreValid = true; - - // Determine average iteration count - // - // weightTop is the number of time this loop executes - // weightTest is the number of times that we consider entering or remaining in the loop - // loopIterations is the average number of times that this loop iterates - // - weight_t loopEntries = weightTest - weightTop; - - // If profile is inaccurate, try and use other data to provide a credible estimate. - // The value should at least be >= weightBlock. - // - if (loopEntries < weightBlock) - { - loopEntries = weightBlock; - } - loopIterations = weightTop / loopEntries; - } + loopIterations = weightCond / weightPreheader; + allProfileWeightsAreValid = true; } else { @@ -2085,9 +1995,7 @@ bool Compiler::optInvertWhileLoop(BasicBlock* block) } } - // If the compare has too high cost then we don't want to dup. - - bool costIsTooHigh = (estDupCostSz > maxDupCostSz); + bool costIsTooHigh = estDupCostSz > maxDupCostSz; OptInvertCountTreeInfoType optInvertTotalInfo = {}; if (costIsTooHigh) @@ -2101,32 +2009,37 @@ bool Compiler::optInvertWhileLoop(BasicBlock* block) // // If the condition has array.Length operations, also boost, as they are likely to be CSE'd. 
- for (Statement* const stmt : bTest->Statements()) + for (int i = 0; i < duplicatedBlocks.Height() && costIsTooHigh; i++) { - GenTree* tree = stmt->GetRootNode(); - - OptInvertCountTreeInfoType optInvertInfo = optInvertCountTreeInfo(tree); - optInvertTotalInfo.sharedStaticHelperCount += optInvertInfo.sharedStaticHelperCount; - optInvertTotalInfo.arrayLengthCount += optInvertInfo.arrayLengthCount; - - if ((optInvertInfo.sharedStaticHelperCount > 0) || (optInvertInfo.arrayLengthCount > 0)) + BasicBlock* block = duplicatedBlocks.Bottom(i); + for (Statement* const stmt : block->Statements()) { - // Calculate a new maximum cost. We might be able to early exit. + GenTree* tree = stmt->GetRootNode(); - unsigned newMaxDupCostSz = - maxDupCostSz + 24 * min(optInvertTotalInfo.sharedStaticHelperCount, (int)(loopIterations + 1.5)) + - 8 * optInvertTotalInfo.arrayLengthCount; + OptInvertCountTreeInfoType optInvertInfo = optInvertCountTreeInfo(tree); + optInvertTotalInfo.sharedStaticHelperCount += optInvertInfo.sharedStaticHelperCount; + optInvertTotalInfo.arrayLengthCount += optInvertInfo.arrayLengthCount; - // Is the cost too high now? - costIsTooHigh = (estDupCostSz > newMaxDupCostSz); - if (!costIsTooHigh) + if ((optInvertInfo.sharedStaticHelperCount > 0) || (optInvertInfo.arrayLengthCount > 0)) { - // No need counting any more trees; we're going to do the transformation. - JITDUMP("Decided to duplicate loop condition block after counting helpers in tree [%06u] in " - "block " FMT_BB, - dspTreeID(tree), bTest->bbNum); - maxDupCostSz = newMaxDupCostSz; // for the JitDump output below - break; + // Calculate a new maximum cost. We might be able to early exit. + + unsigned newMaxDupCostSz = + maxDupCostSz + + 24 * min(optInvertTotalInfo.sharedStaticHelperCount, (int)(loopIterations + 1.5)) + + 8 * optInvertTotalInfo.arrayLengthCount; + + // Is the cost too high now? + costIsTooHigh = (estDupCostSz > newMaxDupCostSz); + if (!costIsTooHigh) + { + // No need counting any more trees; we're going to do the transformation. + JITDUMP("Decided to duplicate loop condition block after counting helpers in tree [%06u] in " + "block " FMT_BB, + dspTreeID(tree), block->bbNum); + maxDupCostSz = newMaxDupCostSz; // for the JitDump output below + break; + } } } } @@ -2140,7 +2053,7 @@ bool Compiler::optInvertWhileLoop(BasicBlock* block) printf( "\nDuplication of loop condition [%06u] is %s, because the cost of duplication (%i) is %s than %i," "\n loopIterations = %7.3f, optInvertTotalInfo.sharedStaticHelperCount >= %d, validProfileWeights = %s\n", - dspTreeID(condTree), costIsTooHigh ? "not done" : "performed", estDupCostSz, + dspTreeID(condBlock->lastStmt()->GetRootNode()), costIsTooHigh ? "not done" : "performed", estDupCostSz, costIsTooHigh ? "greater" : "less or equal", maxDupCostSz, loopIterations, optInvertTotalInfo.sharedStaticHelperCount, dspBool(allProfileWeightsAreValid)); } @@ -2151,182 +2064,56 @@ bool Compiler::optInvertWhileLoop(BasicBlock* block) return true; } - bool foundCondTree = false; - - // Create a new block after `block` to put the copied condition code. - // - BasicBlock* const bNewCond = fgNewBBafter(BBJ_COND, block, /*extendRegion*/ true); - - // Clone each statement in bTest and append to bNewCond. 
- for (Statement* const stmt : bTest->Statements()) - { - GenTree* originalTree = stmt->GetRootNode(); - GenTree* clonedTree = gtCloneExpr(originalTree); - - // Special case handling needed for the conditional jump tree - if (originalTree == condTree) - { - foundCondTree = true; - - // Get the compare subtrees - GenTree* originalCompareTree = originalTree->AsOp()->gtOp1; - GenTree* clonedCompareTree = clonedTree->AsOp()->gtOp1; - assert(originalCompareTree->OperIsCompare()); - assert(clonedCompareTree->OperIsCompare()); - - // The original test branches to remain in the loop. The - // new cloned test will branch to avoid the loop. So the - // cloned compare needs to reverse the branch condition. - gtReverseCond(clonedCompareTree); - } + // Split the preheader so we can duplicate the statements into it. The new + // block will be a BBJ_COND node. + BasicBlock* newCond = fgSplitBlockAtEnd(preheader); - Statement* clonedStmt = fgNewStmtAtEnd(bNewCond, clonedTree); + // Split the new block once more to create a proper preheader, so we end up + // with preheader (always) -> newCond (cond) -> newPreheader (always) -> header + BasicBlock* newPreheader = fgSplitBlockAtEnd(newCond); - if (opts.compDbgInfo) - { - clonedStmt->SetDebugInfo(stmt->GetDebugInfo()); - } - } + // Make sure exit stays canonical + BasicBlock* nonEnterBlock = fgSplitBlockAtBeginning(exit); - assert(foundCondTree); + JITDUMP("New preheader is " FMT_BB "\n", newPreheader->bbNum); + JITDUMP("Duplicated condition block is " FMT_BB "\n", newCond->bbNum); + JITDUMP("Old exit is " FMT_BB ", new non-enter block is " FMT_BB "\n", exit->bbNum, nonEnterBlock->bbNum); - // Flag the block that received the copy as potentially having various constructs. - bNewCond->CopyFlags(bTest, BBF_COPY_PROPAGATE); + // Get the newCond -> newPreheader edge + FlowEdge* newCondToNewPreheader = newCond->GetTargetEdge(); - // Fix flow and profile - // - bNewCond->inheritWeight(block); + // Add newCond -> nonEnterBlock + FlowEdge* newCondToNewExit = fgAddRefPred(nonEnterBlock, newCond); - if (allProfileWeightsAreValid) - { - weight_t const delta = weightTest - weightTop; + newCond->SetCond(trueExits ? newCondToNewExit : newCondToNewPreheader, + trueExits ? newCondToNewPreheader : newCondToNewExit); - // If there is just one outside edge incident on bTest, then ideally delta == block->bbWeight. - // But this might not be the case if profile data is inconsistent. - // - // And if bTest has multiple outside edges we want to account for the weight of them all. - // - if (delta > block->bbWeight) - { - bNewCond->setBBProfileWeight(delta); - } - } - - // Update pred info - // - // For now we set the likelihood of the newCond branch to match - // the likelihood of the test branch (though swapped, since we're - // currently reversing the condition). This may or may not match - // the block weight adjustments we're making. All this becomes - // easier to reconcile once we rely on edge likelihoods more and - // have synthesis running (so block weights ==> frequencies). - // - // Until then we won't worry that edges and blocks are potentially - // out of sync. 
- // - FlowEdge* const testTopEdge = bTest->GetTrueEdge(); - FlowEdge* const testJoinEdge = bTest->GetFalseEdge(); - FlowEdge* const newCondJoinEdge = fgAddRefPred(bJoin, bNewCond, testJoinEdge); - FlowEdge* const newCondTopEdge = fgAddRefPred(bTop, bNewCond, testTopEdge); - - bNewCond->SetTrueEdge(newCondJoinEdge); - bNewCond->SetFalseEdge(newCondTopEdge); - - fgRedirectTargetEdge(block, bNewCond); - assert(block->JumpsToNext()); - - // Move all predecessor edges that look like loop entry edges to point to the new cloned condition - // block, not the existing condition block. The idea is that if we only move `block` to point to - // `bNewCond`, but leave other `bTest` predecessors still pointing to `bTest`, when we eventually - // recognize loops, the loop will appear to have multiple entries, which will prevent optimization. - // We don't have loops yet, but blocks should be in increasing lexical numbered order, so use that - // as the proxy for predecessors that are "in" versus "out" of the potential loop. Note that correctness - // is maintained no matter which condition block we point to, but we'll lose optimization potential - // (and create spaghetti code) if we get it wrong. - // - unsigned const loopFirstNum = bTop->bbNum; - unsigned const loopBottomNum = bTest->bbNum; - for (BasicBlock* const predBlock : bTest->PredBlocksEditing()) - { - unsigned const bNum = predBlock->bbNum; - if ((loopFirstNum <= bNum) && (bNum <= loopBottomNum)) - { - // Looks like the predecessor is from within the potential loop; skip it. - continue; - } + newCond->GetTrueEdge()->setLikelihood(condBlock->GetTrueEdge()->getLikelihood()); + newCond->GetFalseEdge()->setLikelihood(condBlock->GetFalseEdge()->getLikelihood()); - // Redirect the predecessor to the new block. - JITDUMP("Redirecting non-loop " FMT_BB " -> " FMT_BB " to " FMT_BB " -> " FMT_BB "\n", predBlock->bbNum, - bTest->bbNum, predBlock->bbNum, bNewCond->bbNum); + BasicBlock* newHeader = trueExits ? condBlock->GetFalseTarget() : condBlock->GetTrueTarget(); - switch (predBlock->GetKind()) - { - case BBJ_ALWAYS: - case BBJ_CALLFINALLY: - case BBJ_CALLFINALLYRET: - case BBJ_COND: - case BBJ_SWITCH: - case BBJ_EHFINALLYRET: - fgReplaceJumpTarget(predBlock, bTest, bNewCond); - break; + // Add newPreheader -> newHeader + FlowEdge* newPreheaderToNewHeader = fgAddRefPred(newHeader, newPreheader, newPreheader->GetTargetEdge()); - case BBJ_EHCATCHRET: - case BBJ_EHFILTERRET: - // These block types should not need redirecting - break; + // Remove newPreheader -> header + fgRemoveRefPred(newPreheader->GetTargetEdge()); - default: - assert(!"Unexpected bbKind for predecessor block"); - break; - } - } + // Update newPreheader to point to newHeader + newPreheader->SetTargetEdge(newPreheaderToNewHeader); - // If we have profile data for all blocks and we know that we are cloning the - // `bTest` block into `bNewCond` and thus changing the control flow from `block` so - // that it no longer goes directly to `bTest` anymore, we have to adjust - // various weights. - // - if (allProfileWeightsAreValid) + // Duplicate all the code now + for (int i = 0; i < duplicatedBlocks.Height(); i++) { - // Update the weight for bTest. Normally, this reduces the weight of the bTest, except in odd - // cases of stress modes with inconsistent weights. 
- // - JITDUMP("Reducing profile weight of " FMT_BB " from " FMT_WT " to " FMT_WT "\n", bTest->bbNum, weightTest, - weightTop); - bTest->inheritWeight(bTop); - -#ifdef DEBUG - // If we're checking profile data, see if profile for the two target blocks is consistent. - // - if ((activePhaseChecks & PhaseChecks::CHECK_PROFILE) == PhaseChecks::CHECK_PROFILE) + BasicBlock* block = duplicatedBlocks.Bottom(i); + for (Statement* stmt : block->Statements()) { - if (JitConfig.JitProfileChecks() > 0) - { - const ProfileChecks checks = (ProfileChecks)JitConfig.JitProfileChecks(); - const bool nextProfileOk = fgDebugCheckIncomingProfileData(bNewCond->GetFalseTarget(), checks); - const bool jumpProfileOk = fgDebugCheckIncomingProfileData(bNewCond->GetTrueTarget(), checks); - - if (hasFlag(checks, ProfileChecks::RAISE_ASSERT)) - { - assert(nextProfileOk); - assert(jumpProfileOk); - } - } + GenTree* clonedTree = gtCloneExpr(stmt->GetRootNode()); + fgNewStmtAtEnd(newCond, clonedTree, stmt->GetDebugInfo()); } -#endif // DEBUG - } - -#ifdef DEBUG - if (verbose) - { - printf("\nDuplicated loop exit block at " FMT_BB " for loop (" FMT_BB " - " FMT_BB ")\n", bNewCond->bbNum, - bNewCond->GetFalseTarget()->bbNum, bTest->bbNum); - printf("Estimated code size expansion is %d\n", estDupCostSz); - fgDumpBlock(bNewCond); - fgDumpBlock(bTest); + newCond->CopyFlags(block, BBF_COPY_PROPAGATE); } -#endif // DEBUG return true; } @@ -2339,9 +2126,6 @@ bool Compiler::optInvertWhileLoop(BasicBlock* block) // PhaseStatus Compiler::optInvertLoops() { - noway_assert(opts.OptimizationEnabled()); - noway_assert(fgModified == false); - #if defined(OPT_CONFIG) if (!JitConfig.JitDoLoopInversion()) { @@ -2350,37 +2134,39 @@ PhaseStatus Compiler::optInvertLoops() } #endif // OPT_CONFIG - bool madeChanges = fgRenumberBlocks(); + bool madeChanges = false; - if (compCodeOpt() == SMALL_CODE) - { - // do not invert any loops - } - else + if (compCodeOpt() != SMALL_CODE) { - for (BasicBlock* const block : Blocks()) - { - // Make sure the appropriate fields are initialized - // - if (block->bbWeight == BB_ZERO_WEIGHT) - { - // Zero weighted block can't have a LOOP_HEAD flag - noway_assert(block->isLoopHead() == false); - continue; - } + fgDumpFlowGraph(PHASE_INVERT_LOOPS, PhasePosition::PostPhase); - if (optInvertWhileLoop(block)) - { - madeChanges = true; - } + for (FlowGraphNaturalLoop* loop : m_loops->InPostOrder()) + { + madeChanges |= optTryInvertWhileLoop(loop); } } - if (fgModified) + if (madeChanges) { - // Reset fgModified here as we've done a consistent set of edits. - // - fgModified = false; + fgInvalidateDfsTree(); + m_dfsTree = fgComputeDfs(); + m_loops = FlowGraphNaturalLoops::Find(m_dfsTree); + + // In normal cases no further canonicalization will be required as we + // try to ensure loops stay canonicalized during inversion. However, + // duplicating the condition outside an inner loop can introduce new + // exits in the parent loop, so we may rarely need to recanonicalize + // exit blocks. + if (optCanonicalizeLoops()) + { + fgInvalidateDfsTree(); + m_dfsTree = fgComputeDfs(); + m_loops = FlowGraphNaturalLoops::Find(m_dfsTree); + } + + fgDebugCheckLoops(); + + fgRenumberBlocks(); } return madeChanges ? 
PhaseStatus::MODIFIED_EVERYTHING : PhaseStatus::MODIFIED_NOTHING; From 5df0255da6cec94cae87c13bc55b7e3078389013 Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Tue, 29 Oct 2024 17:14:00 +0100 Subject: [PATCH 2/7] Remove debug code --- src/coreclr/jit/optimizer.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index 4d625eb300637..504bd20c78149 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -2138,8 +2138,6 @@ PhaseStatus Compiler::optInvertLoops() if (compCodeOpt() != SMALL_CODE) { - fgDumpFlowGraph(PHASE_INVERT_LOOPS, PhasePosition::PostPhase); - for (FlowGraphNaturalLoop* loop : m_loops->InPostOrder()) { madeChanges |= optTryInvertWhileLoop(loop); From b75aea72359f31d151f25aa6396ccd7375cf9cbd Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Tue, 29 Oct 2024 17:34:45 +0100 Subject: [PATCH 3/7] Fix release build --- src/coreclr/jit/optimizer.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index 504bd20c78149..a2a15bbf88c74 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -2109,7 +2109,17 @@ bool Compiler::optTryInvertWhileLoop(FlowGraphNaturalLoop* loop) for (Statement* stmt : block->Statements()) { GenTree* clonedTree = gtCloneExpr(stmt->GetRootNode()); - fgNewStmtAtEnd(newCond, clonedTree, stmt->GetDebugInfo()); + Statement* clonedStmt = fgNewStmtAtEnd(newCond, clonedTree, stmt->GetDebugInfo()); + + if (stmt == condBlock->lastStmt()) + { + // TODO: This ought not to be necessary, but has large negative diffs if we don't do it + assert(clonedStmt->GetRootNode()->OperIs(GT_JTRUE)); + clonedStmt->GetRootNode()->AsUnOp()->gtOp1 = gtReverseCond(clonedStmt->GetRootNode()->gtGetOp1()); + newCond->SetCond(newCond->GetFalseEdge(), newCond->GetTrueEdge()); + } + + DISPSTMT(clonedStmt); } newCond->CopyFlags(block, BBF_COPY_PROPAGATE); @@ -2162,8 +2172,6 @@ PhaseStatus Compiler::optInvertLoops() m_loops = FlowGraphNaturalLoops::Find(m_dfsTree); } - fgDebugCheckLoops(); - fgRenumberBlocks(); } From d75f60e4e770912a63b6773362ea30d4bf6f29a7 Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Wed, 30 Oct 2024 15:41:55 +0100 Subject: [PATCH 4/7] Avoid inverting already-inverted loops, duplicate weight manipulation from old inversion --- src/coreclr/jit/optimizer.cpp | 124 +++++++++++++++++++++++++++++++--- 1 file changed, 114 insertions(+), 10 deletions(-) diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index a2a15bbf88c74..0c718e991a2ff 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -1917,6 +1917,14 @@ bool Compiler::optTryInvertWhileLoop(FlowGraphNaturalLoop* loop) } BasicBlock* exit = trueExits ? condBlock->GetTrueTarget() : condBlock->GetFalseTarget(); + BasicBlock* stayInLoopSucc = trueExits ? condBlock->GetFalseTarget() : condBlock->GetTrueTarget(); + + // If the condition is already a latch, then the loop is already inverted + if (stayInLoopSucc == loop->GetHeader()) + { + JITDUMP("No loop-inversion for " FMT_LP " since it is already inverted\n", loop->GetIndex()); + return false; + } // Exiting the loop may enter a new try-region. 
However, to keep exits canonical, we will // have to split the exit such that old loop edges exit to one half, while the duplicated condition @@ -1951,6 +1959,7 @@ bool Compiler::optTryInvertWhileLoop(FlowGraphNaturalLoop* loop) bool allProfileWeightsAreValid = false; weight_t const weightPreheader = preheader->bbWeight; weight_t const weightCond = condBlock->bbWeight; + weight_t const weightStayInLoopSucc = stayInLoopSucc->bbWeight; // If we have profile data then we calculate the number of times // the loop will iterate into loopIterations @@ -1958,19 +1967,48 @@ bool Compiler::optTryInvertWhileLoop(FlowGraphNaturalLoop* loop) { // Only rely upon the profile weight when all three of these blocks // have good profile weights - if (preheader->hasProfileWeight() && condBlock->hasProfileWeight()) + if (preheader->hasProfileWeight() && condBlock->hasProfileWeight() && stayInLoopSucc->hasProfileWeight()) { // If this while loop never iterates then don't bother transforming // - if (weightPreheader == BB_ZERO_WEIGHT) + if (weightStayInLoopSucc == BB_ZERO_WEIGHT) { - JITDUMP("No loop-inversion for " FMT_LP " since the the preheader " FMT_BB " has 0 weight\n", + JITDUMP("No loop-inversion for " FMT_LP " since the in-loop successor " FMT_BB " has 0 weight\n", loop->GetIndex(), preheader->bbNum); return false; } - loopIterations = weightCond / weightPreheader; - allProfileWeightsAreValid = true; + // We generally expect weightCond > weightStayInLoopSucc + // + // Tolerate small inconsistencies... + // + if (!fgProfileWeightsConsistent(weightPreheader + weightStayInLoopSucc, weightCond)) + { + JITDUMP("Profile weights locally inconsistent: preheader " FMT_WT ", stayInLoopSucc " FMT_WT ", cond " FMT_WT "\n", + weightPreheader, weightStayInLoopSucc, weightCond); + } + else + { + allProfileWeightsAreValid = true; + + // Determine average iteration count + // + // weightTop is the number of time this loop executes + // weightTest is the number of times that we consider entering or remaining in the loop + // loopIterations is the average number of times that this loop iterates + // + weight_t loopEntries = weightCond - weightStayInLoopSucc; + + // If profile is inaccurate, try and use other data to provide a credible estimate. + // The value should at least be >= weightBlock. + // + if (loopEntries < weightPreheader) + { + loopEntries = weightPreheader; + } + + loopIterations = weightStayInLoopSucc / loopEntries; + } } else { @@ -2068,6 +2106,21 @@ bool Compiler::optTryInvertWhileLoop(FlowGraphNaturalLoop* loop) // block will be a BBJ_COND node. BasicBlock* newCond = fgSplitBlockAtEnd(preheader); + if (allProfileWeightsAreValid) + { + weight_t const delta = weightCond - weightStayInLoopSucc; + + // If there is just one outside edge incident on bTest, then ideally delta == block->bbWeight. + // But this might not be the case if profile data is inconsistent. + // + // And if bTest has multiple outside edges we want to account for the weight of them all. 
+ // + if (delta > preheader->bbWeight) + { + newCond->setBBProfileWeight(delta); + } + } + // Split the new block once more to create a proper preheader, so we end up // with preheader (always) -> newCond (cond) -> newPreheader (always) -> header BasicBlock* newPreheader = fgSplitBlockAtEnd(newCond); @@ -2091,16 +2144,14 @@ bool Compiler::optTryInvertWhileLoop(FlowGraphNaturalLoop* loop) newCond->GetTrueEdge()->setLikelihood(condBlock->GetTrueEdge()->getLikelihood()); newCond->GetFalseEdge()->setLikelihood(condBlock->GetFalseEdge()->getLikelihood()); - BasicBlock* newHeader = trueExits ? condBlock->GetFalseTarget() : condBlock->GetTrueTarget(); - - // Add newPreheader -> newHeader - FlowEdge* newPreheaderToNewHeader = fgAddRefPred(newHeader, newPreheader, newPreheader->GetTargetEdge()); + // Add newPreheader -> stayInLoopSucc + FlowEdge* newPreheaderToInLoopSucc = fgAddRefPred(stayInLoopSucc, newPreheader, newPreheader->GetTargetEdge()); // Remove newPreheader -> header fgRemoveRefPred(newPreheader->GetTargetEdge()); // Update newPreheader to point to newHeader - newPreheader->SetTargetEdge(newPreheaderToNewHeader); + newPreheader->SetTargetEdge(newPreheaderToInLoopSucc); // Duplicate all the code now for (int i = 0; i < duplicatedBlocks.Height(); i++) @@ -2125,6 +2176,59 @@ bool Compiler::optTryInvertWhileLoop(FlowGraphNaturalLoop* loop) newCond->CopyFlags(block, BBF_COPY_PROPAGATE); } + // If we have profile data for all blocks and we know that we are cloning the + // `bTest` block into `bNewCond` and thus changing the control flow from `block` so + // that it no longer goes directly to `bTest` anymore, we have to adjust + // various weights. + // + if (allProfileWeightsAreValid) + { + // Update the weight for the duplicated blocks. Normally, this reduces + // the weight of condBlock, except in odd cases of stress modes with + // inconsistent weights. + // + for (int i = 0; i < duplicatedBlocks.Height(); i++) + { + BasicBlock* block = duplicatedBlocks.Bottom(i); + JITDUMP("Reducing profile weight of " FMT_BB " from " FMT_WT " to " FMT_WT "\n", block->bbNum, weightCond, + weightStayInLoopSucc); + block->inheritWeight(stayInLoopSucc); + } + +#ifdef DEBUG + // If we're checking profile data, see if profile for the two target blocks is consistent. 
+ // + if ((activePhaseChecks & PhaseChecks::CHECK_PROFILE) == PhaseChecks::CHECK_PROFILE) + { + if (JitConfig.JitProfileChecks() > 0) + { + const ProfileChecks checks = (ProfileChecks)JitConfig.JitProfileChecks(); + const bool nextProfileOk = fgDebugCheckIncomingProfileData(newCond->GetFalseTarget(), checks); + const bool jumpProfileOk = fgDebugCheckIncomingProfileData(newCond->GetTrueTarget(), checks); + + if (hasFlag(checks, ProfileChecks::RAISE_ASSERT)) + { + assert(nextProfileOk); + assert(jumpProfileOk); + } + } + } +#endif // DEBUG + } + +#ifdef DEBUG + if (verbose) + { + printf("\nDuplicated loop exit block at " FMT_BB " for loop " FMT_LP "\n", newCond->bbNum, + loop->GetIndex()); + printf("Estimated code size expansion is %d\n", estDupCostSz); + + fgDumpBlock(newCond); + fgDumpBlock(condBlock); + } +#endif // DEBUG + + return true; } From ddc0ea2f449ec303a80ef685f059ee6ce3c508fb Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Wed, 30 Oct 2024 16:00:31 +0100 Subject: [PATCH 5/7] Add a couple of quirks --- src/coreclr/jit/optimizer.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index 0c718e991a2ff..3198e2ea4cd8a 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -1926,6 +1926,20 @@ bool Compiler::optTryInvertWhileLoop(FlowGraphNaturalLoop* loop) return false; } + // TODO-Quirk: Remove this + if (preheader->JumpsToNext()) + { + JITDUMP("No loop inversion for " FMT_LP " since its preheader is already fallthrough\n", loop->GetIndex()); + return false; + } + + // TODO-Quirk: Remove this + if (!preheader->NextIs(stayInLoopSucc)) + { + JITDUMP("No loop inversion for " FMT_LP " since its preheader does not fall through to \"stayInLoopSucc\"\n", loop->GetIndex()); + return false; + } + // Exiting the loop may enter a new try-region. However, to keep exits canonical, we will // have to split the exit such that old loop edges exit to one half, while the duplicated condition // exits to the other half. This will result in jump into the middle of a try-region, which is illegal. From 99a432151c6173c41c118aa28716b64e0dc34c24 Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Wed, 30 Oct 2024 16:23:20 +0100 Subject: [PATCH 6/7] Run jit-format --- src/coreclr/jit/optimizer.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index 3198e2ea4cd8a..2d55c6c05abdb 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -1916,7 +1916,7 @@ bool Compiler::optTryInvertWhileLoop(FlowGraphNaturalLoop* loop) return false; } - BasicBlock* exit = trueExits ? condBlock->GetTrueTarget() : condBlock->GetFalseTarget(); + BasicBlock* exit = trueExits ? condBlock->GetTrueTarget() : condBlock->GetFalseTarget(); BasicBlock* stayInLoopSucc = trueExits ? 
condBlock->GetFalseTarget() : condBlock->GetTrueTarget(); // If the condition is already a latch, then the loop is already inverted @@ -1936,7 +1936,8 @@ bool Compiler::optTryInvertWhileLoop(FlowGraphNaturalLoop* loop) // TODO-Quirk: Remove this if (!preheader->NextIs(stayInLoopSucc)) { - JITDUMP("No loop inversion for " FMT_LP " since its preheader does not fall through to \"stayInLoopSucc\"\n", loop->GetIndex()); + JITDUMP("No loop inversion for " FMT_LP " since its preheader does not fall through to \"stayInLoopSucc\"\n", + loop->GetIndex()); return false; } @@ -1973,7 +1974,7 @@ bool Compiler::optTryInvertWhileLoop(FlowGraphNaturalLoop* loop) bool allProfileWeightsAreValid = false; weight_t const weightPreheader = preheader->bbWeight; weight_t const weightCond = condBlock->bbWeight; - weight_t const weightStayInLoopSucc = stayInLoopSucc->bbWeight; + weight_t const weightStayInLoopSucc = stayInLoopSucc->bbWeight; // If we have profile data then we calculate the number of times // the loop will iterate into loopIterations @@ -1998,7 +1999,8 @@ bool Compiler::optTryInvertWhileLoop(FlowGraphNaturalLoop* loop) // if (!fgProfileWeightsConsistent(weightPreheader + weightStayInLoopSucc, weightCond)) { - JITDUMP("Profile weights locally inconsistent: preheader " FMT_WT ", stayInLoopSucc " FMT_WT ", cond " FMT_WT "\n", + JITDUMP("Profile weights locally inconsistent: preheader " FMT_WT ", stayInLoopSucc " FMT_WT + ", cond " FMT_WT "\n", weightPreheader, weightStayInLoopSucc, weightCond); } else @@ -2173,7 +2175,7 @@ bool Compiler::optTryInvertWhileLoop(FlowGraphNaturalLoop* loop) BasicBlock* block = duplicatedBlocks.Bottom(i); for (Statement* stmt : block->Statements()) { - GenTree* clonedTree = gtCloneExpr(stmt->GetRootNode()); + GenTree* clonedTree = gtCloneExpr(stmt->GetRootNode()); Statement* clonedStmt = fgNewStmtAtEnd(newCond, clonedTree, stmt->GetDebugInfo()); if (stmt == condBlock->lastStmt()) @@ -2233,8 +2235,7 @@ bool Compiler::optTryInvertWhileLoop(FlowGraphNaturalLoop* loop) #ifdef DEBUG if (verbose) { - printf("\nDuplicated loop exit block at " FMT_BB " for loop " FMT_LP "\n", newCond->bbNum, - loop->GetIndex()); + printf("\nDuplicated loop exit block at " FMT_BB " for loop " FMT_LP "\n", newCond->bbNum, loop->GetIndex()); printf("Estimated code size expansion is %d\n", estDupCostSz); fgDumpBlock(newCond); @@ -2242,7 +2243,6 @@ bool Compiler::optTryInvertWhileLoop(FlowGraphNaturalLoop* loop) } #endif // DEBUG - return true; } From 757d14c162a5ef1349183c37b935574051ee739b Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Wed, 30 Oct 2024 16:25:46 +0100 Subject: [PATCH 7/7] Add a metric for loops inverted --- src/coreclr/jit/jitmetadatalist.h | 1 + src/coreclr/jit/optimizer.cpp | 2 ++ 2 files changed, 3 insertions(+) diff --git a/src/coreclr/jit/jitmetadatalist.h b/src/coreclr/jit/jitmetadatalist.h index 4642ca14d7b7c..452793cd81af0 100644 --- a/src/coreclr/jit/jitmetadatalist.h +++ b/src/coreclr/jit/jitmetadatalist.h @@ -35,6 +35,7 @@ JITMETADATAMETRIC(GCInfoBytes, int, JIT_M JITMETADATAMETRIC(EHClauseCount, int, 0) JITMETADATAMETRIC(PhysicallyPromotedFields, int, 0) JITMETADATAMETRIC(LoopsFoundDuringOpts, int, 0) +JITMETADATAMETRIC(LoopsInverted, int, 0) JITMETADATAMETRIC(LoopsCloned, int, 0) JITMETADATAMETRIC(LoopsUnrolled, int, 0) JITMETADATAMETRIC(LoopAlignmentCandidates, int, 0) diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index 2d55c6c05abdb..3122f4e296cb6 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ 
-2243,6 +2243,8 @@ bool Compiler::optTryInvertWhileLoop(FlowGraphNaturalLoop* loop) } #endif // DEBUG + Metrics.LoopsInverted++; + return true; }
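
For context on the transformation this patch series reworks, the sketch below is illustrative C++ only and is not part of the diffs (sum_before/sum_after are made-up names). It shows what loop inversion, also called loop rotation or bottom testing, does at the source level: the loop test is duplicated in front of the loop as a zero-trip test, and the original test ends up at the bottom, giving the loop a do/while shape.

    // Illustrative only -- not from the patch. Shows the effect of the
    // transformation that optTryInvertWhileLoop performs on the flow graph.

    // Before: top-tested loop. The header holds the test, so every
    // iteration branches back to the top to re-evaluate the condition.
    int sum_before(const int* a, int n)
    {
        int i = 0, sum = 0;
        while (i < n) // test at the top; header is also the exiting block
        {
            sum += a[i];
            i++;
        }
        return sum;
    }

    // After: the condition is duplicated in front of the loop as a
    // zero-trip test and the loop becomes do/while shaped. In the patch
    // this corresponds to the block layout it builds:
    //   preheader (always) -> newCond (cond, duplicated + reversed test)
    //     -> newPreheader (always) -> header,
    // with newCond branching around the loop to the split exit block
    // when the test fails.
    int sum_after(const int* a, int n)
    {
        int i = 0, sum = 0;
        if (i < n) // duplicated zero-trip test guards entry
        {
            do
            {
                sum += a[i];
                i++;
            } while (i < n); // original test now sits at the bottom (latch)
        }
        return sum;
    }

As the rewritten header comment in patch 1 notes, this do/while shape is what loop-invariant hoisting and loop cloning depend on, which is why the phase now runs after PHASE_SET_BLOCK_WEIGHTS and visits each natural loop of the FlowGraphNaturalLoop representation in post order.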