From 9885fbeeb64340049295c5d4509f6b58992200cf Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Thu, 30 Jun 2022 13:57:57 -0700 Subject: [PATCH] Improve LSRA and other dump output (#71499) E.g., Update LSRA "Allocating Registers" table description. Dump nodes added during resolution, e.g.: ``` BB29 bottom (BB08->BB08): move V25 from STK to rdi (Critical) N001 ( 1, 1) [001174] ----------z t1174 = LCL_VAR int V25 cse4 rdi REG rdi ``` Dump more data in the LSRA block sequence data: ``` -BB03( 16 ) -BB04( 4 ) +BB03 ( 16 ) critical-in critical-out +BB04 ( 4 ) critical-out ``` When dumping various flow bitvectors, annotate the bitvectors better: ``` -BB25 in gen out -0000000000000000 -0000000000000003 CSE #01.c -0000000000000003 CSE #01.c +BB25 + in: 0000000000000000 +gen: 0000000000000003 CSE #01.c +out: 0000000000000003 CSE #01.c ``` Dump hoisting bitvectors using the sorting number: ``` - USEDEF (5)={V04 V00 V01 V02 V03} + USEDEF (5)={V00 V01 V02 V03 V04} ``` Also, fix various typos and formatting. --- src/coreclr/jit/compiler.cpp | 1 + src/coreclr/jit/flowgraph.cpp | 2 +- src/coreclr/jit/gentree.h | 6 +- src/coreclr/jit/importer.cpp | 25 +++--- src/coreclr/jit/lower.cpp | 1 + src/coreclr/jit/lsra.cpp | 150 +++++++++++++++++++++------------ src/coreclr/jit/lsra.h | 12 ++- src/coreclr/jit/lsrabuild.cpp | 23 ++--- src/coreclr/jit/optcse.cpp | 58 ++++++------- src/coreclr/jit/optimizer.cpp | 16 ++-- src/coreclr/jit/rangecheck.cpp | 5 +- src/coreclr/jit/valuenum.cpp | 2 +- 12 files changed, 173 insertions(+), 128 deletions(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 9aeb0ae0ac3b6..9ec90da68dd0b 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -4706,6 +4706,7 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl // Figure out what locals are address-taken. // DoPhase(this, PHASE_STR_ADRLCL, &Compiler::fgMarkAddressExposedLocals); + // Run a simple forward substitution pass. // DoPhase(this, PHASE_FWD_SUB, &Compiler::fgForwardSub); diff --git a/src/coreclr/jit/flowgraph.cpp b/src/coreclr/jit/flowgraph.cpp index 925f862a4bda1..f1e74ff10094c 100644 --- a/src/coreclr/jit/flowgraph.cpp +++ b/src/coreclr/jit/flowgraph.cpp @@ -1894,7 +1894,7 @@ GenTree* Compiler::fgCreateMonitorTree(unsigned lvaMonAcquired, unsigned lvaThis { // have to insert this immediately before the GT_RETURN so we transform: // ret(...) -> - // ret(comma(comma(tmp=...,call mon_exit), tmp) + // ret(comma(comma(tmp=...,call mon_exit), tmp)) // // // Before morph stage, it is possible to have a case of GT_RETURN(TYP_LONG, op1) where op1's type is diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 5d50a6aaf79c0..f3dee9573eb53 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -1233,7 +1233,7 @@ struct GenTree return (gtOper == GT_LCL_FLD || gtOper == GT_LCL_FLD_ADDR || gtOper == GT_STORE_LCL_FLD); } - inline bool OperIsLocalField() const + bool OperIsLocalField() const { return OperIsLocalField(gtOper); } @@ -8673,13 +8673,13 @@ inline GenTree* GenTree::gtEffectiveVal(bool commaOnly /* = false */) } //------------------------------------------------------------------------- -// gtCommaAssignVal - find value being assigned to a comma wrapped assigment +// gtCommaAssignVal - find value being assigned to a comma wrapped assignment // // Returns: // tree representing value being assigned if this tree represents a // comma-wrapped local definition and use. // -// original tree, of not. +// original tree, if not. // inline GenTree* GenTree::gtCommaAssignVal() { diff --git a/src/coreclr/jit/importer.cpp b/src/coreclr/jit/importer.cpp index ddb9184a7dfee..3caad0781ac9d 100644 --- a/src/coreclr/jit/importer.cpp +++ b/src/coreclr/jit/importer.cpp @@ -3188,7 +3188,7 @@ GenTree* Compiler::impInitializeArrayIntrinsic(CORINFO_SIG_INFO* sig) GenTree* arrayLocalNode = impStackTop(1).val; // - // Verify that the field token is known and valid. Note that It's also + // Verify that the field token is known and valid. Note that it's also // possible for the token to come from reflection, in which case we cannot do // the optimization and must therefore revert to calling the helper. You can // see an example of this in bvt\DynIL\initarray2.exe (in Main). @@ -8108,7 +8108,7 @@ void Compiler::impImportAndPushBox(CORINFO_RESOLVED_TOKEN* pResolvedToken) } //------------------------------------------------------------------------ -// impImportNewObjArray: Build and import `new` of multi-dimmensional array +// impImportNewObjArray: Build and import `new` of multi-dimensional array // // Arguments: // pResolvedToken - The CORINFO_RESOLVED_TOKEN that has been initialized @@ -8123,7 +8123,7 @@ void Compiler::impImportAndPushBox(CORINFO_RESOLVED_TOKEN* pResolvedToken) // Notes: // Multi-dimensional array constructors are imported as calls to a JIT // helper, not as regular calls. - +// void Compiler::impImportNewObjArray(CORINFO_RESOLVED_TOKEN* pResolvedToken, CORINFO_CALL_INFO* pCallInfo) { GenTree* classHandle = impParentClassTokenToHandle(pResolvedToken); @@ -8160,7 +8160,7 @@ void Compiler::impImportNewObjArray(CORINFO_RESOLVED_TOKEN* pResolvedToken, CORI // The arguments of the CORINFO_HELP_NEW_MDARR helper are: // - Array class handle // - Number of dimension arguments - // - Pointer to block of int32 dimensions - address of lvaNewObjArrayArgs temp. + // - Pointer to block of int32 dimensions: address of lvaNewObjArrayArgs temp. // node = gtNewLclvNode(lvaNewObjArrayArgs, TYP_BLK); @@ -13695,7 +13695,7 @@ void Compiler::impImportBlockCode(BasicBlock* block) { lclTyp = JITtype2varType(cit); } - goto ARR_LD_POST_VERIFY; + goto ARR_LD; } // Similarly, if its a readonly access, we can do a simple address-of @@ -13703,7 +13703,7 @@ void Compiler::impImportBlockCode(BasicBlock* block) if (prefixFlags & PREFIX_READONLY) { lclTyp = TYP_REF; - goto ARR_LD_POST_VERIFY; + goto ARR_LD; } // Otherwise we need the full helper function with run-time type check @@ -13747,7 +13747,7 @@ void Compiler::impImportBlockCode(BasicBlock* block) tiRetVal = verMakeTypeInfo(ldelemClsHnd); // precise type always needed for struct tiRetVal.NormaliseForStack(); } - goto ARR_LD_POST_VERIFY; + goto ARR_LD; case CEE_LDELEM_I1: lclTyp = TYP_BYTE; @@ -13784,7 +13784,6 @@ void Compiler::impImportBlockCode(BasicBlock* block) goto ARR_LD; ARR_LD: - ARR_LD_POST_VERIFY: op2 = impPopStack().val; // index op1 = impPopStack().val; // array @@ -15345,14 +15344,14 @@ void Compiler::impImportBlockCode(BasicBlock* block) // Insert the security callout before any actual code is generated impHandleAccessAllowed(callInfo.accessAllowed, &callInfo.callsiteCalloutHelper); - // There are three different cases for new - // Object size is variable (depends on arguments) + // There are three different cases for new. + // Object size is variable (depends on arguments). // 1) Object is an array (arrays treated specially by the EE) // 2) Object is some other variable sized object (e.g. String) // 3) Class Size can be determined beforehand (normal case) - // In the first case, we need to call a NEWOBJ helper (multinewarray) - // in the second case we call the constructor with a '0' this pointer - // In the third case we alloc the memory, then call the constuctor + // In the first case, we need to call a NEWOBJ helper (multinewarray). + // In the second case we call the constructor with a '0' this pointer. + // In the third case we alloc the memory, then call the constuctor. clsFlags = callInfo.classFlags; if (clsFlags & CORINFO_FLG_ARRAY) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index b8baead05dafb..c41145acafeec 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -259,6 +259,7 @@ GenTree* Lowering::LowerNode(GenTree* node) ContainCheckBoundsChk(node->AsBoundsChk()); break; #endif // TARGET_XARCH + case GT_ARR_ELEM: return LowerArrElem(node); diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index 38921927f2091..a80a7dda2fdf9 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -949,22 +949,33 @@ void LinearScan::setBlockSequence() assert(isBlockVisited(block)); } - JITDUMP("Final LSRA Block Sequence: \n"); - int i = 1; - for (BasicBlock *block = startBlockSequence(); block != nullptr; ++i, block = moveToNextBlock()) + JITDUMP("Final LSRA Block Sequence:\n"); + for (BasicBlock* block = startBlockSequence(); block != nullptr; block = moveToNextBlock()) { JITDUMP(FMT_BB, block->bbNum); - JITDUMP("(%6s) ", refCntWtd2str(block->getBBWeight(compiler))); - if (blockInfo[block->bbNum].hasEHBoundaryIn) + const LsraBlockInfo& bi = blockInfo[block->bbNum]; + + // Note that predBBNum isn't set yet. + JITDUMP(" (%6s)", refCntWtd2str(bi.weight)); + + if (bi.hasCriticalInEdge) + { + JITDUMP(" critical-in"); + } + if (bi.hasCriticalOutEdge) + { + JITDUMP(" critical-out"); + } + if (bi.hasEHBoundaryIn) { JITDUMP(" EH-in"); } - if (blockInfo[block->bbNum].hasEHBoundaryOut) + if (bi.hasEHBoundaryOut) { JITDUMP(" EH-out"); } - if (blockInfo[block->bbNum].hasEHPred) + if (bi.hasEHPred) { JITDUMP(" has EH pred"); } @@ -2268,7 +2279,7 @@ void LinearScan::checkLastUses(BasicBlock* block) // findPredBlockForLiveIn: Determine which block should be used for the register locations of the live-in variables. // // Arguments: -// block - The block for which we're selecting a predecesor. +// block - The block for which we're selecting a predecessor. // prevBlock - The previous block in allocation order. // pPredBlockIsAllocated - A debug-only argument that indicates whether any of the predecessors have been seen // in allocation order. @@ -7035,13 +7046,13 @@ void LinearScan::resolveRegisters() JITDUMP(" EH flow out"); } - printf("\nuse def in out\n"); + printf("\nuse: "); dumpConvertedVarSet(compiler, block->bbVarUse); - printf("\n"); + printf("\ndef: "); dumpConvertedVarSet(compiler, block->bbVarDef); - printf("\n"); + printf("\n in: "); dumpConvertedVarSet(compiler, block->bbLiveIn); - printf("\n"); + printf("\nout: "); dumpConvertedVarSet(compiler, block->bbLiveOut); printf("\n"); @@ -7280,7 +7291,9 @@ void LinearScan::insertMove( } dst->SetUnusedValue(); - LIR::Range treeRange = LIR::SeqTree(compiler, dst); + LIR::Range treeRange = LIR::SeqTree(compiler, dst); + DISPRANGE(treeRange); + LIR::Range& blockRange = LIR::AsRange(block); if (insertionPoint != nullptr) @@ -7382,7 +7395,7 @@ void LinearScan::insertSwap( // // Arguments: // fromBlock - The "from" block on the edge being resolved. -// toBlock - The "to"block on the edge +// toBlock - The "to" block on the edge // type - the type of register required // // Return Value: @@ -7473,6 +7486,8 @@ regNumber LinearScan::getTempRegForResolution(BasicBlock* fromBlock, BasicBlock* // toReg - the register to which the var is moving // fromReg - the register from which the var is moving // resolveType - the type of resolution to be performed +// fromBlock - "from" block of resolution edge +// toBlock - "to" block of resolution edge // // Return Value: // None. @@ -7486,7 +7501,8 @@ void LinearScan::addResolutionForDouble(BasicBlock* block, regNumberSmall* location, regNumber toReg, regNumber fromReg, - ResolveType resolveType) + ResolveType resolveType DEBUG_ARG(BasicBlock* fromBlock) + DEBUG_ARG(BasicBlock* toBlock)) { regNumber secondHalfTargetReg = REG_NEXT(fromReg); Interval* intervalToBeMoved1 = sourceIntervals[fromReg]; @@ -7507,8 +7523,8 @@ void LinearScan::addResolutionForDouble(BasicBlock* block, // TYP_FLOAT interval occupies 1st half of double register, i.e. 1st float register assert(genIsValidFloatReg(toReg)); } - addResolution(block, insertionPoint, intervalToBeMoved1, toReg, fromReg); - JITDUMP(" (%s)\n", resolveTypeName[resolveType]); + addResolution(block, insertionPoint, intervalToBeMoved1, toReg, + fromReg DEBUG_ARG(fromBlock) DEBUG_ARG(toBlock) DEBUG_ARG(resolveTypeName[resolveType])); location[fromReg] = (regNumberSmall)toReg; } @@ -7518,8 +7534,9 @@ void LinearScan::addResolutionForDouble(BasicBlock* block, assert(intervalToBeMoved2->registerType == TYP_FLOAT); regNumber secondHalfTempReg = REG_NEXT(toReg); - addResolution(block, insertionPoint, intervalToBeMoved2, secondHalfTempReg, secondHalfTargetReg); - JITDUMP(" (%s)\n", resolveTypeName[resolveType]); + addResolution(block, insertionPoint, intervalToBeMoved2, secondHalfTempReg, + secondHalfTargetReg DEBUG_ARG(fromBlock) DEBUG_ARG(toBlock) + DEBUG_ARG(resolveTypeName[resolveType])); location[secondHalfTargetReg] = (regNumberSmall)secondHalfTempReg; } @@ -7536,6 +7553,9 @@ void LinearScan::addResolutionForDouble(BasicBlock* block, // interval - the interval of the var to be moved // toReg - the register to which the var is moving // fromReg - the register from which the var is moving +// fromBlock - "from" block of resolution edge +// toBlock - "to" block of resolution edge +// reason - textual description of the resolution type // // Return Value: // None. @@ -7549,9 +7569,13 @@ void LinearScan::addResolutionForDouble(BasicBlock* block, // REG_STK, and we insert at the bottom (leave insertionPoint as nullptr). // The next time, we want to move from the stack to the destination (toReg), // in which case fromReg will be REG_STK, and we insert at the top. - -void LinearScan::addResolution( - BasicBlock* block, GenTree* insertionPoint, Interval* interval, regNumber toReg, regNumber fromReg) +// +void LinearScan::addResolution(BasicBlock* block, + GenTree* insertionPoint, + Interval* interval, + regNumber toReg, + regNumber fromReg DEBUG_ARG(BasicBlock* fromBlock) DEBUG_ARG(BasicBlock* toBlock) + DEBUG_ARG(const char* reason)) { #ifdef DEBUG const char* insertionPointString; @@ -7572,13 +7596,21 @@ void LinearScan::addResolution( insertionPointString = "top"; } - // We should never add resolution move inside BBCallAlwaysPairTail. - noway_assert(!block->isBBCallAlwaysPairTail()); - + assert(fromBlock != nullptr); + JITDUMP(" " FMT_BB " %s", block->bbNum, insertionPointString); + if (toBlock == nullptr) + { + // SharedCritical resolution has no `toBlock`. + } + else + { + JITDUMP(" (" FMT_BB "->" FMT_BB ")", fromBlock->bbNum, toBlock->bbNum); + } + JITDUMP(": move V%02u from %s to %s (%s)\n", interval->varNum, getRegName(fromReg), getRegName(toReg), reason); #endif // DEBUG - JITDUMP(" " FMT_BB " %s: move V%02u from ", block->bbNum, insertionPointString, interval->varNum); - JITDUMP("%s to %s", getRegName(fromReg), getRegName(toReg)); + // We should never add resolution move inside BBCallAlwaysPairTail. + noway_assert(!block->isBBCallAlwaysPairTail()); insertMove(block, insertionPoint, interval->varNum, fromReg, toReg); if (fromReg == REG_STK || toReg == REG_STK) @@ -7888,8 +7920,8 @@ void LinearScan::handleOutgoingCriticalEdges(BasicBlock* block) { Interval* interval = getIntervalForLocalVar(edgeVarIndex); assert(interval->isWriteThru); - addResolution(succBlock, insertionPoint, interval, toReg, REG_STK); - JITDUMP(" (EHvar)\n"); + addResolution(succBlock, insertionPoint, interval, toReg, + REG_STK DEBUG_ARG(block) DEBUG_ARG(succBlock) DEBUG_ARG("EHvar")); } } } @@ -7916,7 +7948,7 @@ void LinearScan::handleOutgoingCriticalEdges(BasicBlock* block) // - If this block has a single predecessor that is not the immediately // preceding block, perform any needed 'split' resolution at the beginning of this block // - Otherwise if this block has critical incoming edges, handle them. -// - If this block has a single successor that has multiple predecesors, perform any needed +// - If this block has a single successor that has multiple predecessors, perform any needed // 'join' resolution at the end of this block. // Note that a block may have both 'split' or 'critical' incoming edge(s) and 'join' outgoing // edges. @@ -8266,8 +8298,8 @@ void LinearScan::resolveEdge(BasicBlock* fromBlock, regNumber fromReg = getVarReg(fromVarToRegMap, extraVarIndex); if (fromReg != REG_STK) { - addResolution(block, insertionPoint, interval, REG_STK, fromReg); - JITDUMP(" (EH DUMMY)\n"); + addResolution(block, insertionPoint, interval, REG_STK, + fromReg DEBUG_ARG(fromBlock) DEBUG_ARG(toBlock) DEBUG_ARG("EH DUMMY")); setVarReg(fromVarToRegMap, extraVarIndex, REG_STK); } } @@ -8325,9 +8357,10 @@ void LinearScan::resolveEdge(BasicBlock* fromBlock, else if (toReg == REG_STK) { // Do the reg to stack moves now - addResolution(block, insertionPoint, interval, REG_STK, fromReg); - JITDUMP(" (%s)\n", - (interval->isWriteThru && (toReg == REG_STK)) ? "EH DUMMY" : resolveTypeName[resolveType]); + addResolution(block, insertionPoint, interval, REG_STK, + fromReg DEBUG_ARG(fromBlock) DEBUG_ARG(toBlock) + DEBUG_ARG((interval->isWriteThru && (toReg == REG_STK)) ? "EH DUMMY" + : resolveTypeName[resolveType])); } else { @@ -8388,8 +8421,8 @@ void LinearScan::resolveEdge(BasicBlock* fromBlock, assert(fromReg < REG_STK); Interval* interval = sourceIntervals[sourceReg]; assert(interval != nullptr); - addResolution(block, insertionPoint, interval, targetReg, fromReg); - JITDUMP(" (%s)\n", resolveTypeName[resolveType]); + addResolution(block, insertionPoint, interval, targetReg, + fromReg DEBUG_ARG(fromBlock) DEBUG_ARG(toBlock) DEBUG_ARG(resolveTypeName[resolveType])); sourceIntervals[sourceReg] = nullptr; location[sourceReg] = REG_NA; regMaskTP fromRegMask = genRegMask(fromReg); @@ -8542,8 +8575,9 @@ void LinearScan::resolveEdge(BasicBlock* fromBlock, // First, spill "otherInterval" from targetReg to the stack. Interval* otherInterval = sourceIntervals[source[otherTargetReg]]; setIntervalAsSpilled(otherInterval); - addResolution(block, insertionPoint, otherInterval, REG_STK, targetReg); - JITDUMP(" (%s)\n", resolveTypeName[resolveType]); + addResolution(block, insertionPoint, otherInterval, REG_STK, + targetReg DEBUG_ARG(fromBlock) DEBUG_ARG(toBlock) + DEBUG_ARG(resolveTypeName[resolveType])); location[source[otherTargetReg]] = REG_STK; regMaskTP otherTargetRegMask = genRegMask(otherTargetReg); @@ -8552,8 +8586,9 @@ void LinearScan::resolveEdge(BasicBlock* fromBlock, targetRegsToDo &= ~otherTargetRegMask; // Now, move the interval that is going to targetReg. - addResolution(block, insertionPoint, sourceIntervals[sourceReg], targetReg, fromReg); - JITDUMP(" (%s)\n", resolveTypeName[resolveType]); + addResolution(block, insertionPoint, sourceIntervals[sourceReg], targetReg, + fromReg DEBUG_ARG(fromBlock) DEBUG_ARG(toBlock) + DEBUG_ARG(resolveTypeName[resolveType])); location[sourceReg] = REG_NA; // Add its "fromReg" to "targetRegsReady", only if: @@ -8592,15 +8627,16 @@ void LinearScan::resolveEdge(BasicBlock* fromBlock, assert(genIsValidDoubleReg(tempReg)); addResolutionForDouble(block, insertionPoint, sourceIntervals, location, tempReg, targetReg, - resolveType); + resolveType DEBUG_ARG(fromBlock) DEBUG_ARG(toBlock)); } else #endif // TARGET_ARM { assert(sourceIntervals[targetReg] != nullptr); - addResolution(block, insertionPoint, sourceIntervals[targetReg], tempReg, targetReg); - JITDUMP(" (%s)\n", resolveTypeName[resolveType]); + addResolution(block, insertionPoint, sourceIntervals[targetReg], tempReg, + targetReg DEBUG_ARG(fromBlock) DEBUG_ARG(toBlock) + DEBUG_ARG(resolveTypeName[resolveType])); location[targetReg] = (regNumberSmall)tempReg; } targetRegsReady |= targetRegMask; @@ -8620,8 +8656,8 @@ void LinearScan::resolveEdge(BasicBlock* fromBlock, Interval* interval = stackToRegIntervals[targetReg]; assert(interval != nullptr); - addResolution(block, insertionPoint, interval, targetReg, REG_STK); - JITDUMP(" (%s)\n", resolveTypeName[resolveType]); + addResolution(block, insertionPoint, interval, targetReg, + REG_STK DEBUG_ARG(fromBlock) DEBUG_ARG(toBlock) DEBUG_ARG(resolveTypeName[resolveType])); } } @@ -9004,8 +9040,9 @@ void RefPosition::dump(LinearScan* linearScan) { printf("[%d]", this->multiRegIdx); } + printf(" "); } - printf(" " FMT_BB " ", this->bbNum); + printf(FMT_BB " ", this->bbNum); printf("regmask="); dumpRegMask(registerAssignment); @@ -9846,7 +9883,7 @@ void LinearScan::dumpLsraAllocationEvent( } else { - printf("%-5s(A) %-4s ", getScoreName(registerScore), getRegName(reg)); + printf("%-5s(R) %-4s ", getScoreName(registerScore), getRegName(reg)); } break; @@ -9922,14 +9959,17 @@ void LinearScan::dumpLsraAllocationEvent( void LinearScan::dumpRegRecordHeader() { printf("The following table has one or more rows for each RefPosition that is handled during allocation.\n" - "The first column provides the basic information about the RefPosition, with its type (e.g. Def,\n" - "Use, Fixd) followed by a '*' if it is a last use, and a 'D' if it is delayRegFree, and then the\n" - "action taken during allocation (e.g. Alloc a new register, or Keep an existing one).\n" + "The columns are: (1) Loc: LSRA location, (2) RP#: RefPosition number, (3) Name, (4) Type (e.g. Def, Use,\n" + "Fixd, Parm, DDef (Dummy Def), ExpU (Exposed Use), Kill) followed by a '*' if it is a last use, and a 'D'\n" + "if it is delayRegFree, (5) Action taken during allocation. Some actions include (a) Alloc a new register,\n" + "(b) Keep an existing register, (c) Spill a register, (d) ReLod (Reload) a register. If an ALL-CAPS name\n" + "such as COVRS is displayed, it is a score name from lsra_score.h, with a trailing '(A)' indicating alloc,\n" + "'(C)' indicating copy, and '(R)' indicating re-use. See dumpLsraAllocationEvent() for details.\n" "The subsequent columns show the Interval occupying each register, if any, followed by 'a' if it is\n" - "active, a 'p' if it is a large vector that has been partially spilled, and 'i'if it is inactive.\n" - "Columns are only printed up to the last modifed register, which may increase during allocation,\n" - "in which case additional columns will appear. \n" - "Registers which are not marked modified have ---- in their column.\n\n"); + "active, 'p' if it is a large vector that has been partially spilled, and 'i' if it is inactive.\n" + "Columns are only printed up to the last modified register, which may increase during allocation,\n" + "in which case additional columns will appear. Registers which are not marked modified have ---- in\n" + "their column.\n\n"); // First, determine the width of each register column (which holds a reg name in the // header, and an interval name in each subsequent row). diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 32d06c9950226..e3833c1d7d668 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -693,10 +693,16 @@ class LinearScan : public LinearScanInterface regNumberSmall* location, regNumber toReg, regNumber fromReg, - ResolveType resolveType); + ResolveType resolveType DEBUG_ARG(BasicBlock* fromBlock) + DEBUG_ARG(BasicBlock* toBlock)); #endif - void addResolution( - BasicBlock* block, GenTree* insertionPoint, Interval* interval, regNumber outReg, regNumber inReg); + + void addResolution(BasicBlock* block, + GenTree* insertionPoint, + Interval* interval, + regNumber outReg, + regNumber inReg DEBUG_ARG(BasicBlock* fromBlock) DEBUG_ARG(BasicBlock* toBlock) + DEBUG_ARG(const char* reason)); void handleOutgoingCriticalEdges(BasicBlock* block); diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 9c29ca4c1dc23..4333630327030 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -2110,13 +2110,13 @@ void LinearScan::buildIntervals() printf("-----------------\n"); for (BasicBlock* const block : compiler->Blocks()) { - printf(FMT_BB " use def in out\n", block->bbNum); + printf(FMT_BB "\nuse: ", block->bbNum); dumpConvertedVarSet(compiler, block->bbVarUse); - printf("\n"); + printf("\ndef: "); dumpConvertedVarSet(compiler, block->bbVarDef); - printf("\n"); + printf("\n in: "); dumpConvertedVarSet(compiler, block->bbLiveIn); - printf("\n"); + printf("\nout: "); dumpConvertedVarSet(compiler, block->bbLiveOut); printf("\n"); } @@ -2303,12 +2303,12 @@ void LinearScan::buildIntervals() // For blocks that don't have EHBoundaryIn, we need DummyDefs for cases where "predBlock" isn't // really a predecessor. - // Note that it's possible to have uses of unitialized variables, in which case even the first + // Note that it's possible to have uses of uninitialized variables, in which case even the first // block may require DummyDefs, which we are not currently adding - this means that these variables // will always be considered to be in memory on entry (and reloaded when the use is encountered). // TODO-CQ: Consider how best to tune this. Currently, if we create DummyDefs for uninitialized // variables (which may actually be initialized along the dynamically executed paths, but not - // on all static paths), we wind up with excessive liveranges for some of these variables. + // on all static paths), we wind up with excessive live ranges for some of these variables. if (!blockInfo[block->bbNum].hasEHBoundaryIn) { @@ -2491,7 +2491,7 @@ void LinearScan::buildIntervals() if (!VarSetOps::IsEmpty(compiler, expUseSet)) { - JITDUMP("Exposed uses:"); + JITDUMP("Exposed uses:\n"); VarSetOps::Iter iter(compiler, expUseSet); unsigned varIndex = 0; while (iter.NextElem(&varIndex)) @@ -2503,9 +2503,7 @@ void LinearScan::buildIntervals() RefPosition* pos = newRefPosition(interval, currentLoc, RefTypeExpUse, nullptr, allRegs(interval->registerType)); pos->setRegOptional(true); - JITDUMP(" V%02u", varNum); } - JITDUMP("\n"); } // Clear the "last use" flag on any vars that are live-out from this block. @@ -2518,8 +2516,7 @@ void LinearScan::buildIntervals() LclVarDsc* const varDsc = compiler->lvaGetDesc(varNum); assert(isCandidateVar(varDsc)); RefPosition* const lastRP = getIntervalForLocalVar(varIndex)->lastRefPosition; - // We should be able to assert that lastRP is non-null if it is live-out, but sometimes liveness - // lies. + // We should be able to assert that lastRP is non-null if it is live-out, but sometimes liveness lies. if ((lastRP != nullptr) && (lastRP->bbNum == block->bbNum)) { lastRP->lastUse = false; @@ -2674,6 +2671,10 @@ void LinearScan::validateIntervals() { if (enregisterLocalVars) { + JITDUMP("\n------------\n"); + JITDUMP("REFPOSITIONS DURING VALIDATE INTERVALS (RefPositions per interval)\n"); + JITDUMP("------------\n\n"); + for (unsigned i = 0; i < compiler->lvaTrackedCount; i++) { if (!compiler->lvaGetDescByTrackedIndex(i)->lvLRACandidate) diff --git a/src/coreclr/jit/optcse.cpp b/src/coreclr/jit/optcse.cpp index 7b6347e9628ee..05b14ac517610 100644 --- a/src/coreclr/jit/optcse.cpp +++ b/src/coreclr/jit/optcse.cpp @@ -706,7 +706,7 @@ unsigned Compiler::optValnumCSE_Index(GenTree* tree, Statement* stmt) #ifdef DEBUG if (verbose) { - printf("\nCSE candidate #%02u, key=", CSEindex); + printf("\nCandidate " FMT_CSE ", key=", CSEindex); if (!Compiler::Is_Shared_Const_CSE(key)) { vnPrint((unsigned)key, 0); @@ -1320,11 +1320,11 @@ void Compiler::optValnumCSE_DataFlow() for (BasicBlock* const block : Blocks()) { - printf(FMT_BB " in gen out\n", block->bbNum); + printf(FMT_BB "\n in: ", block->bbNum); optPrintCSEDataFlowSet(block->bbCseIn); - printf("\n"); + printf("\ngen: "); optPrintCSEDataFlowSet(block->bbCseGen); - printf("\n"); + printf("\nout: "); optPrintCSEDataFlowSet(block->bbCseOut); printf("\n"); } @@ -1726,11 +1726,9 @@ class CSE_Heuristic return codeOptKind; } - // Perform the Initialization step for our CSE Heuristics - // determine the various cut off values to use for - // the aggressive, moderate and conservative CSE promotions - // count the number of enregisterable variables - // determine if the method has a large or huge stack frame. + // Perform the Initialization step for our CSE Heuristics. Determine the various cut off values to use for + // the aggressive, moderate and conservative CSE promotions. Count the number of enregisterable variables. + // Determine if the method has a large or huge stack frame. // void Initialize() { @@ -1820,16 +1818,17 @@ class CSE_Heuristic } } } + #ifdef TARGET_XARCH if (frameSize > 0x080) { // We likely have a large stack frame. // - // On XARCH stack frame displacements can either use a 1-byte or a 4-byte displacement - // with a large franme we will need to use some 4-byte displacements. + // On XARCH stack frame displacements can either use a 1-byte or a 4-byte displacement. + // With a large frame we will need to use some 4-byte displacements. // largeFrame = true; - break; // early out, we don't need to keep increasing frameSize + break; // early out, we don't need to keep increasing frameSize } #elif defined(TARGET_ARM) if (frameSize > 0x0400) @@ -1837,14 +1836,14 @@ class CSE_Heuristic // We likely have a large stack frame. // // Thus we might need to use large displacements when loading or storing - // to CSE LclVars that are not enregistered + // to CSE LclVars that are not enregistered. // On ARM32 this means using rsGetRsvdReg() to hold the large displacement largeFrame = true; } if (frameSize > 0x10000) { hugeFrame = true; - break; // early out, we don't need to keep increasing frameSize + break; // early out, we don't need to keep increasing frameSize } #elif defined(TARGET_ARM64) if (frameSize > 0x1000) @@ -1852,11 +1851,11 @@ class CSE_Heuristic // We likely have a large stack frame. // // Thus we might need to use large displacements when loading or storing - // to CSE LclVars that are not enregistered + // to CSE LclVars that are not enregistered. // On ARM64 this means using rsGetRsvdReg() or R21 to hold the large displacement // largeFrame = true; - break; // early out, we don't need to keep increasing frameSize + break; // early out, we don't need to keep increasing frameSize } #elif defined(TARGET_LOONGARCH64) if (frameSize > 0x7ff) @@ -1864,22 +1863,19 @@ class CSE_Heuristic // We likely have a large stack frame. // // Thus we might need to use large displacements when loading or storing - // to CSE LclVars that are not enregistered - // On LoongArch64 this means using rsGetRsvdReg() to hold the large displacement + // to CSE LclVars that are not enregistered. + // On LoongArch64 this means using rsGetRsvdReg() to hold the large displacement. // largeFrame = true; - break; // early out, we don't need to keep increasing frameSize + break; // early out, we don't need to keep increasing frameSize } #endif } - // Iterate over the sorted list of tracked local variables - // these are the register candidates for LSRA - // We normally vist the LclVar in order of their weighted ref counts - // and our hueristic assumes that the highest weighted ref count - // LclVars will be enregistered and that the lowest weighted ref count - // are likely be allocated in the stack frame. - // The value of enregCount is incremented when we visit a LclVar + // Iterate over the sorted list of tracked local variables. These are the register candidates for LSRA. + // We normally visit the LclVars in order of their weighted ref counts and our heuristic assumes that the + // highest weighted ref count LclVars will be enregistered and that the lowest weighted ref count + // are likely be allocated in the stack frame. The value of enregCount is incremented when we visit a LclVar // that can be enregistered. // for (unsigned trackedIndex = 0; trackedIndex < m_pCompiler->lvaTrackedCount; trackedIndex++) @@ -1899,7 +1895,7 @@ class CSE_Heuristic continue; } - // The enregCount only tracks the uses of integer registers + // enregCount only tracks the uses of integer registers. // // We could track floating point register usage separately // but it isn't worth the additional complexity as floating point CSEs @@ -2311,8 +2307,8 @@ class CSE_Heuristic (def + use) * cost - If we introduce a CSE temp are each definition and - replace the use with a CSE temp then our cost is: + If we introduce a CSE temp at each definition and + replace each use with a CSE temp then our cost is: (def * (cost + cse-def-cost)) + (use * cse-use-cost) @@ -2336,7 +2332,7 @@ class CSE_Heuristic If we want to be aggressive we use 1 as the values for both cse-def-cost and cse-use-cost. - If we believe that the CSE very valuable in terms of weighted ref counts + If we believe that the CSE is very valuable in terms of weighted ref counts such that it would always be enregistered by the register allocator we choose the aggressive use def costs. @@ -2344,7 +2340,7 @@ class CSE_Heuristic such that it could be likely be enregistered by the register allocator we choose the moderate use def costs. - otherwise we choose the conservative use def costs. + Otherwise we choose the conservative use def costs. */ diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index bb7910ef4d38f..e984e6175d029 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -6649,7 +6649,7 @@ bool Compiler::optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt, Basi if (verbose) { printf("\n LONGVARS(%d)=", VarSetOps::Count(this, lvaLongVars)); - lvaDispVarSet(lvaLongVars); + dumpConvertedVarSet(this, lvaLongVars); } #endif pLoopDsc->lpLoopVarCount += VarSetOps::Count(this, loopLongVars); @@ -6661,13 +6661,13 @@ bool Compiler::optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt, Basi if (verbose) { printf("\n USEDEF (%d)=", VarSetOps::Count(this, pLoopDsc->lpVarUseDef)); - lvaDispVarSet(pLoopDsc->lpVarUseDef); + dumpConvertedVarSet(this, pLoopDsc->lpVarUseDef); printf("\n INOUT (%d)=", pLoopDsc->lpVarInOutCount); - lvaDispVarSet(pLoopDsc->lpVarInOut); + dumpConvertedVarSet(this, pLoopDsc->lpVarInOut); printf("\n LOOPVARS(%d)=", pLoopDsc->lpLoopVarCount); - lvaDispVarSet(loopVars); + dumpConvertedVarSet(this, loopVars); printf("\n"); } #endif @@ -6690,10 +6690,10 @@ bool Compiler::optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt, Basi if (verbose) { printf(" INOUT-FP(%d)=", pLoopDsc->lpVarInOutFPCount); - lvaDispVarSet(inOutFPVars); + dumpConvertedVarSet(this, inOutFPVars); printf("\n LOOPV-FP(%d)=", pLoopDsc->lpLoopVarFPCount); - lvaDispVarSet(loopFPVars); + dumpConvertedVarSet(this, loopFPVars); printf("\n"); } @@ -6715,7 +6715,7 @@ bool Compiler::optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt, Basi // or side-effect dependent things. // // We really should consider hoisting from conditionally executed blocks, if they are frequently executed - // and it is safe to evaluate the tree early + // and it is safe to evaluate the tree early. // ArrayStack defExec(getAllocatorLoopHoist()); @@ -7558,7 +7558,7 @@ void Compiler::optHoistLoopBlocks(unsigned loopNum, ArrayStack* blo // TODO-CQ: Ideally, we should be hoisting all the nodes having side-effects in execution // order as well as the ones that don't have side-effects at all. However, currently, we // just restrict hoisting a node(s) (that are children of `comma`) if one of the siblings - // (which is executed before the given node) has side-effects (exceptions). "Descendants + // (which is executed before the given node) has side-effects (exceptions). Descendants // of ancestors might have side-effects and we might hoist nodes past them. This needs // to be addressed properly. bool visitedCurr = false; diff --git a/src/coreclr/jit/rangecheck.cpp b/src/coreclr/jit/rangecheck.cpp index 6b1575ab5f928..b94f124d00328 100644 --- a/src/coreclr/jit/rangecheck.cpp +++ b/src/coreclr/jit/rangecheck.cpp @@ -1202,7 +1202,8 @@ bool RangeCheck::DoesBinOpOverflow(BasicBlock* block, GenTreeOp* binop) return true; } - JITDUMP("Checking bin op overflow %s %s\n", op1Range->ToString(m_pCompiler->getAllocatorDebugOnly()), + JITDUMP("Checking bin op overflow %s %s %s\n", GenTree::OpName(binop->OperGet()), + op1Range->ToString(m_pCompiler->getAllocatorDebugOnly()), op2Range->ToString(m_pCompiler->getAllocatorDebugOnly())); if (binop->OperIs(GT_ADD)) @@ -1462,7 +1463,7 @@ Range RangeCheck::GetRange(BasicBlock* block, GenTree* expr, bool monIncreasing if (m_pCompiler->verbose) { Indent(indent); - JITDUMP("[RangeCheck::GetRange] " FMT_BB, block->bbNum); + JITDUMP("[RangeCheck::GetRange] " FMT_BB " ", block->bbNum); m_pCompiler->gtDispTree(expr); Indent(indent); JITDUMP("{\n", expr); diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp index 3cf3da086cff7..4a6aeb158117a 100644 --- a/src/coreclr/jit/valuenum.cpp +++ b/src/coreclr/jit/valuenum.cpp @@ -9201,7 +9201,7 @@ void Compiler::fgValueNumberSimd(GenTreeSIMD* tree) if (isMemoryLoad) { - // Currently the only SIMD operation with MemoryLoad sematics is SIMDIntrinsicInitArray + // Currently the only SIMD operation with MemoryLoad semantics is SIMDIntrinsicInitArray // and it has to be handled specially since it has an optional op2 // assert(tree->GetSIMDIntrinsicId() == SIMDIntrinsicInitArray);