diff --git a/src/coreclr/jit/block.cpp b/src/coreclr/jit/block.cpp index 4104bd7d0cab30..62add7f24dd3f9 100644 --- a/src/coreclr/jit/block.cpp +++ b/src/coreclr/jit/block.cpp @@ -1662,3 +1662,22 @@ BBswtDesc::BBswtDesc(Compiler* comp, const BBswtDesc* other) bbsDstTab[i] = other->bbsDstTab[i]; } } + +//------------------------------------------------------------------------ +// unmarkLoopAlign: Unmarks the LOOP_ALIGN flag from the block and reduce the +// loop alignment count. +// +// Arguments: +// compiler - Compiler instance +// reason - Reason to print in JITDUMP +// +void BasicBlock::unmarkLoopAlign(Compiler* compiler DEBUG_ARG(const char* reason)) +{ + // Make sure we unmark and count just once. + if (isLoopAlign()) + { + compiler->loopAlignCandidates--; + bbFlags &= ~BBF_LOOP_ALIGN; + JITDUMP("Unmarking LOOP_ALIGN from " FMT_BB ". Reason= %s.", bbNum, reason); + } +} diff --git a/src/coreclr/jit/block.h b/src/coreclr/jit/block.h index ec7918d4aac953..3b8ecf7f0dacfe 100644 --- a/src/coreclr/jit/block.h +++ b/src/coreclr/jit/block.h @@ -552,6 +552,7 @@ enum BasicBlockFlags : unsigned __int64 BBF_PATCHPOINT = MAKE_BBFLAG(36), // Block is a patchpoint BBF_HAS_CLASS_PROFILE = MAKE_BBFLAG(37), // BB contains a call needing a class profile BBF_PARTIAL_COMPILATION_PATCHPOINT = MAKE_BBFLAG(38), // Block is a partial compilation patchpoint + BBF_HAS_ALIGN = MAKE_BBFLAG(39), // BB ends with 'align' instruction // The following are sets of flags. @@ -653,11 +654,19 @@ struct BasicBlock : private LIR::Range { return ((bbFlags & BBF_LOOP_HEAD) != 0); } + bool isLoopAlign() const { return ((bbFlags & BBF_LOOP_ALIGN) != 0); } + void unmarkLoopAlign(Compiler* comp DEBUG_ARG(const char* reason)); + + bool hasAlign() const + { + return ((bbFlags & BBF_HAS_ALIGN) != 0); + } + #ifdef DEBUG void dspFlags(); // Print the flags unsigned dspCheapPreds(); // Print the predecessors (bbCheapPreds) diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp index 840ad2e8c6091e..4f9cfa00752d6b 100644 --- a/src/coreclr/jit/codegenlinear.cpp +++ b/src/coreclr/jit/codegenlinear.cpp @@ -174,6 +174,7 @@ void CodeGen::genCodeForBBlist() for (block = compiler->fgFirstBB; block != nullptr; block = block->bbNext) { + #ifdef DEBUG if (compiler->verbose) { @@ -782,21 +783,29 @@ void CodeGen::genCodeForBBlist() } #if FEATURE_LOOP_ALIGN + if (block->hasAlign()) + { + // If this block has 'align' instruction in the end (identified by BBF_HAS_ALIGN), + // then need to add align instruction in the current "block". + // + // For non-adaptive alignment, add alignment instruction of size depending on the + // compJitAlignLoopBoundary. + // For adaptive alignment, alignment instruction will always be of 15 bytes for xarch + // and 16 bytes for arm64. + assert(ShouldAlignLoops()); - // If next block is the first block of a loop (identified by BBF_LOOP_ALIGN), - // then need to add align instruction in current "block". Also mark the - // corresponding IG with IGF_LOOP_ALIGN to know that there will be align - // instructions at the end of that IG. - // - // For non-adaptive alignment, add alignment instruction of size depending on the - // compJitAlignLoopBoundary. - // For adaptive alignment, alignment instruction will always be of 15 bytes. + GetEmitter()->emitLoopAlignment(DEBUG_ARG1(block->bbJumpKind == BBJ_ALWAYS)); + } if ((block->bbNext != nullptr) && (block->bbNext->isLoopAlign())) { - assert(ShouldAlignLoops()); - - GetEmitter()->emitLoopAlignment(); + if (compiler->opts.compJitHideAlignBehindJmp) + { + // The current IG is the one that is just before the IG having loop start. + // Establish a connection of recent align instruction emitted to the loop + // it actually is aligning using 'idaLoopHeadPredIG'. + GetEmitter()->emitConnectAlignInstrWithCurIG(); + } } #endif diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 6d5745729ecaf0..01ff6587b4c152 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -2547,11 +2547,13 @@ void Compiler::compInitOptions(JitFlags* jitFlags) opts.compJitAlignLoopForJcc = JitConfig.JitAlignLoopForJcc() == 1; opts.compJitAlignLoopMaxCodeSize = (unsigned short)JitConfig.JitAlignLoopMaxCodeSize(); + opts.compJitHideAlignBehindJmp = JitConfig.JitHideAlignBehindJmp() == 1; #else opts.compJitAlignLoopAdaptive = true; opts.compJitAlignLoopBoundary = DEFAULT_ALIGN_LOOP_BOUNDARY; opts.compJitAlignLoopMinBlockWeight = DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT; opts.compJitAlignLoopMaxCodeSize = DEFAULT_MAX_LOOPSIZE_FOR_ALIGN; + opts.compJitHideAlignBehindJmp = true; #endif #ifdef TARGET_XARCH @@ -5152,6 +5154,11 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl fgDebugCheckLinks(); #endif +#if FEATURE_LOOP_ALIGN + // Place loop alignment instructions + DoPhase(this, PHASE_ALIGN_LOOPS, &Compiler::placeLoopAlignInstructions); +#endif + // Generate code codeGen->genGenerateCode(methodCodePtr, methodCodeSize); @@ -5208,6 +5215,82 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl #endif // FUNC_INFO_LOGGING } +#if FEATURE_LOOP_ALIGN + +//------------------------------------------------------------------------ +// placeLoopAlignInstructions: Iterate over all the blocks and determine +// the best position to place the 'align' instruction. Inserting 'align' +// instructions after an unconditional branch is preferred over inserting +// in the block before the loop. In case there are multiple blocks +// having 'jmp', the one that has lower weight is preferred. +// If the block having 'jmp' is hotter than the block before the loop, +// the align will still be placed after 'jmp' because the processor should +// be smart enough to not fetch extra instruction beyond jmp. +// +void Compiler::placeLoopAlignInstructions() +{ + if (loopAlignCandidates == 0) + { + return; + } + + int loopsToProcess = loopAlignCandidates; + + // Add align only if there were any loops that needed alignment + weight_t minBlockSoFar = BB_MAX_WEIGHT; + BasicBlock* bbHavingAlign = nullptr; + for (BasicBlock* const block : Blocks()) + { + if ((block == fgFirstBB) && block->isLoopAlign()) + { + // Adding align instruction in prolog is not supported + // hence skip the align block if it is the first block. + loopsToProcess--; + continue; + } + + // If there is a unconditional jump + if (opts.compJitHideAlignBehindJmp && (block->bbJumpKind == BBJ_ALWAYS)) + { + if (block->bbWeight < minBlockSoFar) + { + minBlockSoFar = block->bbWeight; + bbHavingAlign = block; + JITDUMP(FMT_BB ", bbWeight=" FMT_WT " ends with unconditional 'jmp' \n", block->bbNum, block->bbWeight); + } + } + + if ((block->bbNext != nullptr) && (block->bbNext->isLoopAlign())) + { + // If jmp was not found, then block before the loop start is where align instruction will be added. + if (bbHavingAlign == nullptr) + { + bbHavingAlign = block; + JITDUMP("Marking " FMT_BB " before the loop with BBF_HAS_ALIGN for loop at " FMT_BB "\n", block->bbNum, + block->bbNext->bbNum); + } + else + { + JITDUMP("Marking " FMT_BB " that ends with unconditional jump with BBF_HAS_ALIGN for loop at " FMT_BB + "\n", + bbHavingAlign->bbNum, block->bbNext->bbNum); + } + + bbHavingAlign->bbFlags |= BBF_HAS_ALIGN; + minBlockSoFar = BB_MAX_WEIGHT; + bbHavingAlign = nullptr; + + if (--loopsToProcess == 0) + { + break; + } + } + } + + assert(loopsToProcess == 0); +} +#endif + //------------------------------------------------------------------------ // generatePatchpointInfo: allocate and fill in patchpoint info data, // and report it to the VM diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 51e07b137527be..d99d3affdf5d43 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -3666,6 +3666,7 @@ class Compiler #endif BasicBlock* bbNewBasicBlock(BBjumpKinds jumpKind); + void placeLoopAlignInstructions(); /* XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX @@ -6871,13 +6872,13 @@ class Compiler bool fgHasLoops; // True if this method has any loops, set in fgComputeReachability public: - LoopDsc* optLoopTable; // loop descriptor table - unsigned char optLoopCount; // number of tracked loops + LoopDsc* optLoopTable; // loop descriptor table + unsigned char optLoopCount; // number of tracked loops + unsigned char loopAlignCandidates; // number of loops identified for alignment #ifdef DEBUG - unsigned char loopAlignCandidates; // number of loops identified for alignment - unsigned char loopsAligned; // number of loops actually aligned -#endif // DEBUG + unsigned char loopsAligned; // number of loops actually aligned +#endif // DEBUG bool optRecordLoop(BasicBlock* head, BasicBlock* top, @@ -9687,6 +9688,9 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX // If set, perform adaptive loop alignment that limits number of padding based on loop size. bool compJitAlignLoopAdaptive; + // If set, tries to hide alignment instructions behind unconditional jumps. + bool compJitHideAlignBehindJmp; + #ifdef LATE_DISASM bool doLateDisasm; // Run the late disassembler #endif // LATE_DISASM diff --git a/src/coreclr/jit/compphases.h b/src/coreclr/jit/compphases.h index bb992440f8e3d4..b8805fca753529 100644 --- a/src/coreclr/jit/compphases.h +++ b/src/coreclr/jit/compphases.h @@ -87,6 +87,7 @@ CompPhaseNameMacro(PHASE_INSERT_GC_POLLS, "Insert GC Polls", CompPhaseNameMacro(PHASE_DETERMINE_FIRST_COLD_BLOCK, "Determine first cold block", "COLD-BLK", false, -1, true) CompPhaseNameMacro(PHASE_RATIONALIZE, "Rationalize IR", "RAT", false, -1, false) CompPhaseNameMacro(PHASE_SIMPLE_LOWERING, "Do 'simple' lowering", "SMP-LWR", false, -1, false) +CompPhaseNameMacro(PHASE_ALIGN_LOOPS, "Place 'align' instructions", "LOOP-ALIGN", false, -1, false) CompPhaseNameMacro(PHASE_LCLVARLIVENESS, "Local var liveness", "LIVENESS", true, -1, false) CompPhaseNameMacro(PHASE_LCLVARLIVENESS_INIT, "Local var liveness init", "LIV-INIT", false, PHASE_LCLVARLIVENESS, false) diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 54d57dd22b2f7d..f81de138b8bcb6 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -872,7 +872,7 @@ insGroup* emitter::emitSavIG(bool emitAdd) // Move align instructions to the global list, update their 'next' links do { - // Grab the jump and remove it from the list + // Grab the align and remove it from the list instrDescAlign* oa = emitCurIGAlignList; emitCurIGAlignList = oa->idaNext; @@ -913,6 +913,14 @@ insGroup* emitter::emitSavIG(bool emitAdd) } emitAlignLast = last; + + // Point to the first instruction of most recent + // align instruction(s) added. + // + // Since emitCurIGAlignList is created in inverse of + // program order, the `list` reverses that in forms it + // in correct order. + emitAlignLastGroup = list; } #endif @@ -1071,8 +1079,8 @@ void emitter::emitBegFN(bool hasFramePtr #if FEATURE_LOOP_ALIGN /* We don't have any align instructions */ - emitAlignList = emitAlignLast = nullptr; - emitCurIGAlignList = nullptr; + emitAlignList = emitAlignLastGroup = emitAlignLast = nullptr; + emitCurIGAlignList = nullptr; #endif /* We have not recorded any live sets */ @@ -1401,7 +1409,7 @@ void* emitter::emitAllocAnyInstr(size_t sz, emitAttr opsz) // the prolog/epilog placeholder groups ARE generated in order, and are // re-used. But generating additional groups would not work. if (emitComp->compStressCompile(Compiler::STRESS_EMITTER, 1) && emitCurIGinsCnt && !emitIGisInProlog(emitCurIG) && - !emitIGisInEpilog(emitCurIG) && !emitCurIG->isLoopAlign() + !emitIGisInEpilog(emitCurIG) && !emitCurIG->endsWithAlignInstr() #if defined(FEATURE_EH_FUNCLETS) && !emitIGisInFuncletProlog(emitCurIG) && !emitIGisInFuncletEpilog(emitCurIG) #endif // FEATURE_EH_FUNCLETS @@ -1574,7 +1582,7 @@ void emitter::emitCheckIGoffsets() { if (tempIG->igOffs != currentOffset) { - printf("Block #%u has offset %08X, expected %08X\n", tempIG->igNum, tempIG->igOffs, currentOffset); + printf("IG%02u has offset %08X, expected %08X\n", tempIG->igNum, tempIG->igOffs, currentOffset); assert(!"bad block offset"); } @@ -3555,7 +3563,7 @@ void emitter::emitDispIGflags(unsigned flags) { printf(", extend"); } - if (flags & IGF_LOOP_ALIGN) + if (flags & IGF_HAS_ALIGN) { printf(", align"); } @@ -4804,9 +4812,9 @@ void emitter::emitJumpDistBind() // Arguments: // nAlignInstr - Number of align instructions about to be added. // -void emitter::emitCheckAlignFitInCurIG(unsigned short nAlignInstr) +void emitter::emitCheckAlignFitInCurIG(unsigned nAlignInstr) { - unsigned short instrDescSize = nAlignInstr * sizeof(instrDescAlign); + unsigned instrDescSize = nAlignInstr * sizeof(instrDescAlign); // Ensure that all align instructions fall in same IG. if (emitCurIGfreeNext + instrDescSize >= emitCurIGfreeEndp) @@ -4817,11 +4825,15 @@ void emitter::emitCheckAlignFitInCurIG(unsigned short nAlignInstr) //----------------------------------------------------------------------------- // -// The next instruction will be a loop head entry point -// So insert an alignment instruction here to ensure that -// we can properly align the code. +// emitLoopAlign: The next instruction will be a loop head entry point +// So insert an alignment instruction of "paddingBytes" to ensure that +// the code is properly aligned. +// Arguments: +// paddingBytes - Number of padding bytes to insert. +// isFirstAlign - For multiple 'align' instructions case, if this is the first +// 'align' instruction of that group. // -void emitter::emitLoopAlign(unsigned short paddingBytes) +void emitter::emitLoopAlign(unsigned paddingBytes, bool isFirstAlign DEBUG_ARG(bool isPlacedBehindJmp)) { // Determine if 'align' instruction about to be generated will // fall in current IG or next. @@ -4831,7 +4843,7 @@ void emitter::emitLoopAlign(unsigned short paddingBytes) { // If align fits in current IG, then mark that it contains alignment // instruction in the end. - emitCurIG->igFlags |= IGF_LOOP_ALIGN; + emitCurIG->igFlags |= IGF_HAS_ALIGN; } /* Insert a pseudo-instruction to ensure that we align @@ -4842,12 +4854,12 @@ void emitter::emitLoopAlign(unsigned short paddingBytes) { // Mark this IG has alignment in the end, so during emitter we can check the instruction count // heuristics of all IGs that follows this IG that participate in a loop. - emitCurIG->igFlags |= IGF_LOOP_ALIGN; + emitCurIG->igFlags |= IGF_HAS_ALIGN; } else { // Otherwise, make sure it was already marked such. - assert(emitCurIG->isLoopAlign()); + assert(emitCurIG->endsWithAlignInstr()); } #if defined(TARGET_XARCH) @@ -4859,6 +4871,22 @@ void emitter::emitLoopAlign(unsigned short paddingBytes) id->idaIG = emitCurIG; + if (isFirstAlign) + { + // For multiple align instructions, set the idaLoopHeadPredIG only for the + // first align instruction + id->idaLoopHeadPredIG = emitCurIG; + emitAlignLastGroup = id; + } + else + { + id->idaLoopHeadPredIG = nullptr; + } + +#ifdef DEBUG + id->isPlacedAfterJmp = isPlacedBehindJmp; +#endif + /* Append this instruction to this IG's alignment list */ id->idaNext = emitCurIGAlignList; @@ -4870,25 +4898,28 @@ void emitter::emitLoopAlign(unsigned short paddingBytes) //----------------------------------------------------------------------------- // -// The next instruction will be a loop head entry point +// emitLongLoopAlign: The next instruction will be a loop head entry point // So insert alignment instruction(s) here to ensure that // we can properly align the code. // // This emits more than one `INS_align` instruction depending on the // alignmentBoundary parameter. // -void emitter::emitLongLoopAlign(unsigned short alignmentBoundary) +// Arguments: +// alignmentBoundary - The boundary at which loop needs to be aligned. +// +void emitter::emitLongLoopAlign(unsigned alignmentBoundary DEBUG_ARG(bool isPlacedBehindJmp)) { #if defined(TARGET_XARCH) - unsigned short nPaddingBytes = alignmentBoundary - 1; - unsigned short nAlignInstr = (nPaddingBytes + (MAX_ENCODED_SIZE - 1)) / MAX_ENCODED_SIZE; - unsigned short insAlignCount = nPaddingBytes / MAX_ENCODED_SIZE; - unsigned short lastInsAlignSize = nPaddingBytes % MAX_ENCODED_SIZE; - unsigned short paddingBytes = MAX_ENCODED_SIZE; + unsigned nPaddingBytes = alignmentBoundary - 1; + unsigned nAlignInstr = (nPaddingBytes + (MAX_ENCODED_SIZE - 1)) / MAX_ENCODED_SIZE; + unsigned insAlignCount = nPaddingBytes / MAX_ENCODED_SIZE; + unsigned lastInsAlignSize = nPaddingBytes % MAX_ENCODED_SIZE; + unsigned paddingBytes = MAX_ENCODED_SIZE; #elif defined(TARGET_ARM64) - unsigned short nAlignInstr = alignmentBoundary / INSTR_ENCODED_SIZE; - unsigned short insAlignCount = nAlignInstr; - unsigned short paddingBytes = INSTR_ENCODED_SIZE; + unsigned nAlignInstr = alignmentBoundary / INSTR_ENCODED_SIZE; + unsigned insAlignCount = nAlignInstr; + unsigned paddingBytes = INSTR_ENCODED_SIZE; #endif emitCheckAlignFitInCurIG(nAlignInstr); @@ -4896,25 +4927,49 @@ void emitter::emitLongLoopAlign(unsigned short alignmentBoundary) /* Insert a pseudo-instruction to ensure that we align the next instruction properly */ + bool isFirstAlign = true; while (insAlignCount) { - emitLoopAlign(paddingBytes); + emitLoopAlign(paddingBytes, isFirstAlign DEBUG_ARG(isPlacedBehindJmp)); insAlignCount--; + isFirstAlign = false; } #if defined(TARGET_XARCH) - emitLoopAlign(lastInsAlignSize); + emitLoopAlign(lastInsAlignSize, isFirstAlign DEBUG_ARG(isPlacedBehindJmp)); #endif } +//----------------------------------------------------------------------------- +// emitConnectAlignInstrWithCurIG: If "align" instruction is not just before the loop start, +// setting idaLoopHeadPredIG lets us know the exact IG that the "align" +// instruction is trying to align. This is used to track the last IG that +// needs alignment after which VEX encoding optimization is enabled. +// +// TODO: Once over-estimation problem is solved, consider replacing +// idaLoopHeadPredIG with idaLoopHeadIG itself. +// +void emitter::emitConnectAlignInstrWithCurIG() +{ + JITDUMP("Mapping 'align' instruction in IG%02u to target IG%02u\n", emitAlignLastGroup->idaIG->igNum, + emitCurIG->igNum); + // Since we never align overlapping instructions, it is always guaranteed that + // the emitAlignLastGroup points to the loop that is in process of getting aligned. + + emitAlignLastGroup->idaLoopHeadPredIG = emitCurIG; + + // For a new IG to ensure that loop doesn't start from IG that idaLoopHeadPredIG points to. + emitNxtIG(); +} + //----------------------------------------------------------------------------- // emitLoopAlignment: Insert an align instruction at the end of emitCurIG and -// mark it as IGF_LOOP_ALIGN to indicate that next IG is a -// loop needing alignment. +// mark it as IGF_HAS_ALIGN to indicate that a next or a future +// IG is a loop that needs alignment. // -void emitter::emitLoopAlignment() +void emitter::emitLoopAlignment(DEBUG_ARG1(bool isPlacedBehindJmp)) { - unsigned short paddingBytes; + unsigned paddingBytes; #if defined(TARGET_XARCH) // For xarch, each align instruction can be maximum of MAX_ENCODED_SIZE bytes and if @@ -4922,13 +4977,13 @@ void emitter::emitLoopAlignment() if ((emitComp->opts.compJitAlignLoopBoundary > 16) && (!emitComp->opts.compJitAlignLoopAdaptive)) { paddingBytes = emitComp->opts.compJitAlignLoopBoundary; - emitLongLoopAlign(paddingBytes); + emitLongLoopAlign(paddingBytes DEBUG_ARG(isPlacedBehindJmp)); } else { emitCheckAlignFitInCurIG(1); paddingBytes = MAX_ENCODED_SIZE; - emitLoopAlign(paddingBytes); + emitLoopAlign(paddingBytes, true DEBUG_ARG(isPlacedBehindJmp)); } #elif defined(TARGET_ARM64) // For Arm64, each align instruction is 4-bytes long because of fixed-length encoding. @@ -4941,14 +4996,12 @@ void emitter::emitLoopAlignment() { paddingBytes = emitComp->opts.compJitAlignLoopBoundary; } - emitLongLoopAlign(paddingBytes); + emitLongLoopAlign(paddingBytes DEBUG_ARG(isPlacedBehindJmp)); #endif - JITDUMP("Adding 'align' instruction of %d bytes in %s.\n", paddingBytes, emitLabelString(emitCurIG)); + assert(emitLastIns->idIns() == INS_align); -#ifdef DEBUG - emitComp->loopAlignCandidates++; -#endif // DEBUG + JITDUMP("Adding 'align' instruction of %d bytes in %s.\n", paddingBytes, emitLabelString(emitCurIG)); } //----------------------------------------------------------------------------- @@ -4958,7 +5011,7 @@ void emitter::emitLoopAlignment() // bool emitter::emitEndsWithAlignInstr() { - return emitCurIG->isLoopAlign(); + return emitCurIG->endsWithAlignInstr(); } //----------------------------------------------------------------------------- @@ -4982,9 +5035,13 @@ unsigned emitter::getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize DEBUG for (insGroup* igInLoop = igLoopHeader; igInLoop != nullptr; igInLoop = igInLoop->igNext) { loopSize += igInLoop->igSize; - if (igInLoop->isLoopAlign()) + if (igInLoop->endsWithAlignInstr()) { - // If igInLoop is marked as "IGF_LOOP_ALIGN", the basic block flow detected a loop start. + // If IGF_HAS_ALIGN is present, igInLoop contains align instruction at the end, + // for next IG or some future IG. + // + // For both cases, remove the padding bytes from igInLoop's size so it is not included in loopSize. + // // If the loop was formed because of forward jumps like the loop IG18 below, the backedge is not // set for them and such loops are not aligned. For such cases, the loop size threshold will never // be met and we would break as soon as loopSize > maxLoopSize. @@ -4997,9 +5054,9 @@ unsigned emitter::getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize DEBUG // ... // jne IG05 // - // If igInLoop is a legitimate loop, and igInLoop's next IG is also a loop that needs alignment, - // then igInLoop should be the last IG of the current loop and should have backedge to current - // loop header. + // If igInLoop is a legitimate loop, and igInLoop's end with another 'align' instruction for different IG + // representing a loop that needs alignment, then igInLoop should be the last IG of the current loop and + // should have backedge to current loop header. // // Below, IG05 is the last IG of loop IG04-IG05 and its backedge points to IG04. // @@ -5149,25 +5206,33 @@ void emitter::emitSetLoopBackEdge(BasicBlock* loopTopBlock) bool markedCurrLoop = alignCurrentLoop; while ((alignInstr != nullptr)) { - // Find the IG before current loop and clear the IGF_LOOP_ALIGN flag - if (!alignCurrentLoop && (alignInstr->idaIG->igNext == dstIG)) + insGroup* loopHeadIG = alignInstr->loopHeadIG(); + + // Find the IG that has 'align' instruction to align the current loop + // and clear the IGF_HAS_ALIGN flag. + if (!alignCurrentLoop && (loopHeadIG == dstIG)) { assert(!markedCurrLoop); - alignInstr->idaIG->igFlags &= ~IGF_LOOP_ALIGN; + + // This IG should no longer contain alignment instruction + alignInstr->removeAlignFlags(); + markedCurrLoop = true; JITDUMP("** Skip alignment for current loop IG%02u ~ IG%02u because it encloses an aligned loop " "IG%02u ~ IG%02u.\n", currLoopStart, currLoopEnd, emitLastLoopStart, emitLastLoopEnd); } - // Find the IG before the last loop and clear the IGF_LOOP_ALIGN flag - if (!alignLastLoop && (alignInstr->idaIG->igNext != nullptr) && - (alignInstr->idaIG->igNext->igNum == emitLastLoopStart)) + // Find the IG that has 'align' instruction to align the last loop + // and clear the IGF_HAS_ALIGN flag. + if (!alignLastLoop && (loopHeadIG != nullptr) && (loopHeadIG->igNum == emitLastLoopStart)) { assert(!markedLastLoop); - assert(alignInstr->idaIG->isLoopAlign()); + assert(alignInstr->idaIG->endsWithAlignInstr()); + + // This IG should no longer contain alignment instruction + alignInstr->removeAlignFlags(); - alignInstr->idaIG->igFlags &= ~IGF_LOOP_ALIGN; markedLastLoop = true; JITDUMP("** Skip alignment for aligned loop IG%02u ~ IG%02u because it encloses the current loop " "IG%02u ~ IG%02u.\n", @@ -5179,21 +5244,7 @@ void emitter::emitSetLoopBackEdge(BasicBlock* loopTopBlock) break; } -#if defined(TARGET_XARCH) - if (!emitComp->opts.compJitAlignLoopAdaptive) -#endif - { - // If there are multiple align instructions, skip the align instructions after - // the first align instruction and fast forward to the next IG - insGroup* alignIG = alignInstr->idaIG; - while ((alignInstr != nullptr) && (alignInstr->idaNext != nullptr) && - (alignInstr->idaNext->idaIG == alignIG)) - { - alignInstr = alignInstr->idaNext; - } - } - - alignInstr = alignInstr->idaNext; + alignInstr = emitAlignInNextIG(alignInstr); } assert(markedLastLoop && markedCurrLoop); @@ -5232,21 +5283,32 @@ void emitter::emitLoopAlignAdjustments() unsigned loopIGOffset = 0; instrDescAlign* alignInstr = emitAlignList; - for (; alignInstr != nullptr; alignInstr = alignInstr->idaNext) + for (; alignInstr != nullptr;) { assert(alignInstr->idIns() == INS_align); - insGroup* alignIG = alignInstr->idaIG; + insGroup* loopHeadPredIG = alignInstr->idaLoopHeadPredIG; + insGroup* loopHeadIG = alignInstr->loopHeadIG(); + insGroup* containingIG = alignInstr->idaIG; + + JITDUMP(" Adjusting 'align' instruction in IG%02u that is targeted for IG%02u \n", containingIG->igNum, + loopHeadIG->igNum); - loopIGOffset = alignIG->igOffs + alignIG->igSize; + // Since we only adjust the padding up to the next align instruction which is behind the jump, we make sure + // that we take into account all the alignBytes we removed until that point. Hence " - alignBytesRemoved" + + loopIGOffset = loopHeadIG->igOffs - alignBytesRemoved; // igSize also includes INS_align instruction, take it off. loopIGOffset -= estimatedPaddingNeeded; // IG can be marked as not needing alignment if during setting igLoopBackEdge, it is detected // that the igLoopBackEdge encloses an IG that is marked for alignment. + unsigned actualPaddingNeeded = - alignIG->isLoopAlign() ? emitCalculatePaddingForLoopAlignment(alignIG, loopIGOffset DEBUG_ARG(false)) : 0; + containingIG->endsWithAlignInstr() + ? emitCalculatePaddingForLoopAlignment(loopHeadIG, loopIGOffset DEBUG_ARG(false)) + : 0; assert(estimatedPaddingNeeded >= actualPaddingNeeded); @@ -5254,15 +5316,15 @@ void emitter::emitLoopAlignAdjustments() if (diff != 0) { - alignIG->igSize -= diff; + containingIG->igSize -= diff; alignBytesRemoved += diff; emitTotalCodeSize -= diff; // Update the flags - alignIG->igFlags |= IGF_UPD_ISZ; + containingIG->igFlags |= IGF_UPD_ISZ; if (actualPaddingNeeded == 0) { - alignIG->igFlags &= ~IGF_LOOP_ALIGN; + alignInstr->removeAlignFlags(); } #ifdef TARGET_XARCH @@ -5312,21 +5374,19 @@ void emitter::emitLoopAlignAdjustments() } assert(paddingToAdj == 0); assert(instrAdjusted == 0); - - // fast forward the align instruction to next IG - alignInstr = prevAlignInstr; } - JITDUMP("Adjusted alignment of %s from %u to %u.\n", emitLabelString(alignIG), estimatedPaddingNeeded, + JITDUMP("Adjusted alignment for %s from %u to %u.\n", emitLabelString(loopHeadIG), estimatedPaddingNeeded, actualPaddingNeeded); - JITDUMP("Adjusted size of %s from %u to %u.\n", emitLabelString(alignIG), (alignIG->igSize + diff), - alignIG->igSize); + JITDUMP("Adjusted size of %s from %u to %u.\n", emitLabelString(containingIG), + (containingIG->igSize + diff), containingIG->igSize); } // Adjust the offset of all IGs starting from next IG until we reach the IG having the next // align instruction or the end of IG list. - insGroup* adjOffIG = alignIG->igNext; - insGroup* adjOffUptoIG = alignInstr->idaNext != nullptr ? alignInstr->idaNext->idaIG : emitIGlast; + insGroup* adjOffIG = containingIG->igNext; + instrDescAlign* nextAlign = emitAlignInNextIG(alignInstr); + insGroup* adjOffUptoIG = nextAlign != nullptr ? nextAlign->idaIG : emitIGlast; while ((adjOffIG != nullptr) && (adjOffIG->igNum <= adjOffUptoIG->igNum)) { JITDUMP("Adjusted offset of %s from %04X to %04X\n", emitLabelString(adjOffIG), adjOffIG->igOffs, @@ -5335,11 +5395,14 @@ void emitter::emitLoopAlignAdjustments() adjOffIG = adjOffIG->igNext; } + alignInstr = nextAlign; + if (actualPaddingNeeded > 0) { - // Record the last IG that has align instruction. No overestimation + // Record the last loop IG that will be aligned. No overestimation // adjustment will be done after emitLastAlignedIgNum. - emitLastAlignedIgNum = alignIG->igNum; + JITDUMP("Recording last aligned IG: %s\n", emitLabelString(loopHeadPredIG)); + emitLastAlignedIgNum = loopHeadPredIG->igNum; } } @@ -5353,7 +5416,7 @@ void emitter::emitLoopAlignAdjustments() // end of 'ig' so the loop that starts after 'ig' is aligned. // // Arguments: -// ig - The IG having 'align' instruction in the end. +// loopHeadIG - The IG that has the loop head that need to be aligned. // offset - The offset at which the IG that follows 'ig' starts. // isAlignAdjusted - Determine if adjustments are done to the align instructions or not. // During generating code, it is 'false' (because we haven't adjusted the size yet). @@ -5381,15 +5444,15 @@ void emitter::emitLoopAlignAdjustments() // 3b. If the loop already fits in minimum alignmentBoundary blocks, then return 0. // already best aligned // 3c. return paddingNeeded. // -unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offset DEBUG_ARG(bool isAlignAdjusted)) +unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* loopHeadIG, + size_t offset DEBUG_ARG(bool isAlignAdjusted)) { - assert(ig->isLoopAlign()); unsigned alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary; // No padding if loop is already aligned if ((offset & (alignmentBoundary - 1)) == 0) { - JITDUMP(";; Skip alignment: 'Loop at %s already aligned at %dB boundary.'\n", emitLabelString(ig->igNext), + JITDUMP(";; Skip alignment: 'Loop at %s already aligned at %dB boundary.'\n", emitLabelString(loopHeadIG), alignmentBoundary); return 0; } @@ -5409,12 +5472,12 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs maxLoopSize = emitComp->opts.compJitAlignLoopMaxCodeSize; } - unsigned loopSize = getLoopSize(ig->igNext, maxLoopSize DEBUG_ARG(isAlignAdjusted)); + unsigned loopSize = getLoopSize(loopHeadIG, maxLoopSize DEBUG_ARG(isAlignAdjusted)); // No padding if loop is big if (loopSize > maxLoopSize) { - JITDUMP(";; Skip alignment: 'Loop at %s is big. LoopSize= %d, MaxLoopSize= %d.'\n", emitLabelString(ig->igNext), + JITDUMP(";; Skip alignment: 'Loop at %s is big. LoopSize= %d, MaxLoopSize= %d.'\n", emitLabelString(loopHeadIG), loopSize, maxLoopSize); return 0; } @@ -5451,7 +5514,7 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs { skipPadding = true; JITDUMP(";; Skip alignment: 'Loop at %s already aligned at %uB boundary.'\n", - emitLabelString(ig->igNext), alignmentBoundary); + emitLabelString(loopHeadIG), alignmentBoundary); } // Check if the alignment exceeds new maxPadding limit else if (nPaddingBytes > nMaxPaddingBytes) @@ -5459,7 +5522,7 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs skipPadding = true; JITDUMP(";; Skip alignment: 'Loop at %s PaddingNeeded= %d, MaxPadding= %d, LoopSize= %d, " "AlignmentBoundary= %dB.'\n", - emitLabelString(ig->igNext), nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary); + emitLabelString(loopHeadIG), nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary); } } @@ -5482,7 +5545,7 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs { // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment. JITDUMP(";; Skip alignment: 'Loop at %s is aligned to fit in %d blocks of %d chunks.'\n", - emitLabelString(ig->igNext), minBlocksNeededForLoop, alignmentBoundary); + emitLabelString(loopHeadIG), minBlocksNeededForLoop, alignmentBoundary); } } } @@ -5511,12 +5574,12 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs { // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment. JITDUMP(";; Skip alignment: 'Loop at %s is aligned to fit in %d blocks of %d chunks.'\n", - emitLabelString(ig->igNext), minBlocksNeededForLoop, alignmentBoundary); + emitLabelString(loopHeadIG), minBlocksNeededForLoop, alignmentBoundary); } } JITDUMP(";; Calculated padding to add %d bytes to align %s at %dB boundary.\n", paddingToAdd, - emitLabelString(ig->igNext), alignmentBoundary); + emitLabelString(loopHeadIG), alignmentBoundary); // Either no padding is added because it is too expensive or the offset gets aligned // to the alignment boundary @@ -5525,6 +5588,25 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs return paddingToAdd; } +// emitAlignInNextIG: On xarch, for adaptive alignment, this will usually return the next instruction in +// 'emitAlignList'. But for arm64 or non-adaptive alignment on xarch, where multiple +// align instructions are emitted, this method will skip the 'align' instruction present +// in the same IG and return the first instruction that is present in next IG. +// Arguments: +// alignInstr - Current 'align' instruction for which next IG's first 'align' should be returned. +// +emitter::instrDescAlign* emitter::emitAlignInNextIG(instrDescAlign* alignInstr) +{ + // If there are multiple align instructions, skip the align instructions after + // the first align instruction and fast forward to the next IG + insGroup* alignIG = alignInstr->idaIG; + while ((alignInstr != nullptr) && (alignInstr->idaNext != nullptr) && (alignInstr->idaNext->idaIG == alignIG)) + { + alignInstr = alignInstr->idaNext; + } + return alignInstr != nullptr ? alignInstr->idaNext : nullptr; +} + #endif // FEATURE_LOOP_ALIGN void emitter::emitCheckFuncletBranch(instrDesc* jmp, insGroup* jmpIG) diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 22e9cfd6c79e09..c851df9df80a1a 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -275,8 +275,8 @@ struct insGroup #define IGF_PLACEHOLDER 0x0100 // this is a placeholder group, to be filled in later #define IGF_EXTEND 0x0200 // this block is conceptually an extension of the previous block // and the emitter should continue to track GC info as if there was no new block. -#define IGF_LOOP_ALIGN 0x0400 // this group contains alignment instruction(s) at the end; the next IG is the - // head of a loop that needs alignment. +#define IGF_HAS_ALIGN 0x0400 // this group contains an alignment instruction(s) at the end to align either the next + // IG, or, if this IG contains with an unconditional branch, some subsequent IG. // Mask of IGF_* flags that should be propagated to new blocks when they are created. // This allows prologs and epilogs to be any number of IGs, but still be @@ -349,9 +349,9 @@ struct insGroup return *(unsigned*)ptr; } - bool isLoopAlign() + bool endsWithAlignInstr() const { - return (igFlags & IGF_LOOP_ALIGN) != 0; + return (igFlags & IGF_HAS_ALIGN) != 0; } }; // end of struct insGroup @@ -1383,14 +1383,29 @@ class emitter #if FEATURE_LOOP_ALIGN struct instrDescAlign : instrDesc { - instrDescAlign* idaNext; // next align in the group/method - insGroup* idaIG; // containing group - }; + instrDescAlign* idaNext; // next align in the group/method + insGroup* idaIG; // containing group + insGroup* idaLoopHeadPredIG; // The IG before the loop IG. + // If no 'jmp' instructions were found until idaLoopHeadPredIG, + // then idaLoopHeadPredIG == idaIG. +#ifdef DEBUG + bool isPlacedAfterJmp; // Is the 'align' instruction placed after jmp. Used to decide + // if the instruction cost should be included in PerfScore + // calculation or not. +#endif - void emitCheckAlignFitInCurIG(unsigned short nAlignInstr); - void emitLoopAlign(unsigned short paddingBytes); - void emitLongLoopAlign(unsigned short alignmentBoundary); + inline insGroup* loopHeadIG() + { + assert(idaLoopHeadPredIG); + return idaLoopHeadPredIG->igNext; + } + void removeAlignFlags() + { + idaIG->igFlags &= ~IGF_HAS_ALIGN; + } + }; + void emitCheckAlignFitInCurIG(unsigned nAlignInstr); #endif // FEATURE_LOOP_ALIGN #if !defined(TARGET_ARM64) // This shouldn't be needed for ARM32, either, but I don't want to touch the ARM32 JIT. @@ -1780,15 +1795,26 @@ class emitter unsigned emitLastLoopStart; // Start IG of last inner loop unsigned emitLastLoopEnd; // End IG of last inner loop unsigned emitLastAlignedIgNum; // last IG that has align instruction - instrDescAlign* emitAlignList; // list of local align instructions in method + instrDescAlign* emitAlignList; // list of all align instructions in method instrDescAlign* emitAlignLast; // last align instruction in method + + // Points to the most recent added align instruction. If there are multiple align instructions like in arm64 or + // non-adaptive alignment on xarch, this points to the first align instruction of the series of align instructions. + instrDescAlign* emitAlignLastGroup; + unsigned getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize DEBUG_ARG(bool isAlignAdjusted)); // Get the smallest loop size - void emitLoopAlignment(); + void emitLoopAlignment(DEBUG_ARG1(bool isPlacedBehindJmp)); bool emitEndsWithAlignInstr(); // Validate if newLabel is appropriate void emitSetLoopBackEdge(BasicBlock* loopTopBlock); void emitLoopAlignAdjustments(); // Predict if loop alignment is needed and make appropriate adjustments unsigned emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offset DEBUG_ARG(bool isAlignAdjusted)); + + void emitLoopAlign(unsigned paddingBytes, bool isFirstAlign DEBUG_ARG(bool isPlacedBehindJmp)); + void emitLongLoopAlign(unsigned alignmentBoundary DEBUG_ARG(bool isPlacedBehindJmp)); + instrDescAlign* emitAlignInNextIG(instrDescAlign* alignInstr); + void emitConnectAlignInstrWithCurIG(); + #endif void emitCheckFuncletBranch(instrDesc* jmp, insGroup* jmpIG); // Check for illegal branches between funclets diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index 1ae7aa873a0aef..3de8faafc13c6d 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -11439,13 +11439,13 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) // IG can be marked as not needing alignment after emitting align instruction. // Alternatively, there are fewer align instructions needed than emitted. // If that is the case, skip outputting alignment. - if (!ig->isLoopAlign() || id->idIsEmptyAlign()) + if (!ig->endsWithAlignInstr() || id->idIsEmptyAlign()) { skipIns = true; } #ifdef DEBUG - if (!ig->isLoopAlign()) + if (!ig->endsWithAlignInstr()) { // Validate if the state is correctly updated assert(id->idIsEmptyAlign()); @@ -11453,6 +11453,23 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) #endif sz = sizeof(instrDescAlign); ins = INS_nop; + +#ifdef DEBUG + // Under STRESS_EMITTER, if this is the 'align' before the 'jmp' instruction, + // then add "bkpt" instruction. + instrDescAlign* alignInstr = (instrDescAlign*)id; + + if (emitComp->compStressCompile(Compiler::STRESS_EMITTER, 50) && + (alignInstr->idaIG != alignInstr->idaLoopHeadPredIG) && !skipIns) + { + // There is no good way to squeeze in "bkpt" as well as display it + // in the disassembly because there is no corresponding instrDesc for + // it. As such, leave it as is, the "0xD43E0000" bytecode will be seen + // next to the nop instruction in disasm. + // e.g. D43E0000 align [4 bytes for IG07] + ins = INS_BREAKPOINT; + } +#endif } #endif // FEATURE_LOOP_ALIGN @@ -13330,7 +13347,15 @@ void emitter::emitDispIns( case IF_SN_0A: // SN_0A ................ ................ if (ins == INS_align) { - printf("[%d bytes]", id->idIsEmptyAlign() ? 0 : INSTR_ENCODED_SIZE); + instrDescAlign* alignInstrId = (instrDescAlign*)id; + printf("[%d bytes", id->idIsEmptyAlign() ? 0 : INSTR_ENCODED_SIZE); + + // targetIG is only set for 1st of the series of align instruction + if ((alignInstrId->idaLoopHeadPredIG != nullptr) && (alignInstrId->loopHeadIG() != nullptr)) + { + printf(" for IG%02u", alignInstrId->loopHeadIG()->igNum); + } + printf("]"); } break; @@ -14582,11 +14607,19 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; case IF_SN_0A: // bkpt, brk, nop - if (id->idIsEmptyAlign()) + + if (id->idIns() == INS_align) { - // We're not going to generate any instruction, so it doesn't count for PerfScore. - result.insThroughput = PERFSCORE_THROUGHPUT_ZERO; - result.insLatency = PERFSCORE_LATENCY_ZERO; + if ((id->idInsOpt() == INS_OPTS_NONE) || ((instrDescAlign*)id)->isPlacedAfterJmp) + { + // Either we're not going to generate 'align' instruction, or the 'align' + // instruction is placed immediately after unconditional jmp. + // In both cases, don't count for PerfScore. + + result.insThroughput = PERFSCORE_THROUGHPUT_ZERO; + result.insLatency = PERFSCORE_LATENCY_ZERO; + break; + } } else if (ins == INS_yield) { @@ -14594,11 +14627,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins result.insThroughput = PERFSCORE_THROUGHPUT_140C; result.insLatency = PERFSCORE_LATENCY_140C; } - else - { - result.insThroughput = PERFSCORE_THROUGHPUT_2X; - result.insLatency = PERFSCORE_LATENCY_ZERO; - } + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_ZERO; break; case IF_SI_0B: // dmb, dsb, isb diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 00c403d08768d9..8246f2de377fb2 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -9830,7 +9830,14 @@ void emitter::emitDispIns( #if FEATURE_LOOP_ALIGN if (ins == INS_align) { - printf("[%d bytes]", id->idCodeSize()); + instrDescAlign* alignInstrId = (instrDescAlign*)id; + printf("[%d bytes", alignInstrId->idCodeSize()); + // targetIG is only set for 1st of the series of align instruction + if ((alignInstrId->idaLoopHeadPredIG != nullptr) && (alignInstrId->loopHeadIG() != nullptr)) + { + printf(" for IG%02u", alignInstrId->loopHeadIG()->igNum); + } + printf("]"); } #endif break; @@ -10018,32 +10025,74 @@ static BYTE* emitOutputNOP(BYTE* dstRW, size_t nBytes) // BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst) { + instrDescAlign* alignInstr = (instrDescAlign*)id; + +#ifdef DEBUG + // For cases where 'align' was placed behing a 'jmp' in an IG that does not + // immediately preced the loop IG, we do not know in advance the offset of + // IG having loop. For such cases, skip the padding calculation validation. + bool validatePadding = alignInstr->idaIG == alignInstr->idaLoopHeadPredIG; +#endif + // Candidate for loop alignment assert(codeGen->ShouldAlignLoops()); - assert(ig->isLoopAlign()); + assert(ig->endsWithAlignInstr()); unsigned paddingToAdd = id->idCodeSize(); // Either things are already aligned or align them here. - assert((paddingToAdd == 0) || (((size_t)dst & (emitComp->opts.compJitAlignLoopBoundary - 1)) != 0)); + assert(!validatePadding || (paddingToAdd == 0) || + (((size_t)dst & (emitComp->opts.compJitAlignLoopBoundary - 1)) != 0)); // Padding amount should not exceed the alignment boundary assert(0 <= paddingToAdd && paddingToAdd < emitComp->opts.compJitAlignLoopBoundary); #ifdef DEBUG - unsigned paddingNeeded = emitCalculatePaddingForLoopAlignment(ig, (size_t)dst, true); - - // For non-adaptive, padding size is spread in multiple instructions, so don't bother checking - if (emitComp->opts.compJitAlignLoopAdaptive) + if (validatePadding) { - assert(paddingToAdd == paddingNeeded); + unsigned paddingNeeded = + emitCalculatePaddingForLoopAlignment(((instrDescAlign*)id)->idaIG->igNext, (size_t)dst, true); + + // For non-adaptive, padding size is spread in multiple instructions, so don't bother checking + if (emitComp->opts.compJitAlignLoopAdaptive) + { + assert(paddingToAdd == paddingNeeded); + } } emitComp->loopsAligned++; #endif BYTE* dstRW = dst + writeableOffset; - dstRW = emitOutputNOP(dstRW, paddingToAdd); + +#ifdef DEBUG + // Under STRESS_EMITTER, if this is the 'align' before the 'jmp' instruction, + // then add "int3" instruction. Since int3 takes 1 byte, we would only add + // it if paddingToAdd >= 1 byte. + + if (emitComp->compStressCompile(Compiler::STRESS_EMITTER, 50) && !validatePadding && paddingToAdd >= 1) + { + size_t int3Code = insCodeMR(INS_BREAKPOINT); + // There is no good way to squeeze in "int3" as well as display it + // in the disassembly because there is no corresponding instrDesc for + // it. As such, leave it as is, the "0xCC" bytecode will be seen next + // to the nop instruction in disasm. + // e.g. CC align [1 bytes for IG29] + // + // if (emitComp->opts.disAsm) + //{ + // emitDispInsAddr(dstRW); + + // emitDispInsOffs(0, false); + + // printf(" %-9s ; stress-mode injected interrupt\n", "int3"); + //} + dstRW += emitOutputByte(dstRW, int3Code); + paddingToAdd -= 1; + } +#endif + + dstRW = emitOutputNOP(dstRW, paddingToAdd); return dstRW - writeableOffset; } @@ -13373,7 +13422,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) sz = sizeof(instrDescAlign); // IG can be marked as not needing alignment after emitting align instruction // In such case, skip outputting alignment. - if (ig->isLoopAlign()) + if (ig->endsWithAlignInstr()) { dst = emitOutputAlign(ig, id, dst); } @@ -14823,9 +14872,12 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins { case INS_align: #if FEATURE_LOOP_ALIGN - if (id->idCodeSize() == 0) + if ((id->idCodeSize() == 0) || ((instrDescAlign*)id)->isPlacedAfterJmp) { - // We're not going to generate any instruction, so it doesn't count for PerfScore. + // Either we're not going to generate 'align' instruction, or the 'align' + // instruction is placed immediately after unconditional jmp. + // In both cases, don't count for PerfScore. + result.insThroughput = PERFSCORE_THROUGHPUT_ZERO; result.insLatency = PERFSCORE_LATENCY_ZERO; break; diff --git a/src/coreclr/jit/fgbasic.cpp b/src/coreclr/jit/fgbasic.cpp index 0f4631afd11ad1..04cf8cdf3cec98 100644 --- a/src/coreclr/jit/fgbasic.cpp +++ b/src/coreclr/jit/fgbasic.cpp @@ -4649,6 +4649,7 @@ void Compiler::fgRemoveBlock(BasicBlock* block, bool unreachable) if (block->isLoopAlign()) { + loopAlignCandidates++; succBlock->bbFlags |= BBF_LOOP_ALIGN; JITDUMP("Propagating LOOP_ALIGN flag from " FMT_BB " to " FMT_BB " for loop# %d.", block->bbNum, succBlock->bbNum, block->bbNatLoopNum); @@ -4801,6 +4802,9 @@ void Compiler::fgRemoveBlock(BasicBlock* block, bool unreachable) block->bbFlags |= BBF_REMOVED; } + // If this was marked for alignment, remove it + block->unmarkLoopAlign(this DEBUG_ARG("Removed block")); + if (bPrev != nullptr) { switch (bPrev->bbJumpKind) diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 658ba2a69186aa..82a70f01e7a007 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -67,6 +67,11 @@ CONFIG_INTEGER(JitAlignLoopAdaptive, W("JitAlignLoopAdaptive"), 1) // If set, perform adaptive loop alignment that limits number of padding based on loop size. +CONFIG_INTEGER(JitHideAlignBehindJmp, + W("JitHideAlignBehindJmp"), + 1) // If set, try to hide align instruction (if any) behind an unconditional jump instruction (if any) + // that is present before the loop start. + // Print the alignment boundaries in disassembly. CONFIG_INTEGER(JitDasmWithAlignmentBoundaries, W("JitDasmWithAlignmentBoundaries"), 0) diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 4c18c7f9188222..3840aec9208268 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -15320,11 +15320,8 @@ bool Compiler::fgFoldConditional(BasicBlock* block) * Remove the loop from the table */ optLoopTable[loopNum].lpFlags |= LPFLG_REMOVED; -#if FEATURE_LOOP_ALIGN - optLoopTable[loopNum].lpTop->bbFlags &= ~BBF_LOOP_ALIGN; - JITDUMP("Removing LOOP_ALIGN flag from bogus loop in " FMT_BB "\n", - optLoopTable[loopNum].lpTop->bbNum); -#endif + + optLoopTable[loopNum].lpTop->unmarkLoopAlign(this DEBUG_ARG("Bogus loop")); #ifdef DEBUG if (verbose) diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index e3f0218bd7ba8b..7378561146d561 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -20,16 +20,16 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX void Compiler::optInit() { - optLoopsMarked = false; - fgHasLoops = false; + optLoopsMarked = false; + fgHasLoops = false; + loopAlignCandidates = 0; /* Initialize the # of tracked loops to 0 */ optLoopCount = 0; optLoopTable = nullptr; #ifdef DEBUG - loopAlignCandidates = 0; - loopsAligned = 0; + loopsAligned = 0; #endif /* Keep track of the number of calls and indirect calls made by this method */ @@ -379,14 +379,7 @@ void Compiler::optUnmarkLoopBlocks(BasicBlock* begBlk, BasicBlock* endBlk) JITDUMP("\n"); -#if FEATURE_LOOP_ALIGN - if (begBlk->isLoopAlign()) - { - // Clear the loop alignment bit on the head of a loop, since it's no longer a loop. - begBlk->bbFlags &= ~BBF_LOOP_ALIGN; - JITDUMP("Removing LOOP_ALIGN flag from removed loop in " FMT_BB "\n", begBlk->bbNum); - } -#endif + begBlk->unmarkLoopAlign(this DEBUG_ARG("Removed loop")); } /***************************************************************************************************** @@ -2516,9 +2509,15 @@ void Compiler::optIdentifyLoopsForAlignment() weight_t topWeight = top->getBBWeight(this); if (topWeight >= (opts.compJitAlignLoopMinBlockWeight * BB_UNITY_WEIGHT)) { - top->bbFlags |= BBF_LOOP_ALIGN; - JITDUMP(FMT_LP " that starts at " FMT_BB " needs alignment, weight=" FMT_WT ".\n", loopInd, - top->bbNum, topWeight); + // Sometimes with JitOptRepeat > 1, we might end up finding the loops twice. In such + // cases, make sure to count them just once. + if (!top->isLoopAlign()) + { + loopAlignCandidates++; + top->bbFlags |= BBF_LOOP_ALIGN; + JITDUMP(FMT_LP " that starts at " FMT_BB " needs alignment, weight=" FMT_WT ".\n", loopInd, + top->bbNum, top->getBBWeight(this)); + } } else { @@ -3779,11 +3778,7 @@ PhaseStatus Compiler::optUnrollLoops() #if FEATURE_LOOP_ALIGN for (block = head->bbNext;; block = block->bbNext) { - if (block->isLoopAlign()) - { - block->bbFlags &= ~BBF_LOOP_ALIGN; - JITDUMP("Removing LOOP_ALIGN flag from unrolled loop in " FMT_BB "\n", block->bbNum); - } + block->unmarkLoopAlign(this DEBUG_ARG("Unrolled loop")); if (block == bottom) { @@ -7617,9 +7612,8 @@ void Compiler::AddContainsCallAllContainingLoops(unsigned lnum) if (optLoopTable[lnum].lpChild == BasicBlock::NOT_IN_LOOP) { BasicBlock* top = optLoopTable[lnum].lpTop; - top->bbFlags &= ~BBF_LOOP_ALIGN; - JITDUMP("Removing LOOP_ALIGN flag for " FMT_LP " that starts at " FMT_BB " because loop has a call.\n", lnum, - top->bbNum); + + top->unmarkLoopAlign(this DEBUG_ARG("Loop with call")); } #endif