From 16c48d6a45d4d706c327fc840e1a5d450fb189ae Mon Sep 17 00:00:00 2001
From: Kunal Pathak
Date: Mon, 11 Jan 2021 20:54:18 -0800
Subject: [PATCH] Align inner loops (#44370)

* Detect inner loop and add 10 bytes of padding at the beginning
* generate nop in previous blocks
* TODO: figure out if anything needs to be done in optCanonicalizeLoop
* Add COMPlus_JitAlignLoopMinBlockWeight and COMPlus_JitAlignLoopMaxCodeSize
  - Add 2 variables to control which loops get aligned
  - Moved padding after the conditional/unconditional jump of previous block
* Reuse AlignLoops flag for dynamic loop alignment
* Detect back edge and count no. of instructions before doing loop alignment
* fix bugs
* propagate the basic block flag
* Switch from instrCount to codeSize
* JitAlignLoopWith32BPadding
* Add emitLoopAlign32Bytes()
* wip
* Add logic to avoid emitting nop if not needed
* fix a condition
* Several things:
  - Replaced JitAlignLoopWith32BPadding with JitAlignLoopBoundary
  - Added JitAlignLoopForJcc
  - Added logging of boundary and point where instruction splitting happens
  - Add logic to take into consideration JCC.
* Added JitAlignLoopAdaptive algorithm
* wip
* revert emitarm64.cpp changes
* fix errors during merge
* fix build errors
* refactoring and cleanup
* refactoring and build errors fix
* jit format
* one more build error
* Add emitLoopAlignAdjustments()
* Update emitLoopAlignAdjustments to just include loopSize calc
* Remove #ifdef ADAPTIVE_LOOP_ALIGNMENT
* Code cleanup
* minor fixes
* Fix issues:
  - Make sure all `align` instructions for non-adaptive fall under same IG
  - Convert some variables to `unsigned short`
  - Fixed the maxPadding amount for adaptive alignment calculation
* Other fixes
* Remove align_loops flag from coreclr
* Review feedback
  - Do not align loop if it has call
  - Created `emitSetLoopBackEdge()` to isolate `emitCurIG` inside emitter class
  - Created `emitOutputAlign()` to move the align instruction output logic
  - Renamed emitVariableLoopAlign() to emitLongLoopAlign()
  - Created `optIdentifyLoopsForAlignment()` to identify loops that need alignment
  - Added comments at various places
* jit format
* Add FEATURE_LOOP_ALIGN
* remove special case for align
* Do not propagate BBF_LOOP_ALIGN in certain cases
* Introduce instrDescAlign and emitLastAlignedIgNum
* Several changes:
  - Compute the accurate padding size before outputting the align instruction
  - During outputting, just double check that the padding needed matches what was calculated.
  - If at any time instruction sizes are over-estimated before the last align instruction, then compensate for them by adding NOPs.
  - As part of the above step, do not perform the "VEX prefix shortening" encoding optimization if there is an align instruction in the future.
  - Fix edge cases where, because of loop cloning or the resolution phase of the register allocator, the loops are marked such that they cover loops that are already marked for alignment. Fix by resetting their IGF_LOOP_ALIGN flag.
  - During loop size calculation, if the last IG also has the `align` flag, do not take the align instruction's size into account because it is reserved for the next loop.
* jit format
* fix issue related to needLabel
* align memory correctly in superpmi
* Few more fixes:
  - emitOffsAdj takes into account any mis-prediction of jump sizes. If we compensate for that mis-prediction, back off that adjustment.
  - Record the lastAlignIG only for valid non-zero align instructions
* minor JITDUMP messages
* Review comments
* missing check
* Mark the last align IG as the one that has non-zero padding
* More review comments
* Propagate BBF_LOOP_ALIGN for compacting blocks
* Handle ALIGN_LOOP flag for loops that are unrolled
* jit format
* Loop size up to last back-edge instead of first back-edge
* Take loop weight into consideration
* remove align flag if loop is no longer valid
* Adjust loop block weight to 4 instead of 8
* missing space after rebase
* fix the enum values after rebase
* review feedback
* Add missing #ifdef DEBUG
---
 .../ToolBox/superpmi/superpmi/icorjitinfo.cpp |  16 +-
 src/coreclr/inc/clrconfigvalues.h             |   1 -
 src/coreclr/inc/corjitflags.h                 |  50 +-
 src/coreclr/inc/jiteeversionguid.h            |  10 +-
 src/coreclr/jit/block.cpp                     |   4 +
 src/coreclr/jit/block.h                       |   5 +
 src/coreclr/jit/codegencommon.cpp             |  14 +-
 src/coreclr/jit/codegenlinear.cpp             |  61 +-
 src/coreclr/jit/compiler.cpp                  |  39 +-
 src/coreclr/jit/compiler.h                    |  39 ++
 src/coreclr/jit/emit.cpp                      | 588 +++++++++++++++++-
 src/coreclr/jit/emit.h                        |  58 +-
 src/coreclr/jit/emitxarch.cpp                 | 200 +++++-
 src/coreclr/jit/emitxarch.h                   |   5 +-
 src/coreclr/jit/flowgraph.cpp                 |  26 +-
 src/coreclr/jit/jit.h                         |   4 +
 src/coreclr/jit/jitconfigvalues.h             |  27 +
 src/coreclr/jit/jitee.h                       |  51 +-
 src/coreclr/jit/morph.cpp                     |   6 +
 src/coreclr/jit/optimizer.cpp                 |  76 ++-
 .../tools/Common/JitInterface/CorInfoTypes.cs |   6 +-
 src/coreclr/vm/eeconfig.cpp                   |   2 -
 src/coreclr/vm/eeconfig.h                     |   2 -
 src/coreclr/vm/jitinterface.cpp               |   2 -
 24 files changed, 1157 insertions(+), 135 deletions(-)

diff --git a/src/coreclr/ToolBox/superpmi/superpmi/icorjitinfo.cpp b/src/coreclr/ToolBox/superpmi/superpmi/icorjitinfo.cpp
index 78b0a46426954..691f9973ce262 100644
--- a/src/coreclr/ToolBox/superpmi/superpmi/icorjitinfo.cpp
+++ b/src/coreclr/ToolBox/superpmi/superpmi/icorjitinfo.cpp
@@ -1609,7 +1609,21 @@ void MyICJI::allocMem(ULONG hotCodeSize, /* IN */
     jitInstance->mc->cr->AddCall("allocMem");

     // TODO-Cleanup: Could hot block size be ever 0?
-    *hotCodeBlock = jitInstance->mc->cr->allocateMemory(hotCodeSize);
+    size_t codeAlignment      = sizeof(void*);
+    size_t hotCodeAlignedSize = static_cast<size_t>(hotCodeSize);
+
+    if ((flag & CORJIT_ALLOCMEM_FLG_32BYTE_ALIGN) != 0)
+    {
+        codeAlignment = 32;
+    }
+    else if ((flag & CORJIT_ALLOCMEM_FLG_16BYTE_ALIGN) != 0)
+    {
+        codeAlignment = 16;
+    }
+    hotCodeAlignedSize = ALIGN_UP_SPMI(hotCodeAlignedSize, codeAlignment);
+    hotCodeAlignedSize = hotCodeAlignedSize + (codeAlignment - sizeof(void*));
+    *hotCodeBlock      = jitInstance->mc->cr->allocateMemory(hotCodeAlignedSize);
+    *hotCodeBlock      = ALIGN_UP_SPMI(*hotCodeBlock, codeAlignment);

     if (coldCodeSize > 0)
         *coldCodeBlock = jitInstance->mc->cr->allocateMemory(coldCodeSize);

diff --git a/src/coreclr/inc/clrconfigvalues.h b/src/coreclr/inc/clrconfigvalues.h
index fb0d859f8db8b..6ddd274ac9fce 100644
--- a/src/coreclr/inc/clrconfigvalues.h
+++ b/src/coreclr/inc/clrconfigvalues.h
@@ -302,7 +302,6 @@ RETAIL_CONFIG_DWORD_INFO_EX(EXTERNAL_UseIBCFile, W("UseIBCFile"), 0, "", CLRConf
 ///
 /// JIT
 ///
-RETAIL_CONFIG_DWORD_INFO_DIRECT_ACCESS(UNSUPPORTED_JitAlignLoops, W("JitAlignLoops"), "Aligns loop targets to 8 byte boundaries")
 CONFIG_DWORD_INFO_EX(INTERNAL_JitBreakEmit, W("JitBreakEmit"), (DWORD)-1, "", CLRConfig::EEConfig_default)
 CONFIG_DWORD_INFO_DIRECT_ACCESS(INTERNAL_JitDebuggable, W("JitDebuggable"), "")
 #if !defined(DEBUG) && !defined(_DEBUG)

diff --git a/src/coreclr/inc/corjitflags.h b/src/coreclr/inc/corjitflags.h
index 83cbc20be8863..5cea8a224c609 100644
--- a/src/coreclr/inc/corjitflags.h
+++ b/src/coreclr/inc/corjitflags.h
@@ -79,45 +79,45 @@ class CORJIT_FLAGS
     CORJIT_FLAG_BBINSTR = 29, // Collect basic block profile information
     CORJIT_FLAG_BBOPT = 30, // Optimize method based on profile information
     CORJIT_FLAG_FRAMED = 31, // All methods have an EBP frame
-    CORJIT_FLAG_ALIGN_LOOPS = 32, // add NOPs before loops to align them at 16 byte boundaries
+    CORJIT_FLAG_UNUSED12 = 32,
     CORJIT_FLAG_PUBLISH_SECRET_PARAM = 33, // JIT must place stub secret param into local 0.
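[Note on the allocMem change] The hunk above over-allocates and then rounds the returned pointer up so the hot code block lands on the requested boundary. A minimal standalone C++ sketch of that pattern follows; AlignUp, the sizes, and malloc are illustrative stand-ins for ALIGN_UP_SPMI and the SuperPMI allocator, not the actual SuperPMI code:

    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>

    // Round value up to the next multiple of alignment (alignment must be a power of two).
    static uintptr_t AlignUp(uintptr_t value, uintptr_t alignment)
    {
        return (value + alignment - 1) & ~(alignment - 1);
    }

    int main()
    {
        const size_t hotCodeSize   = 100; // hypothetical code size
        const size_t codeAlignment = 32;  // requested 32B alignment

        // Over-allocate: after rounding the size up, add slack so that an aligned
        // pointer inside the block still leaves hotCodeSize usable bytes. The
        // allocator is assumed to return at least pointer-aligned memory, hence
        // the (codeAlignment - sizeof(void*)) term.
        size_t allocSize = (size_t)AlignUp(hotCodeSize, codeAlignment) + (codeAlignment - sizeof(void*));

        void* raw     = malloc(allocSize);
        void* aligned = (void*)AlignUp((uintptr_t)raw, codeAlignment);

        printf("raw=%p aligned=%p is32BAligned=%d\n", raw, aligned, (int)(((uintptr_t)aligned & 31) == 0));
        free(raw);
        return 0;
    }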
(used by IL stubs) - CORJIT_FLAG_UNUSED12 = 34, + CORJIT_FLAG_UNUSED13 = 34, CORJIT_FLAG_SAMPLING_JIT_BACKGROUND = 35, // JIT is being invoked as a result of stack sampling for hot methods in the background CORJIT_FLAG_USE_PINVOKE_HELPERS = 36, // The JIT should use the PINVOKE_{BEGIN,END} helpers instead of emitting inline transitions CORJIT_FLAG_REVERSE_PINVOKE = 37, // The JIT should insert REVERSE_PINVOKE_{ENTER,EXIT} helpers into method prolog/epilog - CORJIT_FLAG_UNUSED13 = 38, + CORJIT_FLAG_UNUSED14 = 38, CORJIT_FLAG_TIER0 = 39, // This is the initial tier for tiered compilation which should generate code as quickly as possible CORJIT_FLAG_TIER1 = 40, // This is the final tier (for now) for tiered compilation which should generate high quality code #if defined(TARGET_ARM) CORJIT_FLAG_RELATIVE_CODE_RELOCS = 41, // JIT should generate PC-relative address computations instead of EE relocation records #else // !defined(TARGET_ARM) - CORJIT_FLAG_UNUSED14 = 41, + CORJIT_FLAG_UNUSED15 = 41, #endif // !defined(TARGET_ARM) CORJIT_FLAG_NO_INLINING = 42, // JIT should not inline any called method into this method - CORJIT_FLAG_UNUSED15 = 43, - CORJIT_FLAG_UNUSED16 = 44, - CORJIT_FLAG_UNUSED17 = 45, - CORJIT_FLAG_UNUSED18 = 46, - CORJIT_FLAG_UNUSED19 = 47, - CORJIT_FLAG_UNUSED20 = 48, - CORJIT_FLAG_UNUSED21 = 49, - CORJIT_FLAG_UNUSED22 = 50, - CORJIT_FLAG_UNUSED23 = 51, - CORJIT_FLAG_UNUSED24 = 52, - CORJIT_FLAG_UNUSED25 = 53, - CORJIT_FLAG_UNUSED26 = 54, - CORJIT_FLAG_UNUSED27 = 55, - CORJIT_FLAG_UNUSED28 = 56, - CORJIT_FLAG_UNUSED29 = 57, - CORJIT_FLAG_UNUSED30 = 58, - CORJIT_FLAG_UNUSED31 = 59, - CORJIT_FLAG_UNUSED32 = 60, - CORJIT_FLAG_UNUSED33 = 61, - CORJIT_FLAG_UNUSED34 = 62, - CORJIT_FLAG_UNUSED35 = 63 + CORJIT_FLAG_UNUSED16 = 43, + CORJIT_FLAG_UNUSED17 = 44, + CORJIT_FLAG_UNUSED18 = 45, + CORJIT_FLAG_UNUSED19 = 46, + CORJIT_FLAG_UNUSED20 = 47, + CORJIT_FLAG_UNUSED21 = 48, + CORJIT_FLAG_UNUSED22 = 49, + CORJIT_FLAG_UNUSED23 = 50, + CORJIT_FLAG_UNUSED24 = 51, + CORJIT_FLAG_UNUSED25 = 52, + CORJIT_FLAG_UNUSED26 = 53, + CORJIT_FLAG_UNUSED27 = 54, + CORJIT_FLAG_UNUSED28 = 55, + CORJIT_FLAG_UNUSED29 = 56, + CORJIT_FLAG_UNUSED30 = 57, + CORJIT_FLAG_UNUSED31 = 58, + CORJIT_FLAG_UNUSED32 = 59, + CORJIT_FLAG_UNUSED33 = 60, + CORJIT_FLAG_UNUSED34 = 61, + CORJIT_FLAG_UNUSED35 = 62, + CORJIT_FLAG_UNUSED36 = 63 }; CORJIT_FLAGS() diff --git a/src/coreclr/inc/jiteeversionguid.h b/src/coreclr/inc/jiteeversionguid.h index 6ee29b5a00fae..e67969b5222d5 100644 --- a/src/coreclr/inc/jiteeversionguid.h +++ b/src/coreclr/inc/jiteeversionguid.h @@ -31,11 +31,11 @@ // ////////////////////////////////////////////////////////////////////////////////////////////////////////// -constexpr GUID JITEEVersionIdentifier = { /* 8e32c24d-62fe-4d78-ae73-eedddb928ee2 */ - 0x8e32c24d, - 0x62fe, - 0x4d78, - {0xae, 0x73, 0xee, 0xdd, 0xdb, 0x92, 0x8e, 0xe2} +constexpr GUID JITEEVersionIdentifier = { /* de81f48e-7701-45f2-a91b-1914f88dfd11 */ + 0xde81f48e, + 0x7701, + 0x45f2, + {0xa9, 0x1b, 0x19, 0x14, 0xf8, 0x8d, 0xfd, 0x11} }; ////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/coreclr/jit/block.cpp b/src/coreclr/jit/block.cpp index f2b14599335f5..6cea8dd2c367a 100644 --- a/src/coreclr/jit/block.cpp +++ b/src/coreclr/jit/block.cpp @@ -505,6 +505,10 @@ void BasicBlock::dspFlags() { printf("cfe "); } + if (bbFlags & BBF_LOOP_ALIGN) + { + printf("align "); + } } /***************************************************************************** diff --git 
a/src/coreclr/jit/block.h b/src/coreclr/jit/block.h index 02c37361e831c..d92f5b2c3550c 100644 --- a/src/coreclr/jit/block.h +++ b/src/coreclr/jit/block.h @@ -448,6 +448,7 @@ struct BasicBlock : private LIR::Range #define BBF_PATCHPOINT MAKE_BBFLAG(36) // Block is a patchpoint #define BBF_HAS_CLASS_PROFILE MAKE_BBFLAG(37) // BB contains a call needing a class profile +#define BBF_LOOP_ALIGN MAKE_BBFLAG(39) // Block is lexically the first block in a loop we intend to align. // clang-format on @@ -463,6 +464,10 @@ struct BasicBlock : private LIR::Range { return ((bbFlags & BBF_LOOP_HEAD) != 0); } + bool isLoopAlign() const + { + return ((bbFlags & BBF_LOOP_ALIGN) != 0); + } // Flags to update when two blocks are compacted diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index d6eebc9d41615..8c4572dcec43f 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -2258,6 +2258,12 @@ void CodeGen::genGenerateMachineCode() GetEmitter()->emitJumpDistBind(); +#if FEATURE_LOOP_ALIGN + /* Perform alignment adjustments */ + + GetEmitter()->emitLoopAlignAdjustments(); +#endif + /* The code is now complete and final; it should not change after this. */ } @@ -2345,10 +2351,12 @@ void CodeGen::genEmitMachineCode() #ifdef DEBUG if (compiler->opts.disAsm || verbose) { - printf("\n; Total bytes of code %d, prolog size %d, PerfScore %.2f, instruction count %d (MethodHash=%08x) for " + printf("\n; Total bytes of code %d, prolog size %d, PerfScore %.2f, instruction count %d, allocated bytes for " + "code %d (MethodHash=%08x) for " "method %s\n", - codeSize, prologSize, compiler->info.compPerfScore, instrCount, compiler->info.compMethodHash(), - compiler->info.compFullName); + codeSize, prologSize, compiler->info.compPerfScore, instrCount, + GetEmitter()->emitTotalHotCodeSize + GetEmitter()->emitTotalColdCodeSize, + compiler->info.compMethodHash(), compiler->info.compFullName); printf("; ============================================================\n\n"); printf(""); // in our logic this causes a flush } diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp index bf8d1ce087adf..215e3c04f75b5 100644 --- a/src/coreclr/jit/codegenlinear.cpp +++ b/src/coreclr/jit/codegenlinear.cpp @@ -311,13 +311,6 @@ void CodeGen::genCodeForBBlist() genUpdateCurrentFunclet(block); -#ifdef TARGET_XARCH - if (ShouldAlignLoops() && block->bbFlags & BBF_LOOP_HEAD) - { - GetEmitter()->emitLoopAlign(); - } -#endif - genLogLabel(block); // Tell everyone which basic block we're working on @@ -356,6 +349,14 @@ void CodeGen::genCodeForBBlist() needLabel = true; } +#if FEATURE_LOOP_ALIGN + if (GetEmitter()->emitEndsWithAlignInstr()) + { + // we had better be planning on starting a new IG + assert(needLabel); + } +#endif + if (needLabel) { // Mark a label and update the current set of live GC refs @@ -667,10 +668,6 @@ void CodeGen::genCodeForBBlist() switch (block->bbJumpKind) { - case BBJ_ALWAYS: - inst_JMP(EJ_jmp, block->bbJumpDest); - break; - case BBJ_RETURN: genExitCode(block); break; @@ -741,15 +738,55 @@ void CodeGen::genCodeForBBlist() #endif // !FEATURE_EH_FUNCLETS case BBJ_NONE: - case BBJ_COND: case BBJ_SWITCH: break; + case BBJ_ALWAYS: + inst_JMP(EJ_jmp, block->bbJumpDest); + FALLTHROUGH; + + case BBJ_COND: + +#if FEATURE_LOOP_ALIGN + // This is the last place where we operate on blocks and after this, we operate + // on IG. 
Hence, if we know that the destination of "block" is the first block
+                // of a loop and needs alignment (it has BBF_LOOP_ALIGN), then "block" represents
+                // the end of the loop. Propagate that information on the IG through "igLoopBackEdge".
+                //
+                // During emission, this information will be used to calculate the loop size.
+                // Depending on the loop size, the decision of whether or not to align the loop will be made.
+
+                if (block->bbJumpDest->isLoopAlign())
+                {
+                    GetEmitter()->emitSetLoopBackEdge(block->bbJumpDest);
+                }
+#endif
+                break;
+
             default:
                 noway_assert(!"Unexpected bbJumpKind");
                 break;
         }

+#if FEATURE_LOOP_ALIGN
+
+        // If the next block is the first block of a loop (identified by BBF_LOOP_ALIGN),
+        // then we need to add an align instruction in the current "block". Also mark the
+        // corresponding IG with IGF_LOOP_ALIGN to record that there will be align
+        // instructions at the end of that IG.
+        //
+        // For non-adaptive alignment, add an alignment instruction whose size depends on
+        // compJitAlignLoopBoundary.
+        // For adaptive alignment, the alignment instruction will always be 15 bytes.
+
+        if ((block->bbNext != nullptr) && (block->bbNext->isLoopAlign()))
+        {
+            assert(ShouldAlignLoops());
+
+            GetEmitter()->emitLoopAlignment();
+        }
+#endif
+
 #if defined(DEBUG) && defined(USING_VARIABLE_LIVE_RANGE)
         if (compiler->verbose)
         {

diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index b25afff5a6fda..7f53629f25496 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -2308,7 +2308,7 @@ void Compiler::compSetProcessor()
     opts.compUseCMOV = jitFlags.IsSet(JitFlags::JIT_FLAG_USE_CMOV);
 #ifdef DEBUG
     if (opts.compUseCMOV)
-        opts.compUseCMOV = !compStressCompile(STRESS_USE_CMOV, 50);
+        opts.compUseCMOV = !compStressCompile(STRESS_USE_CMOV, 50);
 #endif // DEBUG
 #endif // TARGET_X86

@@ -2615,6 +2615,29 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
     opts.compDbgInfo = jitFlags->IsSet(JitFlags::JIT_FLAG_DEBUG_INFO);
     opts.compDbgEnC  = jitFlags->IsSet(JitFlags::JIT_FLAG_DEBUG_EnC);

+#ifdef DEBUG
+    opts.compJitAlignLoopAdaptive       = JitConfig.JitAlignLoopAdaptive() == 1;
+    opts.compJitAlignLoopBoundary       = (unsigned short)JitConfig.JitAlignLoopBoundary();
+    opts.compJitAlignLoopMinBlockWeight = (unsigned short)JitConfig.JitAlignLoopMinBlockWeight();
+
+    opts.compJitAlignLoopForJcc      = JitConfig.JitAlignLoopForJcc() == 1;
+    opts.compJitAlignLoopMaxCodeSize = (unsigned short)JitConfig.JitAlignLoopMaxCodeSize();
+#else
+    opts.compJitAlignLoopAdaptive       = true;
+    opts.compJitAlignLoopBoundary       = DEFAULT_ALIGN_LOOP_BOUNDARY;
+    opts.compJitAlignLoopMinBlockWeight = DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT;
+#endif
+    if (opts.compJitAlignLoopAdaptive)
+    {
+        opts.compJitAlignPaddingLimit = (opts.compJitAlignLoopBoundary >> 1) - 1;
+    }
+    else
+    {
+        opts.compJitAlignPaddingLimit = opts.compJitAlignLoopBoundary - 1;
+    }
+
+    assert(isPow2(opts.compJitAlignLoopBoundary));
+
 #if REGEN_SHORTCUTS || REGEN_CALLPAT
     // We never want to have debugging enabled when regenerating GC encoding patterns
     opts.compDbgCode = false;

@@ -3913,19 +3936,17 @@ void Compiler::compSetOptimizationLevel()
     codeGen->setFrameRequired(true);
 #endif

-    if (opts.jitFlags->IsSet(JitFlags::JIT_FLAG_RELOC))
+    if (opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT))
     {
-        codeGen->SetAlignLoops(false); // loop alignment not supported for prejitted code
-
-        // The zapper doesn't set JitFlags::JIT_FLAG_ALIGN_LOOPS, and there is
-        // no reason for it to set it as the JIT doesn't currently support loop alignment
-        // for prejitted images. (The JIT doesn't know the final address of the code, hence
+        // The JIT doesn't currently support loop alignment for prejitted images.
+        // (The JIT doesn't know the final address of the code, hence
         // it can't align code based on unknown addresses.)
-        assert(!opts.jitFlags->IsSet(JitFlags::JIT_FLAG_ALIGN_LOOPS));
+
+        codeGen->SetAlignLoops(false); // loop alignment not supported for prejitted code
     }
     else
     {
-        codeGen->SetAlignLoops(opts.jitFlags->IsSet(JitFlags::JIT_FLAG_ALIGN_LOOPS));
+        codeGen->SetAlignLoops(JitConfig.JitAlignLoops() == 1);
     }
 }

diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 1cdebfb9c3c8a..9af31fdf03a07 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -6367,6 +6367,8 @@ class Compiler
     void optFindNaturalLoops();

+    void optIdentifyLoopsForAlignment();
+
     // Ensures that all the loops in the loop nest rooted at "loopInd" (an index into the loop table) are 'canonical' --
     // each loop has a unique "top." Returns "true" iff the flowgraph has been modified.
     bool optCanonicalizeLoopNest(unsigned char loopInd);

@@ -9036,6 +9038,43 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
     bool dspGCtbls; // Display the GC tables
 #endif

+// Default numbers used to perform loop alignment. All the numbers are chosen
+// based on experimenting with various benchmarks.
+
+// Default minimum loop block weight required to enable loop alignment.
+#define DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT 4
+
+// By default a loop will be aligned at a 32B address boundary to get better
+// performance as per architecture manuals.
+#define DEFAULT_ALIGN_LOOP_BOUNDARY 0x20
+
+// For non-adaptive loop alignment, by default, only align a loop whose size is
+// at most 3 times the alignment block size. If the loop is bigger than that, it is most
+// likely complicated enough that loop alignment will not impact performance.
+#define DEFAULT_MAX_LOOPSIZE_FOR_ALIGN DEFAULT_ALIGN_LOOP_BOUNDARY * 3
+
+#ifdef DEBUG
+    // Loop alignment variables
+
+    // If set, for non-adaptive alignment, ensure loop jmps are not on or crossing the alignment boundary.
+    bool compJitAlignLoopForJcc;
+#endif
+    // For non-adaptive alignment, maximum loop size (in bytes) for which alignment will be done.
+    unsigned short compJitAlignLoopMaxCodeSize;
+
+    // Minimum weight needed for the first block of a loop to make it a candidate for alignment.
+    unsigned short compJitAlignLoopMinBlockWeight;
+
+    // For non-adaptive alignment, address boundary (power of 2) at which loop alignment should
+    // be done. By default, 32B.
+    unsigned short compJitAlignLoopBoundary;
+
+    // Padding limit to align a loop.
+    unsigned short compJitAlignPaddingLimit;
+
+    // If set, perform adaptive loop alignment that limits the amount of padding based on the loop size.
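[Note on the padding budget] The compInitOptions change above derives compJitAlignPaddingLimit from the boundary: half the boundary minus one for adaptive alignment, and one byte short of the full boundary otherwise. A small self-contained sketch of that arithmetic (names and values are illustrative):

    #include <cassert>
    #include <cstdio>
    #include <initializer_list>

    int main()
    {
        for (unsigned boundary : {16u, 32u, 64u})
        {
            assert((boundary & (boundary - 1)) == 0); // boundary must be a power of two

            unsigned adaptiveLimit    = (boundary >> 1) - 1; // adaptive: half the boundary, minus one
            unsigned nonAdaptiveLimit = boundary - 1;        // non-adaptive: anything short of a full boundary

            printf("boundary=%2uB adaptivePaddingLimit=%2u nonAdaptivePaddingLimit=%2u\n",
                   boundary, adaptiveLimit, nonAdaptiveLimit);
        }
        return 0;
    }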
+ bool compJitAlignLoopAdaptive; + #ifdef LATE_DISASM bool doLateDisasm; // Run the late disassembler #endif // LATE_DISASM diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 3056c71e6a093..b42111611504d 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -160,6 +160,8 @@ unsigned emitter::emitSmallCnsCnt; unsigned emitter::emitLargeCnsCnt; unsigned emitter::emitSmallCns[SMALL_CNS_TSZ]; +unsigned emitter::emitTotalDescAlignCnt; + void emitterStaticStats(FILE* fout) { // insGroup members @@ -387,6 +389,9 @@ void emitterStats(FILE* fout) fprintf(fout, "Total instrDescReloc: %8u (%5.2f%%)\n", emitter::emitTotalIDescRelocCnt, 100.0 * emitter::emitTotalIDescRelocCnt / emitter::emitTotalInsCnt); #endif // TARGET_ARM + fprintf(fout, "Total emitTotalDescAlignCnt: %8u (%5.2f%%)\n", emitter::emitTotalDescAlignCnt, + 100.0 * emitter::emitTotalDescAlignCnt / emitter::emitTotalInsCnt); + fprintf(fout, "\n"); } @@ -636,6 +641,10 @@ void emitter::emitGenIG(insGroup* ig) assert(emitCurIGjmpList == nullptr); +#if FEATURE_LOOP_ALIGN + assert(emitCurIGAlignList == nullptr); +#endif + /* Allocate the temp instruction buffer if we haven't done so */ if (emitCurIGfreeBase == nullptr) @@ -822,6 +831,60 @@ insGroup* emitter::emitSavIG(bool emitAdd) } #endif +#if FEATURE_LOOP_ALIGN + // Did we have any align instructions in this group? + if (emitCurIGAlignList) + { + instrDescAlign* list = nullptr; + instrDescAlign* last = nullptr; + + // Move align instructions to the global list, update their 'next' links + do + { + // Grab the jump and remove it from the list + + instrDescAlign* oa = emitCurIGAlignList; + emitCurIGAlignList = oa->idaNext; + + // Figure out the address of where the align got copied + + size_t of = (BYTE*)oa - emitCurIGfreeBase; + instrDescAlign* na = (instrDescAlign*)(ig->igData + of); + + assert(na->idaIG == ig); + assert(na->idIns() == oa->idIns()); + assert(na->idaNext == oa->idaNext); + assert(na->idIns() == INS_align); + + na->idaNext = list; + list = na; + + if (last == nullptr) + { + last = na; + } + } while (emitCurIGAlignList); + + // Should have at least one align instruction + assert(last); + + if (emitAlignList == nullptr) + { + assert(emitAlignLast == nullptr); + + last->idaNext = emitAlignList; + emitAlignList = list; + } + else + { + last->idaNext = nullptr; + emitAlignLast->idaNext = list; + } + + emitAlignLast = last; + } + +#endif // Did we have any jumps in this group? 
if (emitCurIGjmpList) @@ -933,6 +996,12 @@ void emitter::emitBegFN(bool hasFramePtr emitCurIGfreeBase = nullptr; emitIGbuffSize = 0; +#if FEATURE_LOOP_ALIGN + emitLastAlignedIgNum = 0; + emitLastInnerLoopStartIgNum = 0; + emitLastInnerLoopEndIgNum = 0; +#endif + /* Record stack frame info (the temp size is just an estimate) */ emitHasFramePtr = hasFramePtr; @@ -968,6 +1037,13 @@ void emitter::emitBegFN(bool hasFramePtr emitNoGCIG = false; emitForceNewIG = false; +#if FEATURE_LOOP_ALIGN + /* We don't have any align instructions */ + + emitAlignList = emitAlignLast = nullptr; + emitCurIGAlignList = nullptr; +#endif + /* We have not recorded any live sets */ assert(VarSetOps::IsEmpty(emitComp, emitThisGCrefVars)); @@ -3613,6 +3689,10 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp) { size_t is; +#ifdef DEBUG + size_t beforeAddr = (size_t)*dp; +#endif + /* Record the beginning offset of the instruction */ BYTE* curInsAdr = *dp; @@ -3647,17 +3727,23 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp) /* Did the size of the instruction match our expectations? */ - UNATIVE_OFFSET csz = (UNATIVE_OFFSET)(*dp - curInsAdr); + UNATIVE_OFFSET actualSize = (UNATIVE_OFFSET)(*dp - curInsAdr); - if (csz != id->idCodeSize()) + unsigned estimatedSize = id->idCodeSize(); + if (actualSize != estimatedSize) { - /* It is fatal to under-estimate the instruction size */ - noway_assert(id->idCodeSize() >= csz); + // It is fatal to under-estimate the instruction size, except for alignment instructions + noway_assert(estimatedSize >= actualSize); + +#if FEATURE_LOOP_ALIGN + // Should never over-estimate align instruction or any instruction before the last align instruction of a method + assert(id->idIns() != INS_align && emitCurIG->igNum > emitLastAlignedIgNum); +#endif #if DEBUG_EMIT if (EMITVERBOSE) { - printf("Instruction predicted size = %u, actual = %u\n", id->idCodeSize(), csz); + printf("Instruction predicted size = %u, actual = %u\n", estimatedSize, actualSize); } #endif // DEBUG_EMIT @@ -3665,7 +3751,7 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp) ig->igFlags |= IGF_UPD_ISZ; #if defined(TARGET_XARCH) - id->idCodeSize(csz); + id->idCodeSize(actualSize); #elif defined(TARGET_ARM) // This is done as part of emitSetShortJump(); // insSize isz = emitInsSize(id->idInsFmt()); @@ -3684,6 +3770,51 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp) id->idDebugOnlyInfo()->idNum, is, emitSizeOfInsDsc(id)); assert(is == emitSizeOfInsDsc(id)); } + + // Print the alignment boundary + if ((emitComp->opts.disAsm || emitComp->verbose) && emitComp->opts.disAddr) + { + size_t currAddr = (size_t)*dp; + size_t lastBoundaryAddr = currAddr & ~((size_t)emitComp->opts.compJitAlignLoopBoundary - 1); + + // draw boundary if beforeAddr was before the lastBoundary. 
+        if (beforeAddr < lastBoundaryAddr)
+        {
+            printf("; ");
+            instruction currIns = id->idIns();
+
+#if defined(TARGET_XARCH)
+
+            // https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf
+            bool isJccAffectedIns =
+                ((currIns >= INS_i_jmp && currIns < INS_align) || (currIns == INS_call) || (currIns == INS_ret));
+
+            instrDesc* nextId = id;
+            castto(nextId, BYTE*) += is;
+            instruction nextIns = nextId->idIns();
+            if ((currIns == INS_cmp) || (currIns == INS_test) || (currIns == INS_add) || (currIns == INS_sub) ||
+                (currIns == INS_and) || (currIns == INS_inc) || (currIns == INS_dec))
+            {
+                isJccAffectedIns |= (nextIns >= INS_i_jmp && nextIns < INS_align);
+            }
+#else
+            bool isJccAffectedIns = false;
+#endif
+
+            // Indicate if the instruction is at a 32B boundary or is split by one
+            unsigned bytesCrossedBoundary = (currAddr & (emitComp->opts.compJitAlignLoopBoundary - 1));
+            if ((bytesCrossedBoundary != 0) || (isJccAffectedIns && bytesCrossedBoundary == 0))
+            {
+                printf("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (%s: %d)", codeGen->genInsName(id->idIns()),
+                       bytesCrossedBoundary);
+            }
+            else
+            {
+                printf("...............................");
+            }
+            printf(" %dB boundary ...............................\n", (emitComp->opts.compJitAlignLoopBoundary));
+        }
+    }
 #endif

     return is;
 }

@@ -4479,6 +4610,428 @@ void emitter::emitJumpDistBind()
 #endif // DEBUG
 }

+#if FEATURE_LOOP_ALIGN
+
+//-----------------------------------------------------------------------------
+// emitLoopAlignment: Insert an align instruction at the end of emitCurIG and
+//                    mark it as IGF_LOOP_ALIGN to indicate that the next IG is a
+//                    loop needing alignment.
+//
+void emitter::emitLoopAlignment()
+{
+    if ((emitComp->opts.compJitAlignLoopBoundary > 16) && (!emitComp->opts.compJitAlignLoopAdaptive))
+    {
+        emitLongLoopAlign(emitComp->opts.compJitAlignLoopBoundary);
+    }
+    else
+    {
+        emitLoopAlign();
+    }
+
+    // Mark this IG as needing alignment so that during emission we can check the instruction
+    // count heuristics of all IGs that follow this IG and participate in a loop.
+    emitCurIG->igFlags |= IGF_LOOP_ALIGN;
+
+    JITDUMP("Adding 'align' instruction of %d bytes in G_M%03u_IG%02u.\n", emitComp->opts.compJitAlignLoopBoundary,
+            emitComp->compMethodID, emitCurIG->igNum);
+}
+
+//-----------------------------------------------------------------------------
+// emitEndsWithAlignInstr: Checks if the current IG ends with a loop align instruction.
+//
+// Returns: true if the current IG ends with an align instruction.
+//
+bool emitter::emitEndsWithAlignInstr()
+{
+    return emitCurIG->isLoopAlign();
+}
+
+//-----------------------------------------------------------------------------
+// getLoopSize: Starting from loopHeaderIg, find the size of the smallest possible loop
+//              such that it doesn't exceed the maxLoopSize.
+//
+// Arguments:
+//      igLoopHeader - The header IG of a loop
+//      maxLoopSize  - Maximum loop size. If the loop is bigger than this value, we will just
+//                     return this value.
+//
+// Returns: size of a loop in bytes.
+//
+unsigned emitter::getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize)
+{
+    unsigned loopSize = 0;
+
+    for (insGroup* igInLoop = igLoopHeader; igInLoop != nullptr; igInLoop = igInLoop->igNext)
+    {
+        loopSize += igInLoop->igSize;
+        if (igInLoop->isLoopAlign())
+        {
+            // If igInLoop's next IG is a loop and needs alignment, then igInLoop should be the last IG
+            // of the current loop and should have a backedge to the current loop header.
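[Note on the boundary annotation] The disassembly dump above flags instructions that land on or straddle an alignment boundary. The same mask arithmetic in isolation (a simplified model, not the JIT's exact helper):

    #include <cstdint>
    #include <cstdio>

    // Return how many bytes of an instruction occupying [start, start+size)
    // fall past the last 'boundary'-aligned address it touches; 0 means it
    // does not cross that boundary.
    static unsigned bytesPastBoundary(uintptr_t start, unsigned size, unsigned boundary)
    {
        uintptr_t end          = start + size;
        uintptr_t lastBoundary = end & ~((uintptr_t)boundary - 1);
        return (start < lastBoundary) ? (unsigned)(end - lastBoundary) : 0;
    }

    int main()
    {
        // A 5-byte instruction starting at 0x1E crosses the 0x20 boundary by 3 bytes.
        printf("crossed by %u bytes\n", bytesPastBoundary(0x1E, 5, 32));
        return 0;
    }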
+ assert(igInLoop->igLoopBackEdge == igLoopHeader); + + // In such cases, the current loop size should exclude the align instruction size reserved for + // next loop. + loopSize -= emitComp->opts.compJitAlignPaddingLimit; + } + if ((igInLoop->igLoopBackEdge == igLoopHeader) || (loopSize > maxLoopSize)) + { + break; + } + } + + return loopSize; +} + +//----------------------------------------------------------------------------- +// emitSetLoopBackEdge : Sets igLoopBackEdge field, if not already set and +// if currIG has back-edge to dstIG. +// +// Notes: +// If the current loop encloses a loop that is already marked as align, then remove +// the alignment flag present on IG before dstIG. +// +void emitter::emitSetLoopBackEdge(BasicBlock* loopTopBlock) +{ + insGroup* dstIG = (insGroup*)loopTopBlock->bbEmitCookie; + + // With (dstIG != nullptr), ensure that only back edges are tracked. + // If there is forward jump, dstIG is not yet generated. + // + // We don't rely on (block->bbJumpDest->bbNum <= block->bbNum) because the basic + // block numbering is not guaranteed to be sequential. + + if ((dstIG != nullptr) && (dstIG->igNum <= emitCurIG->igNum)) + { + unsigned currLoopStart = dstIG->igNum; + unsigned currLoopEnd = emitCurIG->igNum; + + // Only mark back-edge if current loop starts after the last inner loop ended. + if (emitLastInnerLoopEndIgNum < currLoopStart) + { + emitCurIG->igLoopBackEdge = dstIG; + + JITDUMP("** IG%02u jumps back to IG%02u forming a loop.\n", currLoopEnd, currLoopStart); + + emitLastInnerLoopStartIgNum = currLoopStart; + emitLastInnerLoopEndIgNum = currLoopEnd; + } + // Otherwise, mark the dstIG->prevIG as no alignment needed. + // + // Note: If current loop's back-edge target is same as emitLastInnerLoopStartIgNum, + // retain the alignment flag of dstIG->prevIG so the loop + // (emitLastInnerLoopStartIgNum ~ emitLastInnerLoopEndIgNum) is still aligned. + else if (emitLastInnerLoopStartIgNum != currLoopStart) + { + // Find the IG before dstIG... + instrDescAlign* alignInstr = emitAlignList; + while ((alignInstr != nullptr) && (alignInstr->idaIG->igNext != dstIG)) + { + alignInstr = alignInstr->idaNext; + } + + // ...and clear the IGF_LOOP_ALIGN flag + if (alignInstr != nullptr) + { + assert(alignInstr->idaIG->igNext == dstIG); + alignInstr->idaIG->igFlags &= ~IGF_LOOP_ALIGN; + } + + JITDUMP( + "** Skip alignment for loop IG%02u ~ IG%02u, because it encloses an aligned loop IG%02u ~ IG%02u.\n", + currLoopStart, currLoopEnd, emitLastInnerLoopStartIgNum, emitLastInnerLoopEndIgNum); + } + } +} + +//----------------------------------------------------------------------------- +// emitLoopAlignAdjustments: Walk all the align instructions and update them +// with actual padding needed. + +// Notes: +// For IGs that have align instructions in the end, calculate the actual offset +// of loop start and determine how much padding is needed. Based on that, update +// the igOffs, igSize and emitTotalCodeSize. 
+// +void emitter::emitLoopAlignAdjustments() +{ + // no align instructions + if (emitAlignList == nullptr) + { + return; + } + + JITDUMP("*************** In emitLoopAlignAdjustments()\n"); + + unsigned short estimatedPaddingNeeded = emitComp->opts.compJitAlignPaddingLimit; + unsigned short alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary; + + if (emitComp->opts.compJitAlignLoopAdaptive) + { + // For adaptive, adjust the loop size depending on the alignment boundary + int maxBlocksAllowedForLoop = genLog2((unsigned)alignmentBoundary) - 1; + } + + unsigned alignBytesRemoved = 0; + unsigned loopSize = 0; + unsigned loopIGOffset = 0; + instrDescAlign* alignInstr = emitAlignList; + + for (; alignInstr != nullptr; alignInstr = alignInstr->idaNext) + { + assert(alignInstr->idIns() == INS_align); + + insGroup* alignIG = alignInstr->idaIG; + + loopIGOffset = alignIG->igOffs + alignIG->igSize; + + // igSize also includes INS_align instruction, take it off. + loopIGOffset -= estimatedPaddingNeeded; + + // IG can be marked as not needing alignment if during setting igLoopBackEdge, it is detected + // that the igLoopBackEdge encloses an IG that is marked for alignment. + unsigned actualPaddingNeeded = + alignIG->isLoopAlign() ? emitCalculatePaddingForLoopAlignment(alignIG, loopIGOffset DEBUG_ARG(false)) : 0; + + assert(estimatedPaddingNeeded >= actualPaddingNeeded); + + unsigned short diff = (unsigned short)(estimatedPaddingNeeded - actualPaddingNeeded); + + if (diff != 0) + { + alignIG->igSize -= diff; + alignBytesRemoved += diff; + emitTotalCodeSize -= diff; + + // Update the flags + alignIG->igFlags |= IGF_UPD_ISZ; + if (actualPaddingNeeded == 0) + { + alignIG->igFlags &= ~IGF_LOOP_ALIGN; + } + + if (emitComp->opts.compJitAlignLoopAdaptive) + { + assert(actualPaddingNeeded < MAX_ENCODED_SIZE); + alignInstr->idCodeSize(actualPaddingNeeded); + } + else + { + unsigned paddingToAdj = actualPaddingNeeded; + +#ifdef DEBUG + + int instrAdjusted = (alignmentBoundary + (MAX_ENCODED_SIZE - 1)) / MAX_ENCODED_SIZE; +#endif + // Adjust the padding amount in all align instructions in this IG + instrDescAlign *alignInstrToAdj = alignInstr, *prevAlignInstr = nullptr; + for (; alignInstrToAdj != nullptr && alignInstrToAdj->idaIG == alignInstr->idaIG; + alignInstrToAdj = alignInstrToAdj->idaNext) + { + unsigned newPadding = min(paddingToAdj, MAX_ENCODED_SIZE); + alignInstrToAdj->idCodeSize(newPadding); + paddingToAdj -= newPadding; + prevAlignInstr = alignInstrToAdj; +#ifdef DEBUG + instrAdjusted--; +#endif + } + assert(paddingToAdj == 0); + assert(instrAdjusted == 0); + + // fast forward the align instruction to next IG + alignInstr = prevAlignInstr; + } + + JITDUMP("Adjusted alignment of G_M%03u_IG%02u from %02d to %02d\n", emitComp->compMethodID, alignIG->igNum, + estimatedPaddingNeeded, actualPaddingNeeded); + } + + // Adjust the offset of all IGs starting from next IG until we reach the IG having the next + // align instruction or the end of IG list. + insGroup* adjOffIG = alignIG->igNext; + insGroup* adjOffUptoIG = alignInstr->idaNext != nullptr ? alignInstr->idaNext->idaIG : emitIGlast; + while ((adjOffIG != nullptr) && (adjOffIG->igNum <= adjOffUptoIG->igNum)) + { + adjOffIG->igOffs -= alignBytesRemoved; + adjOffIG = adjOffIG->igNext; + } + + if (actualPaddingNeeded > 0) + { + // Record the last IG that has align instruction. No overestimation + // adjustment will be done after emitLastAlignedIgNum. 
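[Note on the offset fixup] Once the actual padding is known, every instruction group after the shrunk align instruction must slide back by the difference, which is what the loop above does with igOffs and igSize. A toy model of that bookkeeping (names and numbers are illustrative):

    #include <cstdio>
    #include <vector>

    struct Group { unsigned offs; unsigned size; };

    int main()
    {
        // Second group ends with an align instruction estimated at 31 bytes.
        std::vector<Group> igs = {{0, 40}, {40, 31}, {71, 50}};
        const unsigned estimatedPadding = 31;
        const unsigned actualPadding    = 9; // hypothetical: computed once offsets are final
        const unsigned diff             = estimatedPadding - actualPadding;

        igs[1].size -= diff;          // shrink the IG holding the align instruction(s)
        for (size_t i = 2; i < igs.size(); i++)
        {
            igs[i].offs -= diff;      // slide every later IG back
        }

        for (const Group& g : igs)
            printf("offs=%3u size=%3u\n", g.offs, g.size);
        return 0;
    }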
+            emitLastAlignedIgNum = alignIG->igNum;
+        }
+    }
+
+#ifdef DEBUG
+    emitCheckIGoffsets();
+#endif
+}
+
+//-----------------------------------------------------------------------------
+// emitCalculatePaddingForLoopAlignment: Calculate the padding to insert at the
+//    end of 'ig' so the loop that starts after 'ig' is aligned.
+//
+// Returns: Padding amount.
+//    0 means no padding is needed, either because the loop is already aligned or because
+//    aligning it would be too expensive, in which case it will not be aligned.
+//
+// Notes:
+//     Below are the steps (in this order) to calculate the padding amount.
+//     1. If the loop is already aligned to the desired boundary, then return 0. // already aligned
+//     2. If the loop size exceeds the maximum allowed loop size, then return 0. // loop is too big to align
+//
+//     For adaptive loop alignment:
+//     3a. Calculate paddingNeeded and maxPaddingAmount to align to a 32B boundary.
+//     3b. If paddingNeeded > maxPaddingAmount, then recalculate to align to a 16B boundary.
+//     3c. If paddingNeeded == 0, then return 0. // already aligned at 16B
+//     3d. If paddingNeeded > maxPaddingAmount, then return 0. // expensive to align
+//     3e. If the loop already fits in the minimum number of 32B blocks, then return 0. // already best aligned
+//     3f. return paddingNeeded.
+//
+//     For non-adaptive loop alignment:
+//     3a. Calculate paddingNeeded.
+//     3b. If the loop already fits in the minimum number of alignmentBoundary blocks, then return 0. // already best aligned
+//     3c. return paddingNeeded.
+//
+unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig,
+                                                       size_t offset DEBUG_ARG(bool displayAlignmentDetails))
+{
+    assert(ig->isLoopAlign());
+    unsigned alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary;
+
+    // No padding if the loop is already aligned
+    if ((offset & (alignmentBoundary - 1)) == 0)
+    {
+        JITDUMP(";; Skip alignment: 'Loop already aligned at %dB boundary.'\n", alignmentBoundary);
+        return 0;
+    }
+
+    unsigned maxLoopSize          = 0;
+    int      maxLoopBlocksAllowed = 0;
+
+    if (emitComp->opts.compJitAlignLoopAdaptive)
+    {
+        // For adaptive, adjust the loop size depending on the alignment boundary
+        maxLoopBlocksAllowed = genLog2((unsigned)alignmentBoundary) - 1;
+        maxLoopSize          = alignmentBoundary * maxLoopBlocksAllowed;
+    }
+    else
+    {
+        // For non-adaptive, just take whatever is supplied using COMPlus_ variables
+        maxLoopSize = emitComp->opts.compJitAlignLoopMaxCodeSize;
+    }
+
+    unsigned loopSize = getLoopSize(ig->igNext, maxLoopSize);
+
+    // No padding if the loop is big
+    if (loopSize > maxLoopSize)
+    {
+        JITDUMP(";; Skip alignment: 'Loop is big. LoopSize= %d, MaxLoopSize= %d.'\n", loopSize, maxLoopSize);
+        return 0;
+    }
+
+    unsigned paddingToAdd           = 0;
+    unsigned minBlocksNeededForLoop = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
+    bool     skipPadding            = false;
+
+    if (emitComp->opts.compJitAlignLoopAdaptive)
+    {
+        // adaptive loop alignment
+        unsigned nMaxPaddingBytes = (1 << (maxLoopBlocksAllowed - minBlocksNeededForLoop + 1)) - 1;
+        unsigned nPaddingBytes    = (-(int)(size_t)offset) & (alignmentBoundary - 1);
+
+        // Check if the alignment exceeds the maxPadding limit
+        if (nPaddingBytes > nMaxPaddingBytes)
+        {
+            // Cannot align to 32B, so try to align to a 16B boundary.
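[Note on the adaptive budget] For the scheme documented above, with a 32B boundary the padding budget shrinks as the loop grows: a loop that fits in one 32B block may get up to 15 bytes of padding, two blocks 7 bytes, three blocks 3 bytes, and four blocks 1 byte. A worked sketch of that formula (illustrative, mirroring the steps above):

    #include <cstdio>
    #include <initializer_list>

    int main()
    {
        const unsigned boundary             = 32;
        const unsigned maxLoopBlocksAllowed = 4; // genLog2(32) - 1

        for (unsigned loopSize : {30u, 60u, 90u, 120u})
        {
            unsigned minBlocksNeeded = (loopSize + boundary - 1) / boundary;
            unsigned maxPaddingBytes = (1u << (maxLoopBlocksAllowed - minBlocksNeeded + 1)) - 1;
            printf("loopSize=%3u minBlocks=%u maxPadding=%2u\n", loopSize, minBlocksNeeded, maxPaddingBytes);
        }
        return 0;
    }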
+ alignmentBoundary >>= 1; + nMaxPaddingBytes = 1 << (maxLoopBlocksAllowed - minBlocksNeededForLoop + 1); + nPaddingBytes = (-(int)(size_t)offset) & (alignmentBoundary - 1); + + // Check if the loop is already at new alignment boundary + if (nPaddingBytes == 0) + { + skipPadding = true; + JITDUMP(";; Skip alignment: 'Loop already aligned at 16B boundary.'\n"); + } + // Check if the alignment exceeds new maxPadding limit + else if (nPaddingBytes > nMaxPaddingBytes) + { + skipPadding = true; + JITDUMP(";; Skip alignment: 'PaddingNeeded= %d, MaxPadding= %d, LoopSize= %d, " + "AlignmentBoundary= %dB.'\n", + nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary); + } + } + + // If within maxPaddingLimit + if (!skipPadding) + { + // Padding is needed only if loop starts at or after the current offset. + // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment. + size_t extraBytesNotInLoop = + (size_t)(emitComp->opts.compJitAlignLoopBoundary * minBlocksNeededForLoop) - loopSize; + size_t currentOffset = (size_t)offset % alignmentBoundary; + + if (currentOffset > extraBytesNotInLoop) + { + // Padding is needed only if loop starts at or after the current offset and hence might not + // fit in minBlocksNeededForLoop + paddingToAdd = nPaddingBytes; + } + else + { + // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment. + JITDUMP(";; Skip alignment: 'Loop is aligned to fit in %d blocks of %d chunks.'\n", + minBlocksNeededForLoop, alignmentBoundary); + } + } + } + else + { + // non-adaptive loop alignment + unsigned extraBytesNotInLoop = (alignmentBoundary * minBlocksNeededForLoop) - loopSize; + unsigned currentOffset = (size_t)offset % alignmentBoundary; + +#ifdef DEBUG + // Mitigate JCC erratum by making sure the jmp doesn't fall on the boundary + if (emitComp->opts.compJitAlignLoopForJcc) + { + // TODO: See if extra padding we might end up adding to mitigate JCC erratum is worth doing? + currentOffset++; + } +#endif + + if (currentOffset > extraBytesNotInLoop) + { + // Padding is needed only if loop starts at or after the current offset and hence might not + // fit in minBlocksNeededForLoop + paddingToAdd = (-(int)(size_t)offset) & (alignmentBoundary - 1); + } + else + { + // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment. 
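[Note on the "already fits" test] The skip condition above can be modeled in isolation: if the loop occupies minBlocksNeededForLoop boundary-sized blocks, padding is skipped whenever the current offset within the boundary does not exceed the slack left over in those blocks. A small sketch with hypothetical numbers:

    #include <cstdio>
    #include <initializer_list>

    int main()
    {
        const unsigned boundary        = 32;
        const unsigned loopSize        = 50;                                    // hypothetical loop size
        const unsigned minBlocksNeeded = (loopSize + boundary - 1) / boundary;  // 2 blocks
        const unsigned slack           = minBlocksNeeded * boundary - loopSize; // 14 bytes not used by the loop

        for (unsigned offsetInBoundary : {8u, 20u})
        {
            bool skip = (offsetInBoundary <= slack); // loop still fits in 2 blocks without padding
            printf("offsetInBoundary=%2u slack=%2u -> %s\n", offsetInBoundary, slack,
                   skip ? "fits, skip padding" : "pad up to the next boundary");
        }
        return 0;
    }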
+ JITDUMP(";; Skip alignment: 'Loop is aligned to fit in %d blocks of %d chunks.'\n", minBlocksNeededForLoop, + alignmentBoundary); + } + } + + JITDUMP(";; Calculated padding to add %d bytes to align at %dB boundary that starts at 0x%x.'\n", paddingToAdd, + alignmentBoundary, offset); + + // Either no padding is added because it is too expensive or the offset gets aligned + // to the alignment boundary + assert(paddingToAdd == 0 || (((offset + paddingToAdd) & (alignmentBoundary - 1)) == 0)); + + return paddingToAdd; +} + +#endif + void emitter::emitCheckFuncletBranch(instrDesc* jmp, insGroup* jmpIG) { #ifdef DEBUG @@ -4841,6 +5394,13 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, (void**)&codeBlock, (void**)&coldCodeBlock, (void**)&consBlock); #endif +#ifdef DEBUG + if ((allocMemFlag & CORJIT_ALLOCMEM_FLG_32BYTE_ALIGN) != 0) + { + assert(((size_t)codeBlock & 31) == 0); + } +#endif + // if (emitConsDsc.dsdOffs) // printf("Cons=%08X\n", consBlock); @@ -5374,14 +5934,10 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, { printf("\n"); } - - if (emitComp->verbose) - { - printf("Allocated method code size = %4u , actual size = %4u\n", emitTotalCodeSize, cp - codeBlock); - } #endif unsigned actualCodeSize = emitCurCodeOffs(cp); + assert(emitTotalCodeSize >= actualCodeSize); #if EMITTER_STATS totAllocdSize += emitTotalCodeSize; @@ -5391,7 +5947,11 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, // Fill in eventual unused space, but do not report this space as used. // If you add this padding during the emitIGlist loop, then it will // emit offsets after the loop with wrong value (for example for GC ref variables). - unsigned unusedSize = emitTotalCodeSize - emitCurCodeOffs(cp); + unsigned unusedSize = emitTotalCodeSize - actualCodeSize; + + JITDUMP("Allocated method code size = %4u , actual size = %4u, unused size = %4u\n", emitTotalCodeSize, + actualCodeSize, unusedSize); + for (unsigned i = 0; i < unusedSize; ++i) { *cp++ = DEFAULT_CODE_BUFFER_INIT; @@ -7215,6 +7775,10 @@ void emitter::emitInitIG(insGroup* ig) ig->igSize = 0; ig->igGCregs = RBM_NONE; ig->igInsCnt = 0; + +#if FEATURE_LOOP_ALIGN + ig->igLoopBackEdge = nullptr; +#endif } /***************************************************************************** diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 0942a2df4ad93..8030cc4b0fb16 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -251,6 +251,10 @@ struct insGroup unsigned short igFlags; // see IGF_xxx below unsigned short igSize; // # of bytes of code in this group +#if FEATURE_LOOP_ALIGN + insGroup* igLoopBackEdge; // "last" back-edge that branches back to an aligned loop head. +#endif + #define IGF_GC_VARS 0x0001 // new set of live GC ref variables #define IGF_BYREF_REGS 0x0002 // new set of live by-ref registers #if defined(FEATURE_EH_FUNCLETS) && defined(TARGET_ARM) @@ -264,6 +268,8 @@ struct insGroup #define IGF_PLACEHOLDER 0x0100 // this is a placeholder group, to be filled in later #define IGF_EXTEND 0x0200 // this block is conceptually an extension of the previous block // and the emitter should continue to track GC info as if there was no new block. +#define IGF_LOOP_ALIGN 0x0400 // this group contains alignment instruction(s) at the end; the next IG is the + // head of a loop that needs alignment. // Mask of IGF_* flags that should be propagated to new blocks when they are created. 
// This allows prologs and epilogs to be any number of IGs, but still be @@ -336,6 +342,11 @@ struct insGroup return *(unsigned*)ptr; } + bool isLoopAlign() + { + return (igFlags & IGF_LOOP_ALIGN) != 0; + } + }; // end of struct insGroup // For AMD64 the maximum prolog/epilog size supported on the OS is 256 bytes @@ -561,6 +572,7 @@ class emitter #if defined(TARGET_XARCH) static_assert_no_msg(INS_count <= 1024); instruction _idIns : 10; +#define MAX_ENCODED_SIZE 15 #elif defined(TARGET_ARM64) static_assert_no_msg(INS_count <= 512); instruction _idIns : 9; @@ -1361,6 +1373,14 @@ class emitter // hot to cold and cold to hot jumps) }; +#if FEATURE_LOOP_ALIGN + struct instrDescAlign : instrDesc + { + instrDescAlign* idaNext; // next align in the group/method + insGroup* idaIG; // containing group + }; +#endif + #if !defined(TARGET_ARM64) // This shouldn't be needed for ARM32, either, but I don't want to touch the ARM32 JIT. struct instrDescLbl : instrDescJmp { @@ -1738,6 +1758,21 @@ class emitter instrDescJmp* emitJumpLast; // last of local jumps in method void emitJumpDistBind(); // Bind all the local jumps in method +#if FEATURE_LOOP_ALIGN + instrDescAlign* emitCurIGAlignList; // list of align instructions in current IG + unsigned emitLastInnerLoopStartIgNum; // Start IG of last inner loop + unsigned emitLastInnerLoopEndIgNum; // End IG of last inner loop + unsigned emitLastAlignedIgNum; // last IG that has align instruction + instrDescAlign* emitAlignList; // list of local align instructions in method + instrDescAlign* emitAlignLast; // last align instruction in method + unsigned getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize); // Get the smallest loop size + void emitLoopAlignment(); + bool emitEndsWithAlignInstr(); // Validate if newLabel is appropriate + void emitSetLoopBackEdge(BasicBlock* loopTopBlock); + void emitLoopAlignAdjustments(); // Predict if loop alignment is needed and make appropriate adjustments + unsigned emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offset DEBUG_ARG(bool displayAlignmentDetails)); +#endif + void emitCheckFuncletBranch(instrDesc* jmp, insGroup* jmpIG); // Check for illegal branches between funclets bool emitFwdJumps; // forward jumps present? 
@@ -1903,7 +1938,7 @@ class emitter
     instrDescJmp* emitAllocInstrJmp()
     {
 #if EMITTER_STATS
         emitTotalIDescJmpCnt++;
 #endif // EMITTER_STATS
         return (instrDescJmp*)emitAllocAnyInstr(sizeof(instrDescJmp), EA_1BYTE);
     }

@@ -1978,6 +2013,17 @@ class emitter
         return (instrDescCGCA*)emitAllocAnyInstr(sizeof(instrDescCGCA), attr);
     }

+#if FEATURE_LOOP_ALIGN
+    instrDescAlign* emitAllocInstrAlign()
+    {
+#if EMITTER_STATS
+        emitTotalDescAlignCnt++;
+#endif // EMITTER_STATS
+        return (instrDescAlign*)emitAllocAnyInstr(sizeof(instrDescAlign), EA_1BYTE);
+    }
+    instrDescAlign* emitNewInstrAlign();
+#endif
+
     instrDesc* emitNewInstrSmall(emitAttr attr);
     instrDesc* emitNewInstr(emitAttr attr = EA_4BYTE);
     instrDesc* emitNewInstrSC(emitAttr attr, cnsval_ssize_t cns);

@@ -2299,6 +2345,7 @@ class emitter
 #define SMALL_CNS_TSZ 256
     static unsigned emitSmallCns[SMALL_CNS_TSZ];
     static unsigned emitLargeCnsCnt;
+    static unsigned emitTotalDescAlignCnt;

     static unsigned emitIFcounts[IF_COUNT];

@@ -2501,6 +2548,15 @@ inline emitter::instrDescJmp* emitter::emitNewInstrJmp()
     return emitAllocInstrJmp();
 }

+#if FEATURE_LOOP_ALIGN
+inline emitter::instrDescAlign* emitter::emitNewInstrAlign()
+{
+    instrDescAlign* newInstr = emitAllocInstrAlign();
+    newInstr->idIns(INS_align);
+    return newInstr;
+}
+#endif
+
 #if !defined(TARGET_ARM64)
 inline emitter::instrDescLbl* emitter::emitNewInstrLbl()
 {

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index e91f0cf6d55c9..b6ca4dd7030a3 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -874,9 +874,16 @@ unsigned emitter::emitOutputRexOrVexPrefixIfNeeded(instruction ins, BYTE* dst, c
         // * W must be unset (0x00 validates bit 7)
         if ((vexPrefix & 0xFFFF7F80) == 0x00C46100)
         {
-            emitOutputByte(dst, 0xC5);
-            emitOutputByte(dst + 1, ((vexPrefix >> 8) & 0x80) | (vexPrefix & 0x7F));
-            return 2;
+            // This encoding optimization is not done while estimating the instruction
+            // size, and thus the estimate over-predicts the instruction size by 1 byte.
+            // If there are IGs that will be aligned, do not optimize the encoding so the
+            // estimated alignment sizes stay accurate.
+            if (emitCurIG->igNum > emitLastAlignedIgNum)
+            {
+                emitOutputByte(dst, 0xC5);
+                emitOutputByte(dst + 1, ((vexPrefix >> 8) & 0x80) | (vexPrefix & 0x7F));
+                return 2;
+            }
         }

         emitOutputByte(dst, ((vexPrefix >> 16) & 0xFF));

@@ -2651,22 +2658,62 @@ emitter::instrDesc* emitter::emitNewInstrAmdCns(emitAttr size, ssize_t dsp, int
     }
 }

-/*****************************************************************************
- *
- *  The next instruction will be a loop head entry point
- *  So insert a dummy instruction here to ensure that
- *  the x86 I-cache alignment rule is followed.
- */
-
-void emitter::emitLoopAlign()
+//-----------------------------------------------------------------------------
+//
+// The next instruction will be a loop head entry point.
+// So insert an alignment instruction here to ensure that
+// we can properly align the code.
+// +void emitter::emitLoopAlign(unsigned short paddingBytes) { /* Insert a pseudo-instruction to ensure that we align the next instruction properly */ - instrDesc* id = emitNewInstrSmall(EA_1BYTE); - id->idIns(INS_align); - id->idCodeSize(15); // We may need to skip up to 15 bytes of code - emitCurIGsize += 15; + assert(paddingBytes <= MAX_ENCODED_SIZE); + paddingBytes = min(paddingBytes, MAX_ENCODED_SIZE); // We may need to skip up to 15 bytes of code + instrDescAlign* id = emitNewInstrAlign(); + id->idCodeSize(paddingBytes); + emitCurIGsize += paddingBytes; + + id->idaIG = emitCurIG; + + /* Append this instruction to this IG's alignment list */ + id->idaNext = emitCurIGAlignList; + emitCurIGAlignList = id; +} + +//----------------------------------------------------------------------------- +// +// The next instruction will be a loop head entry point +// So insert alignment instruction(s) here to ensure that +// we can properly align the code. +// +// This emits more than one `INS_align` instruction depending on the +// alignmentBoundary parameter. +// +void emitter::emitLongLoopAlign(unsigned short alignmentBoundary) +{ + unsigned short nPaddingBytes = alignmentBoundary - 1; + unsigned short nAlignInstr = (nPaddingBytes + (MAX_ENCODED_SIZE - 1)) / MAX_ENCODED_SIZE; + unsigned short instrDescSize = nAlignInstr * sizeof(instrDescAlign); + unsigned short insAlignCount = nPaddingBytes / MAX_ENCODED_SIZE; + unsigned short lastInsAlignSize = nPaddingBytes % MAX_ENCODED_SIZE; + + // Ensure that all align instructions fall in same IG. + if (emitCurIGfreeNext + instrDescSize >= emitCurIGfreeEndp) + { + emitForceNewIG = true; + } + + /* Insert a pseudo-instruction to ensure that we align + the next instruction properly */ + + while (insAlignCount) + { + emitLoopAlign(); + insAlignCount--; + } + emitLoopAlign(lastInsAlignSize); } /***************************************************************************** @@ -2676,7 +2723,7 @@ void emitter::emitLoopAlign() void emitter::emitIns_Nop(unsigned size) { - assert(size <= 15); + assert(size <= MAX_ENCODED_SIZE); instrDesc* id = emitNewInstr(); id->idIns(INS_nop); @@ -7341,6 +7388,12 @@ size_t emitter::emitSizeOfInsDsc(instrDesc* id) switch (idOp) { case ID_OP_NONE: +#if FEATURE_LOOP_ALIGN + if (id->idIns() == INS_align) + { + return sizeof(instrDescAlign); + } +#endif break; case ID_OP_LBL: @@ -9325,6 +9378,49 @@ static BYTE* emitOutputNOP(BYTE* dst, size_t nBytes) return dst; } +//-------------------------------------------------------------------- +// emitOutputAlign: Outputs NOP to align the loop +// +// Arguments: +// ig - Current instruction group +// id - align instruction that holds amount of padding (NOPs) to add +// dst - Destination buffer +// +// Return Value: +// None. +// +// Notes: +// Amount of padding needed to align the loop is already calculated. This +// method extracts that information and inserts suitable NOP instructions. +// +BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst) +{ + // Candidate for loop alignment + assert(codeGen->ShouldAlignLoops()); + assert(ig->isLoopAlign()); + + unsigned paddingToAdd = id->idCodeSize(); + + // Either things are already aligned or align them here. 
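[Note on splitting long padding] Since the largest instruction x86 can encode is 15 bytes, emitLongLoopAlign above splits a large padding request into several align pseudo-instructions, and at output time each one is materialized as multi-byte NOPs. The following self-contained sketch splits a request the same way and shows one possible encoding per chunk using the Intel SDM recommended NOP forms; it is illustrative only, and the JIT's emitOutputNOP may pick different sequences:

    #include <cstdio>

    // Intel SDM recommended multi-byte NOPs, indexed by length 1..9.
    static const unsigned char nops[10][9] = {
        {},
        {0x90},
        {0x66, 0x90},
        {0x0F, 0x1F, 0x00},
        {0x0F, 0x1F, 0x40, 0x00},
        {0x0F, 0x1F, 0x44, 0x00, 0x00},
        {0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00},
        {0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00},
        {0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
        {0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
    };

    int main()
    {
        const unsigned MAX_ENCODED_SIZE  = 15; // largest x86 instruction encoding
        const unsigned alignmentBoundary = 32; // hypothetical boundary
        unsigned       padding = alignmentBoundary - 1; // worst case requested by emitLongLoopAlign

        // Split into align pseudo-instructions of at most 15 bytes each...
        while (padding > 0)
        {
            unsigned insSize = padding > MAX_ENCODED_SIZE ? MAX_ENCODED_SIZE : padding;
            printf("align of %2u bytes:", insSize);

            // ...and cover each pseudo-instruction with <= 9-byte NOPs at output time.
            for (unsigned left = insSize; left > 0;)
            {
                unsigned chunk = left > 9 ? 9 : left;
                for (unsigned i = 0; i < chunk; i++)
                    printf(" %02X", nops[chunk][i]);
                left -= chunk;
            }
            printf("\n");
            padding -= insSize;
        }
        return 0;
    }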
+ assert((paddingToAdd == 0) || (((size_t)dst & (emitComp->opts.compJitAlignLoopBoundary - 1)) != 0)); + + // Padding amount should not exceed the alignment boundary + assert(0 <= paddingToAdd && paddingToAdd < emitComp->opts.compJitAlignLoopBoundary); + +#ifdef DEBUG + bool displayAlignmentDetails = (emitComp->opts.disAsm /*&& emitComp->opts.disAddr*/) || emitComp->verbose; + unsigned paddingNeeded = emitCalculatePaddingForLoopAlignment(ig, (size_t)dst, displayAlignmentDetails); + + // For non-adaptive, padding size is spread in multiple instructions, so don't bother checking + if (emitComp->opts.compJitAlignLoopAdaptive) + { + assert(paddingToAdd == paddingNeeded); + } +#endif + + return emitOutputNOP(dst, paddingToAdd); +} + /***************************************************************************** * * Output an instruction involving an address mode. @@ -12398,7 +12494,8 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) #ifdef DEBUG if (emitComp->verbose) { - printf("; NOTE: size of jump [%08X] mis-predicted\n", emitComp->dspPtr(id)); + printf("; NOTE: size of jump [%08X] mis-predicted by %d bytes\n", emitComp->dspPtr(id), + (id->idCodeSize() - JMP_SIZE_SMALL)); } #endif } @@ -12559,10 +12656,11 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) { assert(emitIssuing); - BYTE* dst = *dp; - size_t sz = sizeof(instrDesc); - instruction ins = id->idIns(); - unsigned char callInstrSize = 0; + BYTE* dst = *dp; + size_t sz = sizeof(instrDesc); + instruction ins = id->idIns(); + unsigned char callInstrSize = 0; + int emitOffsAdjBefore = emitOffsAdj; #ifdef DEBUG bool dspOffs = emitComp->opts.dspGCtbls; @@ -12598,9 +12696,21 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) // the loop alignment pseudo instruction if (ins == INS_align) { - sz = SMALL_IDSC_SIZE; - dst = emitOutputNOP(dst, (-(int)(size_t)dst) & 0x0f); - assert(((size_t)dst & 0x0f) == 0); + sz = sizeof(instrDescAlign); + // IG can be marked as not needing alignment after emitting align instruction + // In such case, skip outputting alignment. + if (ig->isLoopAlign()) + { + dst = emitOutputAlign(ig, id, dst); + } +#ifdef DEBUG + else + { + // If the IG is not marked as need alignment, then the code size + // should be zero i.e. no padding needed. + assert(id->idCodeSize() == 0); + } +#endif break; } @@ -13704,7 +13814,49 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) { emitDispIns(id, false, dspOffs, true, emitCurCodeOffs(*dp), *dp, (dst - *dp)); } +#endif +#if FEATURE_LOOP_ALIGN + // Only compensate over-estimated instructions if emitCurIG is before + // the last IG that needs alignment. 
+ if (emitCurIG->igNum <= emitLastAlignedIgNum) + { + int diff = id->idCodeSize() - ((UNATIVE_OFFSET)(dst - *dp)); + assert(diff >= 0); + if (diff != 0) + { + +#ifdef DEBUG + // should never over-estimate align instruction + assert(id->idIns() != INS_align); + JITDUMP("Added over-estimation compensation: %d\n", diff); + + if (emitComp->opts.disAsm) + { + emitDispInsAddr(dst); + printf("\t\t ;; NOP compensation instructions of %d bytes.\n", diff); + } +#endif + + dst = emitOutputNOP(dst, diff); + + // since we compensated the over-estimation, revert the offsAdj that + // might have happened in the jump + if (emitOffsAdjBefore != emitOffsAdj) + { +#ifdef DEBUG + insFormat format = id->idInsFmt(); + assert((format == IF_LABEL) || (format == IF_RWR_LABEL) || (format == IF_SWR_LABEL)); + assert(diff == (emitOffsAdj - emitOffsAdjBefore)); +#endif + emitOffsAdj -= diff; + } + } + assert((id->idCodeSize() - ((UNATIVE_OFFSET)(dst - *dp))) == 0); + } +#endif + +#ifdef DEBUG if (emitComp->compDebugBreak) { // set JitEmitPrintRefRegs=1 will print out emitThisGCrefRegs and emitThisByrefRegs diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index fb2aac2d30f0d..b0a8327acedb6 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -50,6 +50,7 @@ UNATIVE_OFFSET emitInsSizeAM(instrDesc* id, code_t code, int val); UNATIVE_OFFSET emitInsSizeCV(instrDesc* id, code_t code); UNATIVE_OFFSET emitInsSizeCV(instrDesc* id, code_t code, int val); +BYTE* emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst); BYTE* emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc = nullptr); BYTE* emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc = nullptr); BYTE* emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc = nullptr); @@ -287,7 +288,9 @@ inline emitAttr emitDecodeScale(unsigned ensz) /************************************************************************/ public: -void emitLoopAlign(); +void emitLoopAlign(unsigned short paddingBytes = 15); + +void emitLongLoopAlign(unsigned short alignmentBoundary); void emitIns(instruction ins); diff --git a/src/coreclr/jit/flowgraph.cpp b/src/coreclr/jit/flowgraph.cpp index 8eef9ab442fbb..efb3305a6206a 100644 --- a/src/coreclr/jit/flowgraph.cpp +++ b/src/coreclr/jit/flowgraph.cpp @@ -9642,9 +9642,9 @@ BasicBlock* Compiler::fgSplitBlockAtEnd(BasicBlock* curr) newBlock->bbFlags = curr->bbFlags; // Remove flags that the new block can't have. - newBlock->bbFlags &= - ~(BBF_TRY_BEG | BBF_LOOP_HEAD | BBF_LOOP_CALL0 | BBF_LOOP_CALL1 | BBF_HAS_LABEL | BBF_JMP_TARGET | - BBF_FUNCLET_BEG | BBF_LOOP_PREHEADER | BBF_KEEP_BBJ_ALWAYS | BBF_PATCHPOINT | BBF_BACKWARD_JUMP_TARGET); + newBlock->bbFlags &= ~(BBF_TRY_BEG | BBF_LOOP_HEAD | BBF_LOOP_CALL0 | BBF_LOOP_CALL1 | BBF_HAS_LABEL | + BBF_JMP_TARGET | BBF_FUNCLET_BEG | BBF_LOOP_PREHEADER | BBF_KEEP_BBJ_ALWAYS | + BBF_PATCHPOINT | BBF_BACKWARD_JUMP_TARGET | BBF_LOOP_ALIGN); // Remove the GC safe bit on the new block. 
It seems clear that if we split 'curr' at the end, // such that all the code is left in 'curr', and 'newBlock' just gets the control flow, then @@ -10946,6 +10946,18 @@ void Compiler::fgCompactBlocks(BasicBlock* block, BasicBlock* bNext) break; } + // Add the LOOP_ALIGN flag + if (bNext->isLoopAlign()) + { + // Only if the merged block is a jump target or has a label + if (((block->bbFlags & BBF_JMP_TARGET) != 0) || ((block->bbFlags & BBF_HAS_LABEL) != 0)) + { + block->bbFlags |= BBF_LOOP_ALIGN; + JITDUMP("Propagating LOOP_ALIGN flag from " FMT_BB " to " FMT_BB " during compacting.\n", bNext->bbNum, + block->bbNum); + } + } + // If we're collapsing a block created after the dominators are // computed, copy block number the block and reuse dominator // information from bNext to block. @@ -11536,6 +11548,14 @@ void Compiler::fgRemoveBlock(BasicBlock* block, bool unreachable) if (block->isLoopHead() && (succBlock->bbNum <= block->bbNum)) { succBlock->bbFlags |= BBF_LOOP_HEAD; + + if (block->isLoopAlign()) + { + succBlock->bbFlags |= BBF_LOOP_ALIGN; + JITDUMP("Propagating LOOP_ALIGN flag from " FMT_BB " to " FMT_BB " for loop# %d.", block->bbNum, + succBlock->bbNum, block->bbNatLoopNum); + } + if (fgDomsComputed && fgReachable(succBlock, block)) { /* Mark all the reachable blocks between 'succBlock' and 'block', excluding 'block' */ diff --git a/src/coreclr/jit/jit.h b/src/coreclr/jit/jit.h index 9fb780dbd40c6..62e7ac8059b16 100644 --- a/src/coreclr/jit/jit.h +++ b/src/coreclr/jit/jit.h @@ -747,6 +747,10 @@ class Histogram #define CLFLG_STRUCTPROMOTE 0x00000 #endif +#ifdef TARGET_XARCH +#define FEATURE_LOOP_ALIGN 1 +#endif + #define CLFLG_MAXOPT \ (CLFLG_CSE | CLFLG_REGVAR | CLFLG_RNGCHKOPT | CLFLG_DEADASGN | CLFLG_CODEMOTION | CLFLG_QMARK | CLFLG_TREETRANS | \ CLFLG_INLINING | CLFLG_STRUCTPROMOTE | CLFLG_CONSTANTFOLD) diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 865ae3033f09a..5ffab7c0f29e9 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -41,6 +41,27 @@ CONFIG_INTEGER(JitDebugLogLoopCloning, W("JitDebugLogLoopCloning"), 0) // In deb // optimizations are performed on the fast path. CONFIG_INTEGER(JitDefaultFill, W("JitDefaultFill"), 0xdd) // In debug builds, initialize the memory allocated by the nra // with this byte. +CONFIG_INTEGER(JitAlignLoopMinBlockWeight, + W("JitAlignLoopMinBlockWeight"), + DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT) // Minimum weight needed for the first block of a loop to make it a + // candidate for alignment. +CONFIG_INTEGER(JitAlignLoopMaxCodeSize, + W("JitAlignLoopMaxCodeSize"), + DEFAULT_MAX_LOOPSIZE_FOR_ALIGN) // For non-adaptive alignment, maximum loop size (in bytes) for which + // alignment will be done. + // Defaults to three 32-byte chunks = 96 bytes. +CONFIG_INTEGER(JitAlignLoopBoundary, + W("JitAlignLoopBoundary"), + DEFAULT_ALIGN_LOOP_BOUNDARY) // For non-adaptive alignment, address boundary (power of 2) at which loop + // alignment should be done. By default, 32B. +CONFIG_INTEGER(JitAlignLoopForJcc, + W("JitAlignLoopForJcc"), + 0) // If set, for non-adaptive alignment, ensure loop jumps (jcc) are not on or crossing an alignment boundary. + +CONFIG_INTEGER(JitAlignLoopAdaptive, + W("JitAlignLoopAdaptive"), + 1) // If set, perform adaptive loop alignment that limits the amount of padding based on the loop size.
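+ // As an illustration of the padding arithmetic (not a new knob): with the default + // 32-byte boundary, code at offset 0x1A needs (32 - (0x1A & 31)) & 31 = 6 bytes of + // NOP padding to reach the next boundary, and 0 bytes if the offset is already a + // multiple of 32.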
+ CONFIG_INTEGER(JitDirectAlloc, W("JitDirectAlloc"), 0) CONFIG_INTEGER(JitDoubleAlign, W("JitDoubleAlign"), 1) CONFIG_INTEGER(JitDumpASCII, W("JitDumpASCII"), 1) // Uses only ASCII characters in tree dumps @@ -202,6 +223,12 @@ CONFIG_INTEGER(EnableIncompleteISAClass, W("EnableIncompleteISAClass"), 0) // En // intrinsic classes #endif // defined(DEBUG) +#if FEATURE_LOOP_ALIGN +CONFIG_INTEGER(JitAlignLoops, W("JitAlignLoops"), 1) // If set, align inner loops +#else +CONFIG_INTEGER(JitAlignLoops, W("JitAlignLoops"), 0) +#endif + /// /// JIT /// diff --git a/src/coreclr/jit/jitee.h b/src/coreclr/jit/jitee.h index 298536138b2e1..6301166e489c0 100644 --- a/src/coreclr/jit/jitee.h +++ b/src/coreclr/jit/jitee.h @@ -63,45 +63,45 @@ class JitFlags JIT_FLAG_BBINSTR = 29, // Collect basic block profile information JIT_FLAG_BBOPT = 30, // Optimize method based on profile information JIT_FLAG_FRAMED = 31, // All methods have an EBP frame - JIT_FLAG_ALIGN_LOOPS = 32, // add NOPs before loops to align them at 16 byte boundaries + JIT_FLAG_UNUSED12 = 32, JIT_FLAG_PUBLISH_SECRET_PARAM = 33, // JIT must place stub secret param into local 0. (used by IL stubs) - JIT_FLAG_UNUSED12 = 34, + JIT_FLAG_UNUSED13 = 34, JIT_FLAG_SAMPLING_JIT_BACKGROUND = 35, // JIT is being invoked as a result of stack sampling for hot methods in the background JIT_FLAG_USE_PINVOKE_HELPERS = 36, // The JIT should use the PINVOKE_{BEGIN,END} helpers instead of emitting inline transitions JIT_FLAG_REVERSE_PINVOKE = 37, // The JIT should insert REVERSE_PINVOKE_{ENTER,EXIT} helpers into method prolog/epilog - JIT_FLAG_UNUSED13 = 38, + JIT_FLAG_UNUSED14 = 38, JIT_FLAG_TIER0 = 39, // This is the initial tier for tiered compilation which should generate code as quickly as possible JIT_FLAG_TIER1 = 40, // This is the final tier (for now) for tiered compilation which should generate high quality code #if defined(TARGET_ARM) JIT_FLAG_RELATIVE_CODE_RELOCS = 41, // JIT should generate PC-relative address computations instead of EE relocation records #else // !defined(TARGET_ARM) - JIT_FLAG_UNUSED14 = 41, + JIT_FLAG_UNUSED15 = 41, #endif // !defined(TARGET_ARM) JIT_FLAG_NO_INLINING = 42, // JIT should not inline any called method into this method - JIT_FLAG_UNUSED15 = 43, - JIT_FLAG_UNUSED16 = 44, - JIT_FLAG_UNUSED17 = 45, - JIT_FLAG_UNUSED18 = 46, - JIT_FLAG_UNUSED19 = 47, - JIT_FLAG_UNUSED20 = 48, - JIT_FLAG_UNUSED21 = 49, - JIT_FLAG_UNUSED22 = 50, - JIT_FLAG_UNUSED23 = 51, - JIT_FLAG_UNUSED24 = 52, - JIT_FLAG_UNUSED25 = 53, - JIT_FLAG_UNUSED26 = 54, - JIT_FLAG_UNUSED27 = 55, - JIT_FLAG_UNUSED28 = 56, - JIT_FLAG_UNUSED29 = 57, - JIT_FLAG_UNUSED30 = 58, - JIT_FLAG_UNUSED31 = 59, - JIT_FLAG_UNUSED32 = 60, - JIT_FLAG_UNUSED33 = 61, - JIT_FLAG_UNUSED34 = 62, - JIT_FLAG_UNUSED35 = 63 + JIT_FLAG_UNUSED16 = 43, + JIT_FLAG_UNUSED17 = 44, + JIT_FLAG_UNUSED18 = 45, + JIT_FLAG_UNUSED19 = 46, + JIT_FLAG_UNUSED20 = 47, + JIT_FLAG_UNUSED21 = 48, + JIT_FLAG_UNUSED22 = 49, + JIT_FLAG_UNUSED23 = 50, + JIT_FLAG_UNUSED24 = 51, + JIT_FLAG_UNUSED25 = 52, + JIT_FLAG_UNUSED26 = 53, + JIT_FLAG_UNUSED27 = 54, + JIT_FLAG_UNUSED28 = 55, + JIT_FLAG_UNUSED29 = 56, + JIT_FLAG_UNUSED30 = 57, + JIT_FLAG_UNUSED31 = 58, + JIT_FLAG_UNUSED32 = 59, + JIT_FLAG_UNUSED33 = 60, + JIT_FLAG_UNUSED34 = 61, + JIT_FLAG_UNUSED35 = 62, + JIT_FLAG_UNUSED36 = 63 }; // clang-format on @@ -201,7 +201,6 @@ class JitFlags FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_BBINSTR, JIT_FLAG_BBINSTR); FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_BBOPT, JIT_FLAG_BBOPT); FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_FRAMED, 
JIT_FLAG_FRAMED); - FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_ALIGN_LOOPS, JIT_FLAG_ALIGN_LOOPS); FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_PUBLISH_SECRET_PARAM, JIT_FLAG_PUBLISH_SECRET_PARAM); FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_SAMPLING_JIT_BACKGROUND, JIT_FLAG_SAMPLING_JIT_BACKGROUND); FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_USE_PINVOKE_HELPERS, JIT_FLAG_USE_PINVOKE_HELPERS); diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 3d96acbe79851..debbe3b85f01e 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -16379,6 +16379,12 @@ bool Compiler::fgFoldConditional(BasicBlock* block) * Remove the loop from the table */ optLoopTable[loopNum].lpFlags |= LPFLG_REMOVED; +#if FEATURE_LOOP_ALIGN + optLoopTable[loopNum].lpFirst->bbFlags &= ~BBF_LOOP_ALIGN; + JITDUMP("Removing LOOP_ALIGN flag from bogus loop in " FMT_BB "\n", + optLoopTable[loopNum].lpFirst->bbNum); +#endif + #ifdef DEBUG if (verbose) { diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index e134915cfe9d3..ddadd938fcfc6 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -2578,6 +2578,41 @@ void Compiler::optFindNaturalLoops() #endif // DEBUG } +//----------------------------------------------------------------------------- +// +// All inner loops whose block weight meets a threshold are marked +// as needing alignment. +// + +void Compiler::optIdentifyLoopsForAlignment() +{ +#if FEATURE_LOOP_ALIGN + if (codeGen->ShouldAlignLoops()) + { + for (unsigned char loopInd = 0; loopInd < optLoopCount; loopInd++) + { + BasicBlock* first = optLoopTable[loopInd].lpFirst; + + // An innermost-loop candidate that might need alignment + if (optLoopTable[loopInd].lpChild == BasicBlock::NOT_IN_LOOP) + { + if (first->getBBWeight(this) >= (opts.compJitAlignLoopMinBlockWeight * BB_UNITY_WEIGHT)) + { + first->bbFlags |= BBF_LOOP_ALIGN; + JITDUMP("L%02u that starts at " FMT_BB " needs alignment, weight=%f.\n", loopInd, first->bbNum, + first->getBBWeight(this)); + } + else + { + JITDUMP("Skip alignment for L%02u that starts at " FMT_BB " weight=%f.\n", loopInd, first->bbNum, + first->getBBWeight(this)); + } + } + } + } +#endif +} + void Compiler::optRedirectBlock(BasicBlock* blk, BlockToBlockMap* redirectMap) { BasicBlock* newJumpDest = nullptr; @@ -3757,6 +3792,22 @@ void Compiler::optUnrollLoops() #endif } +#if FEATURE_LOOP_ALIGN + for (block = head->bbNext;; block = block->bbNext) + { + if (block->isLoopAlign()) + { + block->bbFlags &= ~BBF_LOOP_ALIGN; + JITDUMP("Removing LOOP_ALIGN flag from unrolled loop in " FMT_BB "\n", block->bbNum); + } + + if (block == bottom) + { + break; + } + } +#endif + /* Create the unrolled loop statement list */ { BlockToBlockMap blockMap(getAllocator()); @@ -4506,6 +4557,10 @@ void Compiler::optOptimizeLoops() } } + // Check if any of the loops need alignment + + optIdentifyLoopsForAlignment(); + #if COUNT_LOOPS totalUnnatLoopCount += loopNum; #endif @@ -5146,9 +5201,10 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context) { assert(loopInd < optLoopCount); - JITDUMP("\nCloning loop %d: [h: %d, f: %d, t: %d, e: %d, b: %d].\n", loopInd, optLoopTable[loopInd].lpHead->bbNum, - optLoopTable[loopInd].lpFirst->bbNum, optLoopTable[loopInd].lpTop->bbNum, - optLoopTable[loopInd].lpEntry->bbNum, optLoopTable[loopInd].lpBottom->bbNum); + JITDUMP("\nCloning loop %d: [h: %d, f: %d, t: %d, e: %d, b: %d, c: %d].\n", loopInd, + optLoopTable[loopInd].lpHead->bbNum, optLoopTable[loopInd].lpFirst->bbNum, +
optLoopTable[loopInd].lpTop->bbNum, optLoopTable[loopInd].lpEntry->bbNum, + optLoopTable[loopInd].lpBottom->bbNum, optLoopTable[loopInd].lpChild); // Determine the depth of the loop, so we can properly weight blocks added (outside the cloned loop blocks). unsigned depth = optLoopDepth(loopInd); @@ -7975,6 +8031,20 @@ bool Compiler::optComputeLoopSideEffectsOfBlock(BasicBlock* blk) // Marks the containsCall information to "lnum" and any parent loops. void Compiler::AddContainsCallAllContainingLoops(unsigned lnum) { + +#if FEATURE_LOOP_ALIGN + // If this is the innermost loop, reset the LOOP_ALIGN flag + // because a loop containing a call is not likely to benefit from + // alignment + if (optLoopTable[lnum].lpChild == BasicBlock::NOT_IN_LOOP) + { + BasicBlock* first = optLoopTable[lnum].lpFirst; + first->bbFlags &= ~BBF_LOOP_ALIGN; + JITDUMP("Removing LOOP_ALIGN flag for L%02u that starts at " FMT_BB " because loop has a call.\n", lnum, + first->bbNum); + } +#endif + assert(0 <= lnum && lnum < optLoopCount); while (lnum != BasicBlock::NOT_IN_LOOP) { diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs b/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs index 79768f5fbdb9e..1aadd4e266454 100644 --- a/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs +++ b/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs @@ -1307,13 +1307,13 @@ public enum CorJitFlag : uint CORJIT_FLAG_BBINSTR = 29, // Collect basic block profile information CORJIT_FLAG_BBOPT = 30, // Optimize method based on profile information CORJIT_FLAG_FRAMED = 31, // All methods have an EBP frame - CORJIT_FLAG_ALIGN_LOOPS = 32, // add NOPs before loops to align them at 16 byte boundaries + CORJIT_FLAG_UNUSED8 = 32, CORJIT_FLAG_PUBLISH_SECRET_PARAM = 33, // JIT must place stub secret param into local 0.
(used by IL stubs) - CORJIT_FLAG_UNUSED8 = 34, + CORJIT_FLAG_UNUSED9 = 34, CORJIT_FLAG_SAMPLING_JIT_BACKGROUND = 35, // JIT is being invoked as a result of stack sampling for hot methods in the background CORJIT_FLAG_USE_PINVOKE_HELPERS = 36, // The JIT should use the PINVOKE_{BEGIN,END} helpers instead of emitting inline transitions CORJIT_FLAG_REVERSE_PINVOKE = 37, // The JIT should insert REVERSE_PINVOKE_{ENTER,EXIT} helpers into method prolog/epilog - CORJIT_FLAG_UNUSED9 = 38, + CORJIT_FLAG_UNUSED10 = 38, CORJIT_FLAG_TIER0 = 39, // This is the initial tier for tiered compilation which should generate code as quickly as possible CORJIT_FLAG_TIER1 = 40, // This is the final tier (for now) for tiered compilation which should generate high quality code CORJIT_FLAG_RELATIVE_CODE_RELOCS = 41, // JIT should generate PC-relative address computations instead of EE relocation records diff --git a/src/coreclr/vm/eeconfig.cpp b/src/coreclr/vm/eeconfig.cpp index 389e4024e8c3b..c1336060d21b7 100644 --- a/src/coreclr/vm/eeconfig.cpp +++ b/src/coreclr/vm/eeconfig.cpp @@ -118,7 +118,6 @@ HRESULT EEConfig::Init() iJitOptimizeType = OPT_DEFAULT; fJitFramed = false; - fJitAlignLoops = false; fJitMinOpts = false; fPInvokeRestoreEsp = (DWORD)-1; @@ -689,7 +688,6 @@ fTrackDynamicMethodDebugInfo = CLRConfig::GetConfigValue(CLRConfig::UNSUPPORTED_ dwJitHostMaxSlabCache = CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_JitHostMaxSlabCache); fJitFramed = (GetConfigDWORD_DontUse_(CLRConfig::UNSUPPORTED_JitFramed, fJitFramed) != 0); - fJitAlignLoops = (GetConfigDWORD_DontUse_(CLRConfig::UNSUPPORTED_JitAlignLoops, fJitAlignLoops) != 0); fJitMinOpts = (GetConfigDWORD_DontUse_(CLRConfig::UNSUPPORTED_JITMinOpts, fJitMinOpts) == 1); iJitOptimizeType = GetConfigDWORD_DontUse_(CLRConfig::EXTERNAL_JitOptimizeType, iJitOptimizeType); if (iJitOptimizeType > OPT_RANDOM) iJitOptimizeType = OPT_DEFAULT; diff --git a/src/coreclr/vm/eeconfig.h b/src/coreclr/vm/eeconfig.h index 46616fa1f5d00..a068e447117e1 100644 --- a/src/coreclr/vm/eeconfig.h +++ b/src/coreclr/vm/eeconfig.h @@ -75,7 +75,6 @@ class EEConfig bool GetTrackDynamicMethodDebugInfo(void) const {LIMITED_METHOD_CONTRACT; return fTrackDynamicMethodDebugInfo; } unsigned int GenOptimizeType(void) const {LIMITED_METHOD_CONTRACT; return iJitOptimizeType; } bool JitFramed(void) const {LIMITED_METHOD_CONTRACT; return fJitFramed; } - bool JitAlignLoops(void) const {LIMITED_METHOD_CONTRACT; return fJitAlignLoops; } bool JitMinOpts(void) const {LIMITED_METHOD_CONTRACT; return fJitMinOpts; } // Tiered Compilation config @@ -537,7 +536,6 @@ class EEConfig DWORD dwJitHostMaxSlabCache; // max size for jit host slab cache bool fTrackDynamicMethodDebugInfo; // Enable/Disable tracking dynamic method debug info bool fJitFramed; // Enable/Disable EBP based frames - bool fJitAlignLoops; // Enable/Disable loop alignment bool fJitMinOpts; // Enable MinOpts for all jitted methods unsigned iJitOptimizeType; // 0=Blended,1=SmallCode,2=FastCode, default is 0=Blended diff --git a/src/coreclr/vm/jitinterface.cpp b/src/coreclr/vm/jitinterface.cpp index 0d60059283a3d..aa60a55ceb3e2 100644 --- a/src/coreclr/vm/jitinterface.cpp +++ b/src/coreclr/vm/jitinterface.cpp @@ -12676,8 +12676,6 @@ CorJitResult CallCompileMethodWithSEHWrapper(EEJitManager *jitMgr, CORJIT_FLAGS flags; if (g_pConfig->JitFramed()) flags.Set(CORJIT_FLAGS::CORJIT_FLAG_FRAMED); - if (g_pConfig->JitAlignLoops()) - flags.Set(CORJIT_FLAGS::CORJIT_FLAG_ALIGN_LOOPS); #ifdef TARGET_X86 if 
(g_pConfig->PInvokeRestoreEsp(ftn->GetModule()->IsPreV4Assembly())) flags.Set(CORJIT_FLAGS::CORJIT_FLAG_PINVOKE_RESTORE_ESP);
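Note: the padding computation that emitOutputAlign() asserts against above can be sketched in isolation as follows. This is a minimal illustration of the alignment arithmetic under the assumption of a power-of-two boundary; the names are invented for the example and are not the JIT's.

    #include <cassert>
    #include <cstddef>
    #include <cstdio>

    // Bytes of padding needed for 'addr' to reach the next 'boundary'-byte
    // boundary (0 if already aligned). 'boundary' must be a power of two,
    // mirroring the JitAlignLoopBoundary requirement described above.
    static unsigned paddingToAlign(size_t addr, unsigned boundary)
    {
        assert((boundary & (boundary - 1)) == 0); // power of two
        return (unsigned)((boundary - (addr & (boundary - 1))) & (boundary - 1));
    }

    int main()
    {
        printf("%u\n", paddingToAlign(0x1A, 32)); // 6 bytes of NOPs
        printf("%u\n", paddingToAlign(0x40, 32)); // already aligned: 0
        return 0;
    }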