Skip to content

Commit

Permalink
Hide 'align' instruction behind jmp (#60787)
Browse files Browse the repository at this point in the history
* Hide align behind a jmp

fix the alignBytesRemoved

Some fixes and working model

Some fixes and redesign

Some more fixes

more fixes

fix

Add the check for fgFirstBB

misc changes

code cleanup + JitHideAlignBehindJmp switch

validatePadding only if align instructions are before the loop IG

More cleanup, remove commented code

jit format

* Fix a problem where curIG==0 and loop might be emitted in curIG, adjust the targetIG to prevIG

Add IGF_REMOVED_ALIGN flag for special scenarios

* Add stress mode to emit int3 for xarch

* Add stress mode to emit bkpt for arm64

* Add a loop align instruction placement phase

* review comments

* Change from unsigned short to unsigned

* review comments around cleanup

* emitForceNewIG

* Remove emitPrevIG

* Revert change to forceNewIG for align instruction

* Use loopAlignCandidates

* Use loopHeadIG reference

* jit format

* Remove unneeded method

* Misc changes

* Review feedback

* Do not include align behind Jmp in PerfScore calculation

* jit format and fix a bug

* fix the loopCandidates == 0 scenario

* Add unmarkLoopAlign(), add check for fgFirstBB

* merge conflict fix

* Add missing }

* Grammar nit

Co-authored-by: Bruce Forstall <brucefo@microsoft.com>

Co-authored-by: Bruce Forstall <brucefo@microsoft.com>
  • Loading branch information
kunalspathak and BruceForstall authored Nov 18, 2021
1 parent 7f874ee commit 581d4d2
Show file tree
Hide file tree
Showing 14 changed files with 490 additions and 175 deletions.
19 changes: 19 additions & 0 deletions src/coreclr/jit/block.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1662,3 +1662,22 @@ BBswtDesc::BBswtDesc(Compiler* comp, const BBswtDesc* other)
bbsDstTab[i] = other->bbsDstTab[i];
}
}

//------------------------------------------------------------------------
// unmarkLoopAlign: Clears the BBF_LOOP_ALIGN flag on this block and reduces
//                  the compiler's loop alignment candidate count by one.
//
// Arguments:
//    compiler - Compiler instance whose 'loopAlignCandidates' is decremented
//    reason   - Reason to print in JITDUMP (supplied through DEBUG_ARG)
//
// Notes:
//    Does nothing when the block is not currently marked with BBF_LOOP_ALIGN,
//    so repeated calls on the same block adjust the count only once.
//
void BasicBlock::unmarkLoopAlign(Compiler* compiler DEBUG_ARG(const char* reason))
{
    // Make sure we unmark and count just once.
    if (isLoopAlign())
    {
        compiler->loopAlignCandidates--;
        bbFlags &= ~BBF_LOOP_ALIGN;

        // Terminate the message with a newline so that subsequent dump output
        // starts on its own line.
        JITDUMP("Unmarking LOOP_ALIGN from " FMT_BB ". Reason= %s.\n", bbNum, reason);
    }
}
9 changes: 9 additions & 0 deletions src/coreclr/jit/block.h
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,7 @@ enum BasicBlockFlags : unsigned __int64
BBF_PATCHPOINT = MAKE_BBFLAG(36), // Block is a patchpoint
BBF_HAS_CLASS_PROFILE = MAKE_BBFLAG(37), // BB contains a call needing a class profile
BBF_PARTIAL_COMPILATION_PATCHPOINT = MAKE_BBFLAG(38), // Block is a partial compilation patchpoint
BBF_HAS_ALIGN = MAKE_BBFLAG(39), // BB ends with 'align' instruction

// The following are sets of flags.

Expand Down Expand Up @@ -653,11 +654,19 @@ struct BasicBlock : private LIR::Range
{
return ((bbFlags & BBF_LOOP_HEAD) != 0);
}

// Returns true when the BBF_LOOP_ALIGN bit is set in this block's flags.
bool isLoopAlign() const
{
    const bool markedForLoopAlign = (bbFlags & BBF_LOOP_ALIGN) != 0;
    return markedForLoopAlign;
}

void unmarkLoopAlign(Compiler* comp DEBUG_ARG(const char* reason));

// Returns true when the BBF_HAS_ALIGN bit is set in this block's flags.
bool hasAlign() const
{
    const bool endsWithAlign = (bbFlags & BBF_HAS_ALIGN) != 0;
    return endsWithAlign;
}

#ifdef DEBUG
void dspFlags(); // Print the flags
unsigned dspCheapPreds(); // Print the predecessors (bbCheapPreds)
Expand Down
31 changes: 20 additions & 11 deletions src/coreclr/jit/codegenlinear.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ void CodeGen::genCodeForBBlist()

for (block = compiler->fgFirstBB; block != nullptr; block = block->bbNext)
{

#ifdef DEBUG
if (compiler->verbose)
{
Expand Down Expand Up @@ -782,21 +783,29 @@ void CodeGen::genCodeForBBlist()
}

#if FEATURE_LOOP_ALIGN
if (block->hasAlign())
{
// If this block has 'align' instruction in the end (identified by BBF_HAS_ALIGN),
// then need to add align instruction in the current "block".
//
// For non-adaptive alignment, add alignment instruction of size depending on the
// compJitAlignLoopBoundary.
// For adaptive alignment, alignment instruction will always be of 15 bytes for xarch
// and 16 bytes for arm64.
assert(ShouldAlignLoops());

// If next block is the first block of a loop (identified by BBF_LOOP_ALIGN),
// then need to add align instruction in current "block". Also mark the
// corresponding IG with IGF_LOOP_ALIGN to know that there will be align
// instructions at the end of that IG.
//
// For non-adaptive alignment, add alignment instruction of size depending on the
// compJitAlignLoopBoundary.
// For adaptive alignment, alignment instruction will always be of 15 bytes.
GetEmitter()->emitLoopAlignment(DEBUG_ARG1(block->bbJumpKind == BBJ_ALWAYS));
}

if ((block->bbNext != nullptr) && (block->bbNext->isLoopAlign()))
{
assert(ShouldAlignLoops());

GetEmitter()->emitLoopAlignment();
if (compiler->opts.compJitHideAlignBehindJmp)
{
// The current IG is the one that is just before the IG having loop start.
// Establish a connection of recent align instruction emitted to the loop
// it actually is aligning using 'idaLoopHeadPredIG'.
GetEmitter()->emitConnectAlignInstrWithCurIG();
}
}
#endif

Expand Down
83 changes: 83 additions & 0 deletions src/coreclr/jit/compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2548,11 +2548,13 @@ void Compiler::compInitOptions(JitFlags* jitFlags)

opts.compJitAlignLoopForJcc = JitConfig.JitAlignLoopForJcc() == 1;
opts.compJitAlignLoopMaxCodeSize = (unsigned short)JitConfig.JitAlignLoopMaxCodeSize();
opts.compJitHideAlignBehindJmp = JitConfig.JitHideAlignBehindJmp() == 1;
#else
opts.compJitAlignLoopAdaptive = true;
opts.compJitAlignLoopBoundary = DEFAULT_ALIGN_LOOP_BOUNDARY;
opts.compJitAlignLoopMinBlockWeight = DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT;
opts.compJitAlignLoopMaxCodeSize = DEFAULT_MAX_LOOPSIZE_FOR_ALIGN;
opts.compJitHideAlignBehindJmp = true;
#endif

#ifdef TARGET_XARCH
Expand Down Expand Up @@ -5153,6 +5155,11 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl
fgDebugCheckLinks();
#endif

#if FEATURE_LOOP_ALIGN
// Place loop alignment instructions
DoPhase(this, PHASE_ALIGN_LOOPS, &Compiler::placeLoopAlignInstructions);
#endif

// Generate code
codeGen->genGenerateCode(methodCodePtr, methodCodeSize);

Expand Down Expand Up @@ -5209,6 +5216,82 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl
#endif // FUNC_INFO_LOGGING
}

#if FEATURE_LOOP_ALIGN

//------------------------------------------------------------------------
// placeLoopAlignInstructions: Iterate over all the blocks and determine
// the best position to place the 'align' instruction. Inserting 'align'
// instructions after an unconditional branch is preferred over inserting
// in the block before the loop. In case there are multiple blocks
// having 'jmp', the one that has lower weight is preferred.
// If the block having 'jmp' is hotter than the block before the loop,
// the align will still be placed after 'jmp' because the processor should
// be smart enough to not fetch extra instruction beyond jmp.
//
void Compiler::placeLoopAlignInstructions()
{
if (loopAlignCandidates == 0)
{
return;
}

int loopsToProcess = loopAlignCandidates;

// Add align only if there were any loops that needed alignment
weight_t minBlockSoFar = BB_MAX_WEIGHT;
BasicBlock* bbHavingAlign = nullptr;
for (BasicBlock* const block : Blocks())
{
if ((block == fgFirstBB) && block->isLoopAlign())
{
// Adding align instruction in prolog is not supported
// hence skip the align block if it is the first block.
loopsToProcess--;
continue;
}

// If there is a unconditional jump
if (opts.compJitHideAlignBehindJmp && (block->bbJumpKind == BBJ_ALWAYS))
{
if (block->bbWeight < minBlockSoFar)
{
minBlockSoFar = block->bbWeight;
bbHavingAlign = block;
JITDUMP(FMT_BB ", bbWeight=" FMT_WT " ends with unconditional 'jmp' \n", block->bbNum, block->bbWeight);
}
}

if ((block->bbNext != nullptr) && (block->bbNext->isLoopAlign()))
{
// If jmp was not found, then block before the loop start is where align instruction will be added.
if (bbHavingAlign == nullptr)
{
bbHavingAlign = block;
JITDUMP("Marking " FMT_BB " before the loop with BBF_HAS_ALIGN for loop at " FMT_BB "\n", block->bbNum,
block->bbNext->bbNum);
}
else
{
JITDUMP("Marking " FMT_BB " that ends with unconditional jump with BBF_HAS_ALIGN for loop at " FMT_BB
"\n",
bbHavingAlign->bbNum, block->bbNext->bbNum);
}

bbHavingAlign->bbFlags |= BBF_HAS_ALIGN;
minBlockSoFar = BB_MAX_WEIGHT;
bbHavingAlign = nullptr;

if (--loopsToProcess == 0)
{
break;
}
}
}

assert(loopsToProcess == 0);
}
#endif

//------------------------------------------------------------------------
// generatePatchpointInfo: allocate and fill in patchpoint info data,
// and report it to the VM
Expand Down
14 changes: 9 additions & 5 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -3666,6 +3666,7 @@ class Compiler
#endif

BasicBlock* bbNewBasicBlock(BBjumpKinds jumpKind);
void placeLoopAlignInstructions();

/*
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Expand Down Expand Up @@ -6871,13 +6872,13 @@ class Compiler
bool fgHasLoops; // True if this method has any loops, set in fgComputeReachability

public:
LoopDsc* optLoopTable; // loop descriptor table
unsigned char optLoopCount; // number of tracked loops
LoopDsc* optLoopTable; // loop descriptor table
unsigned char optLoopCount; // number of tracked loops
unsigned char loopAlignCandidates; // number of loops identified for alignment

#ifdef DEBUG
unsigned char loopAlignCandidates; // number of loops identified for alignment
unsigned char loopsAligned; // number of loops actually aligned
#endif // DEBUG
unsigned char loopsAligned; // number of loops actually aligned
#endif // DEBUG

bool optRecordLoop(BasicBlock* head,
BasicBlock* top,
Expand Down Expand Up @@ -9688,6 +9689,9 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
// If set, perform adaptive loop alignment that limits number of padding based on loop size.
bool compJitAlignLoopAdaptive;

// If set, tries to hide alignment instructions behind unconditional jumps.
bool compJitHideAlignBehindJmp;

#ifdef LATE_DISASM
bool doLateDisasm; // Run the late disassembler
#endif // LATE_DISASM
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/compphases.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ CompPhaseNameMacro(PHASE_INSERT_GC_POLLS, "Insert GC Polls",
CompPhaseNameMacro(PHASE_DETERMINE_FIRST_COLD_BLOCK, "Determine first cold block", "COLD-BLK", false, -1, true)
CompPhaseNameMacro(PHASE_RATIONALIZE, "Rationalize IR", "RAT", false, -1, false)
CompPhaseNameMacro(PHASE_SIMPLE_LOWERING, "Do 'simple' lowering", "SMP-LWR", false, -1, false)
CompPhaseNameMacro(PHASE_ALIGN_LOOPS, "Place 'align' instructions", "LOOP-ALIGN", false, -1, false)

CompPhaseNameMacro(PHASE_LCLVARLIVENESS, "Local var liveness", "LIVENESS", true, -1, false)
CompPhaseNameMacro(PHASE_LCLVARLIVENESS_INIT, "Local var liveness init", "LIV-INIT", false, PHASE_LCLVARLIVENESS, false)
Expand Down
Loading

0 comments on commit 581d4d2

Please sign in to comment.