diff --git a/src/coreclr/jit/block.cpp b/src/coreclr/jit/block.cpp
index 4104bd7d0cab30..62add7f24dd3f9 100644
--- a/src/coreclr/jit/block.cpp
+++ b/src/coreclr/jit/block.cpp
@@ -1662,3 +1662,22 @@ BBswtDesc::BBswtDesc(Compiler* comp, const BBswtDesc* other)
         bbsDstTab[i] = other->bbsDstTab[i];
     }
 }
+
+//------------------------------------------------------------------------
+// unmarkLoopAlign: Unmarks the LOOP_ALIGN flag from the block and reduce the
+//                  loop alignment count.
+//
+// Arguments:
+//    compiler - Compiler instance
+//    reason - Reason to print in JITDUMP
+//
+void BasicBlock::unmarkLoopAlign(Compiler* compiler DEBUG_ARG(const char* reason))
+{
+    // Make sure we unmark and count just once.
+    if (isLoopAlign())
+    {
+        compiler->loopAlignCandidates--;
+        bbFlags &= ~BBF_LOOP_ALIGN;
+        JITDUMP("Unmarking LOOP_ALIGN from " FMT_BB ". Reason= %s.", bbNum, reason);
+    }
+}
diff --git a/src/coreclr/jit/block.h b/src/coreclr/jit/block.h
index ec7918d4aac953..3b8ecf7f0dacfe 100644
--- a/src/coreclr/jit/block.h
+++ b/src/coreclr/jit/block.h
@@ -552,6 +552,7 @@ enum BasicBlockFlags : unsigned __int64
     BBF_PATCHPOINT           = MAKE_BBFLAG(36), // Block is a patchpoint
     BBF_HAS_CLASS_PROFILE    = MAKE_BBFLAG(37), // BB contains a call needing a class profile
     BBF_PARTIAL_COMPILATION_PATCHPOINT  = MAKE_BBFLAG(38), // Block is a partial compilation patchpoint
+    BBF_HAS_ALIGN          = MAKE_BBFLAG(39), // BB ends with 'align' instruction
 
     // The following are sets of flags.
 
@@ -653,11 +654,19 @@ struct BasicBlock : private LIR::Range
     {
         return ((bbFlags & BBF_LOOP_HEAD) != 0);
     }
+
     bool isLoopAlign() const
     {
         return ((bbFlags & BBF_LOOP_ALIGN) != 0);
     }
 
+    void unmarkLoopAlign(Compiler* comp DEBUG_ARG(const char* reason));
+
+    bool hasAlign() const
+    {
+        return ((bbFlags & BBF_HAS_ALIGN) != 0);
+    }
+
 #ifdef DEBUG
     void     dspFlags();               // Print the flags
     unsigned dspCheapPreds();          // Print the predecessors (bbCheapPreds)
diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index 840ad2e8c6091e..4f9cfa00752d6b 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -174,6 +174,7 @@ void CodeGen::genCodeForBBlist()
 
     for (block = compiler->fgFirstBB; block != nullptr; block = block->bbNext)
     {
+
 #ifdef DEBUG
         if (compiler->verbose)
         {
@@ -782,21 +783,29 @@ void CodeGen::genCodeForBBlist()
         }
 
 #if FEATURE_LOOP_ALIGN
+        if (block->hasAlign())
+        {
+            // If this block has 'align' instruction in the end (identified by BBF_HAS_ALIGN),
+            // then need to add align instruction in the current "block".
+            //
+            // For non-adaptive alignment, add alignment instruction of size depending on the
+            // compJitAlignLoopBoundary.
+            // For adaptive alignment, alignment instruction will always be of 15 bytes for xarch
+            // and 16 bytes for arm64.
+            assert(ShouldAlignLoops());
 
-        // If next block is the first block of a loop (identified by BBF_LOOP_ALIGN),
-        // then need to add align instruction in current "block". Also mark the
-        // corresponding IG with IGF_LOOP_ALIGN to know that there will be align
-        // instructions at the end of that IG.
-        //
-        // For non-adaptive alignment, add alignment instruction of size depending on the
-        // compJitAlignLoopBoundary.
-        // For adaptive alignment, alignment instruction will always be of 15 bytes.
+            GetEmitter()->emitLoopAlignment(DEBUG_ARG1(block->bbJumpKind == BBJ_ALWAYS));
+        }
 
         if ((block->bbNext != nullptr) && (block->bbNext->isLoopAlign()))
         {
-            assert(ShouldAlignLoops());
-
-            GetEmitter()->emitLoopAlignment();
+            if (compiler->opts.compJitHideAlignBehindJmp)
+            {
+                // The current IG is the one that is just before the IG having loop start.
+                // Establish a connection of recent align instruction emitted to the loop
+                // it actually is aligning using 'idaLoopHeadPredIG'.
+                GetEmitter()->emitConnectAlignInstrWithCurIG();
+            }
         }
 #endif
 
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index 6d5745729ecaf0..01ff6587b4c152 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -2547,11 +2547,13 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
 
     opts.compJitAlignLoopForJcc      = JitConfig.JitAlignLoopForJcc() == 1;
     opts.compJitAlignLoopMaxCodeSize = (unsigned short)JitConfig.JitAlignLoopMaxCodeSize();
+    opts.compJitHideAlignBehindJmp   = JitConfig.JitHideAlignBehindJmp() == 1;
 #else
     opts.compJitAlignLoopAdaptive       = true;
     opts.compJitAlignLoopBoundary       = DEFAULT_ALIGN_LOOP_BOUNDARY;
     opts.compJitAlignLoopMinBlockWeight = DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT;
     opts.compJitAlignLoopMaxCodeSize    = DEFAULT_MAX_LOOPSIZE_FOR_ALIGN;
+    opts.compJitHideAlignBehindJmp      = true;
 #endif
 
 #ifdef TARGET_XARCH
@@ -5152,6 +5154,11 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl
     fgDebugCheckLinks();
 #endif
 
+#if FEATURE_LOOP_ALIGN
+    // Place loop alignment instructions
+    DoPhase(this, PHASE_ALIGN_LOOPS, &Compiler::placeLoopAlignInstructions);
+#endif
+
     // Generate code
     codeGen->genGenerateCode(methodCodePtr, methodCodeSize);
 
@@ -5208,6 +5215,82 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl
 #endif // FUNC_INFO_LOGGING
 }
 
+#if FEATURE_LOOP_ALIGN
+
+//------------------------------------------------------------------------
+// placeLoopAlignInstructions: Iterate over all the blocks and determine
+//      the best position to place the 'align' instruction. Inserting 'align'
+//      instructions after an unconditional branch is preferred over inserting
+//      in the block before the loop. In case there are multiple blocks
+//      having 'jmp', the one that has lower weight is preferred.
+//      If the block having 'jmp' is hotter than the block before the loop,
+//      the align will still be placed after 'jmp' because the processor should
+//      be smart enough to not fetch extra instruction beyond jmp.
+//
+void Compiler::placeLoopAlignInstructions()
+{
+    if (loopAlignCandidates == 0)
+    {
+        return;
+    }
+
+    int loopsToProcess = loopAlignCandidates;
+
+    // Add align only if there were any loops that needed alignment
+    weight_t    minBlockSoFar = BB_MAX_WEIGHT;
+    BasicBlock* bbHavingAlign = nullptr;
+    for (BasicBlock* const block : Blocks())
+    {
+        if ((block == fgFirstBB) && block->isLoopAlign())
+        {
+            // Adding align instruction in prolog is not supported
+            // hence skip the align block if it is the first block.
+            loopsToProcess--;
+            continue;
+        }
+
+        // If there is a unconditional jump
+        if (opts.compJitHideAlignBehindJmp && (block->bbJumpKind == BBJ_ALWAYS))
+        {
+            if (block->bbWeight < minBlockSoFar)
+            {
+                minBlockSoFar = block->bbWeight;
+                bbHavingAlign = block;
+                JITDUMP(FMT_BB ", bbWeight=" FMT_WT " ends with unconditional 'jmp' \n", block->bbNum, block->bbWeight);
+            }
+        }
+
+        if ((block->bbNext != nullptr) && (block->bbNext->isLoopAlign()))
+        {
+            // If jmp was not found, then block before the loop start is where align instruction will be added.
+            if (bbHavingAlign == nullptr)
+            {
+                bbHavingAlign = block;
+                JITDUMP("Marking " FMT_BB " before the loop with BBF_HAS_ALIGN for loop at " FMT_BB "\n", block->bbNum,
+                        block->bbNext->bbNum);
+            }
+            else
+            {
+                JITDUMP("Marking " FMT_BB " that ends with unconditional jump with BBF_HAS_ALIGN for loop at " FMT_BB
+                        "\n",
+                        bbHavingAlign->bbNum, block->bbNext->bbNum);
+            }
+
+            bbHavingAlign->bbFlags |= BBF_HAS_ALIGN;
+            minBlockSoFar = BB_MAX_WEIGHT;
+            bbHavingAlign = nullptr;
+
+            if (--loopsToProcess == 0)
+            {
+                break;
+            }
+        }
+    }
+
+    assert(loopsToProcess == 0);
+}
+#endif
+
 //------------------------------------------------------------------------
 // generatePatchpointInfo: allocate and fill in patchpoint info data,
 //    and report it to the VM
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 51e07b137527be..d99d3affdf5d43 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -3666,6 +3666,7 @@ class Compiler
 #endif
 
     BasicBlock* bbNewBasicBlock(BBjumpKinds jumpKind);
+    void placeLoopAlignInstructions();
 
     /*
     XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
@@ -6871,13 +6872,13 @@ class Compiler
     bool fgHasLoops;        // True if this method has any loops, set in fgComputeReachability
 
 public:
-    LoopDsc*      optLoopTable; // loop descriptor table
-    unsigned char optLoopCount; // number of tracked loops
+    LoopDsc*      optLoopTable;        // loop descriptor table
+    unsigned char optLoopCount;        // number of tracked loops
+    unsigned char loopAlignCandidates; // number of loops identified for alignment
 
 #ifdef DEBUG
-    unsigned char loopAlignCandidates; // number of loops identified for alignment
-    unsigned char loopsAligned;        // number of loops actually aligned
-#endif                                 // DEBUG
+    unsigned char loopsAligned; // number of loops actually aligned
+#endif                          // DEBUG
 
     bool optRecordLoop(BasicBlock*   head,
                        BasicBlock*   top,
@@ -9687,6 +9688,9 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
         // If set, perform adaptive loop alignment that limits number of padding based on loop size.
         bool compJitAlignLoopAdaptive;
 
+        // If set, tries to hide alignment instructions behind unconditional jumps.
+        bool compJitHideAlignBehindJmp;
+
 #ifdef LATE_DISASM
         bool doLateDisasm; // Run the late disassembler
 #endif                     // LATE_DISASM
diff --git a/src/coreclr/jit/compphases.h b/src/coreclr/jit/compphases.h
index bb992440f8e3d4..b8805fca753529 100644
--- a/src/coreclr/jit/compphases.h
+++ b/src/coreclr/jit/compphases.h
@@ -87,6 +87,7 @@ CompPhaseNameMacro(PHASE_INSERT_GC_POLLS,        "Insert GC Polls",
 CompPhaseNameMacro(PHASE_DETERMINE_FIRST_COLD_BLOCK, "Determine first cold block", "COLD-BLK", false, -1, true)
 CompPhaseNameMacro(PHASE_RATIONALIZE,            "Rationalize IR",                 "RAT",      false, -1, false)
 CompPhaseNameMacro(PHASE_SIMPLE_LOWERING,        "Do 'simple' lowering",           "SMP-LWR",  false, -1, false)
+CompPhaseNameMacro(PHASE_ALIGN_LOOPS,            "Place 'align' instructions",     "LOOP-ALIGN",  false, -1, false)
 
 CompPhaseNameMacro(PHASE_LCLVARLIVENESS,         "Local var liveness",             "LIVENESS", true, -1, false)
 CompPhaseNameMacro(PHASE_LCLVARLIVENESS_INIT,    "Local var liveness init",        "LIV-INIT", false, PHASE_LCLVARLIVENESS, false)
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 54d57dd22b2f7d..f81de138b8bcb6 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -872,7 +872,7 @@ insGroup* emitter::emitSavIG(bool emitAdd)
         // Move align instructions to the global list, update their 'next' links
         do
         {
-            // Grab the jump and remove it from the list
+            // Grab the align and remove it from the list
 
             instrDescAlign* oa = emitCurIGAlignList;
             emitCurIGAlignList = oa->idaNext;
@@ -913,6 +913,14 @@ insGroup* emitter::emitSavIG(bool emitAdd)
         }
 
         emitAlignLast = last;
+
+        // Point to the first instruction of most recent
+        // align instruction(s) added.
+        //
+        // Since emitCurIGAlignList is created in inverse of
+        // program order, the `list` reverses that in forms it
+        // in correct order.
+        emitAlignLastGroup = list;
     }
 
 #endif
@@ -1071,8 +1079,8 @@ void emitter::emitBegFN(bool hasFramePtr
 #if FEATURE_LOOP_ALIGN
     /* We don't have any align instructions */
 
-    emitAlignList = emitAlignLast = nullptr;
-    emitCurIGAlignList            = nullptr;
+    emitAlignList = emitAlignLastGroup = emitAlignLast = nullptr;
+    emitCurIGAlignList                                 = nullptr;
 #endif
 
     /* We have not recorded any live sets */
@@ -1401,7 +1409,7 @@ void* emitter::emitAllocAnyInstr(size_t sz, emitAttr opsz)
     // the prolog/epilog placeholder groups ARE generated in order, and are
     // re-used. But generating additional groups would not work.
     if (emitComp->compStressCompile(Compiler::STRESS_EMITTER, 1) && emitCurIGinsCnt && !emitIGisInProlog(emitCurIG) &&
-        !emitIGisInEpilog(emitCurIG) && !emitCurIG->isLoopAlign()
+        !emitIGisInEpilog(emitCurIG) && !emitCurIG->endsWithAlignInstr()
 #if defined(FEATURE_EH_FUNCLETS)
         && !emitIGisInFuncletProlog(emitCurIG) && !emitIGisInFuncletEpilog(emitCurIG)
 #endif // FEATURE_EH_FUNCLETS
@@ -1574,7 +1582,7 @@ void emitter::emitCheckIGoffsets()
     {
         if (tempIG->igOffs != currentOffset)
         {
-            printf("Block #%u has offset %08X, expected %08X\n", tempIG->igNum, tempIG->igOffs, currentOffset);
+            printf("IG%02u has offset %08X, expected %08X\n", tempIG->igNum, tempIG->igOffs, currentOffset);
             assert(!"bad block offset");
         }
 
@@ -3555,7 +3563,7 @@ void emitter::emitDispIGflags(unsigned flags)
     {
         printf(", extend");
     }
-    if (flags & IGF_LOOP_ALIGN)
+    if (flags & IGF_HAS_ALIGN)
     {
         printf(", align");
     }
@@ -4804,9 +4812,9 @@ void emitter::emitJumpDistBind()
 //  Arguments:
 //       nAlignInstr - Number of align instructions about to be added.
 //
-void emitter::emitCheckAlignFitInCurIG(unsigned short nAlignInstr)
+void emitter::emitCheckAlignFitInCurIG(unsigned nAlignInstr)
 {
-    unsigned short instrDescSize = nAlignInstr * sizeof(instrDescAlign);
+    unsigned instrDescSize = nAlignInstr * sizeof(instrDescAlign);
 
     // Ensure that all align instructions fall in same IG.
     if (emitCurIGfreeNext + instrDescSize >= emitCurIGfreeEndp)
@@ -4817,11 +4825,15 @@ void emitter::emitCheckAlignFitInCurIG(unsigned short nAlignInstr)
 
 //-----------------------------------------------------------------------------
 //
-//  The next instruction will be a loop head entry point
-//  So insert an alignment instruction here to ensure that
-//  we can properly align the code.
+//  emitLoopAlign: The next instruction will be a loop head entry point
+//                 So insert an alignment instruction of "paddingBytes" to ensure that
+//                 the code is properly aligned.
+//  Arguments:
+//      paddingBytes - Number of padding bytes to insert.
+//      isFirstAlign - For multiple 'align' instructions case, if this is the first
+//                     'align' instruction of that group.
 //
-void emitter::emitLoopAlign(unsigned short paddingBytes)
+void emitter::emitLoopAlign(unsigned paddingBytes, bool isFirstAlign DEBUG_ARG(bool isPlacedBehindJmp))
 {
     // Determine if 'align' instruction about to be generated will
     // fall in current IG or next.
@@ -4831,7 +4843,7 @@ void emitter::emitLoopAlign(unsigned short paddingBytes)
     {
         // If align fits in current IG, then mark that it contains alignment
         // instruction in the end.
-        emitCurIG->igFlags |= IGF_LOOP_ALIGN;
+        emitCurIG->igFlags |= IGF_HAS_ALIGN;
     }
 
     /* Insert a pseudo-instruction to ensure that we align
@@ -4842,12 +4854,12 @@ void emitter::emitLoopAlign(unsigned short paddingBytes)
     {
         // Mark this IG has alignment in the end, so during emitter we can check the instruction count
         // heuristics of all IGs that follows this IG that participate in a loop.
-        emitCurIG->igFlags |= IGF_LOOP_ALIGN;
+        emitCurIG->igFlags |= IGF_HAS_ALIGN;
     }
     else
     {
         // Otherwise, make sure it was already marked such.
-        assert(emitCurIG->isLoopAlign());
+        assert(emitCurIG->endsWithAlignInstr());
     }
 
 #if defined(TARGET_XARCH)
@@ -4859,6 +4871,22 @@ void emitter::emitLoopAlign(unsigned short paddingBytes)
 
     id->idaIG = emitCurIG;
 
+    if (isFirstAlign)
+    {
+        // For multiple align instructions, set the idaLoopHeadPredIG only for the
+        // first align instruction
+        id->idaLoopHeadPredIG = emitCurIG;
+        emitAlignLastGroup    = id;
+    }
+    else
+    {
+        id->idaLoopHeadPredIG = nullptr;
+    }
+
+#ifdef DEBUG
+    id->isPlacedAfterJmp = isPlacedBehindJmp;
+#endif
+
     /* Append this instruction to this IG's alignment list */
     id->idaNext = emitCurIGAlignList;
 
@@ -4870,25 +4898,28 @@ void emitter::emitLoopAlign(unsigned short paddingBytes)
 
 //-----------------------------------------------------------------------------
 //
-//  The next instruction will be a loop head entry point
+//  emitLongLoopAlign: The next instruction will be a loop head entry point
 //  So insert alignment instruction(s) here to ensure that
 //  we can properly align the code.
 //
 //  This emits more than one `INS_align` instruction depending on the
 //  alignmentBoundary parameter.
 //
-void emitter::emitLongLoopAlign(unsigned short alignmentBoundary)
+//  Arguments:
+//      alignmentBoundary - The boundary at which loop needs to be aligned.
+//
+void emitter::emitLongLoopAlign(unsigned alignmentBoundary DEBUG_ARG(bool isPlacedBehindJmp))
 {
 #if defined(TARGET_XARCH)
-    unsigned short nPaddingBytes    = alignmentBoundary - 1;
-    unsigned short nAlignInstr      = (nPaddingBytes + (MAX_ENCODED_SIZE - 1)) / MAX_ENCODED_SIZE;
-    unsigned short insAlignCount    = nPaddingBytes / MAX_ENCODED_SIZE;
-    unsigned short lastInsAlignSize = nPaddingBytes % MAX_ENCODED_SIZE;
-    unsigned short paddingBytes     = MAX_ENCODED_SIZE;
+    unsigned nPaddingBytes    = alignmentBoundary - 1;
+    unsigned nAlignInstr      = (nPaddingBytes + (MAX_ENCODED_SIZE - 1)) / MAX_ENCODED_SIZE;
+    unsigned insAlignCount    = nPaddingBytes / MAX_ENCODED_SIZE;
+    unsigned lastInsAlignSize = nPaddingBytes % MAX_ENCODED_SIZE;
+    unsigned paddingBytes     = MAX_ENCODED_SIZE;
 #elif defined(TARGET_ARM64)
-    unsigned short nAlignInstr   = alignmentBoundary / INSTR_ENCODED_SIZE;
-    unsigned short insAlignCount = nAlignInstr;
-    unsigned short paddingBytes  = INSTR_ENCODED_SIZE;
+    unsigned nAlignInstr   = alignmentBoundary / INSTR_ENCODED_SIZE;
+    unsigned insAlignCount = nAlignInstr;
+    unsigned paddingBytes  = INSTR_ENCODED_SIZE;
 #endif
 
     emitCheckAlignFitInCurIG(nAlignInstr);
@@ -4896,25 +4927,49 @@ void emitter::emitLongLoopAlign(unsigned short alignmentBoundary)
     /* Insert a pseudo-instruction to ensure that we align
     the next instruction properly */
 
+    bool isFirstAlign = true;
     while (insAlignCount)
     {
-        emitLoopAlign(paddingBytes);
+        emitLoopAlign(paddingBytes, isFirstAlign DEBUG_ARG(isPlacedBehindJmp));
         insAlignCount--;
+        isFirstAlign = false;
     }
 
 #if defined(TARGET_XARCH)
-    emitLoopAlign(lastInsAlignSize);
+    emitLoopAlign(lastInsAlignSize, isFirstAlign DEBUG_ARG(isPlacedBehindJmp));
 #endif
 }
 
+//-----------------------------------------------------------------------------
+// emitConnectAlignInstrWithCurIG:  If "align" instruction is not just before the loop start,
+//                                  setting idaLoopHeadPredIG lets us know the exact IG that the "align"
+//                                  instruction is trying to align. This is used to track the last IG that
+//                                  needs alignment after which VEX encoding optimization is enabled.
+//
+//                                  TODO: Once over-estimation problem is solved, consider replacing
+//                                  idaLoopHeadPredIG with idaLoopHeadIG itself.
+//
+void emitter::emitConnectAlignInstrWithCurIG()
+{
+    JITDUMP("Mapping 'align' instruction in IG%02u to target IG%02u\n", emitAlignLastGroup->idaIG->igNum,
+            emitCurIG->igNum);
+    // Since we never align overlapping instructions, it is always guaranteed that
+    // the emitAlignLastGroup points to the loop that is in process of getting aligned.
+
+    emitAlignLastGroup->idaLoopHeadPredIG = emitCurIG;
+
+    // For a new IG to ensure that loop doesn't start from IG that idaLoopHeadPredIG points to.
+    emitNxtIG();
+}
+
 //-----------------------------------------------------------------------------
 // emitLoopAlignment: Insert an align instruction at the end of emitCurIG and
-//                    mark it as IGF_LOOP_ALIGN to indicate that next IG  is a
-//                    loop needing alignment.
+//                    mark it as IGF_HAS_ALIGN to indicate that a next or a future
+//                    IG is a loop that needs alignment.
 //
-void emitter::emitLoopAlignment()
+void emitter::emitLoopAlignment(DEBUG_ARG1(bool isPlacedBehindJmp))
 {
-    unsigned short paddingBytes;
+    unsigned paddingBytes;
 
 #if defined(TARGET_XARCH)
     // For xarch, each align instruction can be maximum of MAX_ENCODED_SIZE bytes and if
@@ -4922,13 +4977,13 @@ void emitter::emitLoopAlignment()
     if ((emitComp->opts.compJitAlignLoopBoundary > 16) && (!emitComp->opts.compJitAlignLoopAdaptive))
     {
         paddingBytes = emitComp->opts.compJitAlignLoopBoundary;
-        emitLongLoopAlign(paddingBytes);
+        emitLongLoopAlign(paddingBytes DEBUG_ARG(isPlacedBehindJmp));
     }
     else
     {
         emitCheckAlignFitInCurIG(1);
         paddingBytes = MAX_ENCODED_SIZE;
-        emitLoopAlign(paddingBytes);
+        emitLoopAlign(paddingBytes, true DEBUG_ARG(isPlacedBehindJmp));
     }
 #elif defined(TARGET_ARM64)
     // For Arm64, each align instruction is 4-bytes long because of fixed-length encoding.
@@ -4941,14 +4996,12 @@ void emitter::emitLoopAlignment()
     {
         paddingBytes = emitComp->opts.compJitAlignLoopBoundary;
     }
-    emitLongLoopAlign(paddingBytes);
+    emitLongLoopAlign(paddingBytes DEBUG_ARG(isPlacedBehindJmp));
 #endif
 
-    JITDUMP("Adding 'align' instruction of %d bytes in %s.\n", paddingBytes, emitLabelString(emitCurIG));
+    assert(emitLastIns->idIns() == INS_align);
 
-#ifdef DEBUG
-    emitComp->loopAlignCandidates++;
-#endif // DEBUG
+    JITDUMP("Adding 'align' instruction of %d bytes in %s.\n", paddingBytes, emitLabelString(emitCurIG));
 }
 
 //-----------------------------------------------------------------------------
@@ -4958,7 +5011,7 @@ void emitter::emitLoopAlignment()
 //
 bool emitter::emitEndsWithAlignInstr()
 {
-    return emitCurIG->isLoopAlign();
+    return emitCurIG->endsWithAlignInstr();
 }
 
 //-----------------------------------------------------------------------------
@@ -4982,9 +5035,13 @@ unsigned emitter::getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize DEBUG
     for (insGroup* igInLoop = igLoopHeader; igInLoop != nullptr; igInLoop = igInLoop->igNext)
     {
         loopSize += igInLoop->igSize;
-        if (igInLoop->isLoopAlign())
+        if (igInLoop->endsWithAlignInstr())
         {
-            // If igInLoop is marked as "IGF_LOOP_ALIGN", the basic block flow detected a loop start.
+            // If IGF_HAS_ALIGN is present, igInLoop contains align instruction at the end,
+            // for next IG or some future IG.
+            //
+            // For both cases, remove the padding bytes from igInLoop's size so it is not included in loopSize.
+            //
             // If the loop was formed because of forward jumps like the loop IG18 below, the backedge is not
             // set for them and such loops are not aligned. For such cases, the loop size threshold will never
             // be met and we would break as soon as loopSize > maxLoopSize.
@@ -4997,9 +5054,9 @@ unsigned emitter::getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize DEBUG
             //      ...
             //      jne IG05
             //
-            // If igInLoop is a legitimate loop, and igInLoop's next IG is also a loop that needs alignment,
-            // then igInLoop should be the last IG of the current loop and should have backedge to current
-            // loop header.
+            // If igInLoop is a legitimate loop, and igInLoop's end with another 'align' instruction for different IG
+            // representing a loop that needs alignment, then igInLoop should be the last IG of the current loop and
+            // should have backedge to current loop header.
             //
             // Below, IG05 is the last IG of loop IG04-IG05 and its backedge points to IG04.
             //
@@ -5149,25 +5206,33 @@ void emitter::emitSetLoopBackEdge(BasicBlock* loopTopBlock)
             bool            markedCurrLoop = alignCurrentLoop;
             while ((alignInstr != nullptr))
             {
-                // Find the IG before current loop and clear the IGF_LOOP_ALIGN flag
-                if (!alignCurrentLoop && (alignInstr->idaIG->igNext == dstIG))
+                insGroup* loopHeadIG = alignInstr->loopHeadIG();
+
+                // Find the IG that has 'align' instruction to align the current loop
+                // and clear the IGF_HAS_ALIGN flag.
+                if (!alignCurrentLoop && (loopHeadIG == dstIG))
                 {
                     assert(!markedCurrLoop);
-                    alignInstr->idaIG->igFlags &= ~IGF_LOOP_ALIGN;
+
+                    // This IG should no longer contain alignment instruction
+                    alignInstr->removeAlignFlags();
+
                     markedCurrLoop = true;
                     JITDUMP("** Skip alignment for current loop IG%02u ~ IG%02u because it encloses an aligned loop "
                             "IG%02u ~ IG%02u.\n",
                             currLoopStart, currLoopEnd, emitLastLoopStart, emitLastLoopEnd);
                 }
 
-                // Find the IG before the last loop and clear the IGF_LOOP_ALIGN flag
-                if (!alignLastLoop && (alignInstr->idaIG->igNext != nullptr) &&
-                    (alignInstr->idaIG->igNext->igNum == emitLastLoopStart))
+                // Find the IG that has 'align' instruction to align the last loop
+                // and clear the IGF_HAS_ALIGN flag.
+                if (!alignLastLoop && (loopHeadIG != nullptr) && (loopHeadIG->igNum == emitLastLoopStart))
                 {
                     assert(!markedLastLoop);
-                    assert(alignInstr->idaIG->isLoopAlign());
+                    assert(alignInstr->idaIG->endsWithAlignInstr());
+
+                    // This IG should no longer contain alignment instruction
+                    alignInstr->removeAlignFlags();
 
-                    alignInstr->idaIG->igFlags &= ~IGF_LOOP_ALIGN;
                     markedLastLoop = true;
                     JITDUMP("** Skip alignment for aligned loop IG%02u ~ IG%02u because it encloses the current loop "
                             "IG%02u ~ IG%02u.\n",
@@ -5179,21 +5244,7 @@ void emitter::emitSetLoopBackEdge(BasicBlock* loopTopBlock)
                     break;
                 }
 
-#if defined(TARGET_XARCH)
-                if (!emitComp->opts.compJitAlignLoopAdaptive)
-#endif
-                {
-                    // If there are multiple align instructions, skip the align instructions after
-                    // the first align instruction and fast forward to the next IG
-                    insGroup* alignIG = alignInstr->idaIG;
-                    while ((alignInstr != nullptr) && (alignInstr->idaNext != nullptr) &&
-                           (alignInstr->idaNext->idaIG == alignIG))
-                    {
-                        alignInstr = alignInstr->idaNext;
-                    }
-                }
-
-                alignInstr = alignInstr->idaNext;
+                alignInstr = emitAlignInNextIG(alignInstr);
             }
 
             assert(markedLastLoop && markedCurrLoop);
@@ -5232,21 +5283,32 @@ void emitter::emitLoopAlignAdjustments()
     unsigned        loopIGOffset      = 0;
     instrDescAlign* alignInstr        = emitAlignList;
 
-    for (; alignInstr != nullptr; alignInstr = alignInstr->idaNext)
+    for (; alignInstr != nullptr;)
     {
         assert(alignInstr->idIns() == INS_align);
 
-        insGroup* alignIG = alignInstr->idaIG;
+        insGroup* loopHeadPredIG = alignInstr->idaLoopHeadPredIG;
+        insGroup* loopHeadIG     = alignInstr->loopHeadIG();
+        insGroup* containingIG   = alignInstr->idaIG;
+
+        JITDUMP("  Adjusting 'align' instruction in IG%02u that is targeted for IG%02u \n", containingIG->igNum,
+                loopHeadIG->igNum);
 
-        loopIGOffset = alignIG->igOffs + alignIG->igSize;
+        // Since we only adjust the padding up to the next align instruction which is behind the jump, we make sure
+        // that we take into account all the alignBytes we removed until that point. Hence " - alignBytesRemoved"
+
+        loopIGOffset = loopHeadIG->igOffs - alignBytesRemoved;
 
         // igSize also includes INS_align instruction, take it off.
         loopIGOffset -= estimatedPaddingNeeded;
 
         // IG can be marked as not needing alignment if during setting igLoopBackEdge, it is detected
         // that the igLoopBackEdge encloses an IG that is marked for alignment.
+
         unsigned actualPaddingNeeded =
-            alignIG->isLoopAlign() ? emitCalculatePaddingForLoopAlignment(alignIG, loopIGOffset DEBUG_ARG(false)) : 0;
+            containingIG->endsWithAlignInstr()
+                ? emitCalculatePaddingForLoopAlignment(loopHeadIG, loopIGOffset DEBUG_ARG(false))
+                : 0;
 
         assert(estimatedPaddingNeeded >= actualPaddingNeeded);
 
@@ -5254,15 +5316,15 @@ void emitter::emitLoopAlignAdjustments()
 
         if (diff != 0)
         {
-            alignIG->igSize -= diff;
+            containingIG->igSize -= diff;
             alignBytesRemoved += diff;
             emitTotalCodeSize -= diff;
 
             // Update the flags
-            alignIG->igFlags |= IGF_UPD_ISZ;
+            containingIG->igFlags |= IGF_UPD_ISZ;
             if (actualPaddingNeeded == 0)
             {
-                alignIG->igFlags &= ~IGF_LOOP_ALIGN;
+                alignInstr->removeAlignFlags();
             }
 
 #ifdef TARGET_XARCH
@@ -5312,21 +5374,19 @@ void emitter::emitLoopAlignAdjustments()
                 }
                 assert(paddingToAdj == 0);
                 assert(instrAdjusted == 0);
-
-                // fast forward the align instruction to next IG
-                alignInstr = prevAlignInstr;
             }
 
-            JITDUMP("Adjusted alignment of %s from %u to %u.\n", emitLabelString(alignIG), estimatedPaddingNeeded,
+            JITDUMP("Adjusted alignment for %s from %u to %u.\n", emitLabelString(loopHeadIG), estimatedPaddingNeeded,
                     actualPaddingNeeded);
-            JITDUMP("Adjusted size of %s from %u to %u.\n", emitLabelString(alignIG), (alignIG->igSize + diff),
-                    alignIG->igSize);
+            JITDUMP("Adjusted size of %s from %u to %u.\n", emitLabelString(containingIG),
+                    (containingIG->igSize + diff), containingIG->igSize);
         }
 
         // Adjust the offset of all IGs starting from next IG until we reach the IG having the next
         // align instruction or the end of IG list.
-        insGroup* adjOffIG     = alignIG->igNext;
-        insGroup* adjOffUptoIG = alignInstr->idaNext != nullptr ? alignInstr->idaNext->idaIG : emitIGlast;
+        insGroup*       adjOffIG     = containingIG->igNext;
+        instrDescAlign* nextAlign    = emitAlignInNextIG(alignInstr);
+        insGroup*       adjOffUptoIG = nextAlign != nullptr ? nextAlign->idaIG : emitIGlast;
         while ((adjOffIG != nullptr) && (adjOffIG->igNum <= adjOffUptoIG->igNum))
         {
             JITDUMP("Adjusted offset of %s from %04X to %04X\n", emitLabelString(adjOffIG), adjOffIG->igOffs,
@@ -5335,11 +5395,14 @@ void emitter::emitLoopAlignAdjustments()
             adjOffIG = adjOffIG->igNext;
         }
 
+        alignInstr = nextAlign;
+
         if (actualPaddingNeeded > 0)
         {
-            // Record the last IG that has align instruction. No overestimation
+            // Record the last loop IG that will be aligned. No overestimation
             // adjustment will be done after emitLastAlignedIgNum.
-            emitLastAlignedIgNum = alignIG->igNum;
+            JITDUMP("Recording last aligned IG: %s\n", emitLabelString(loopHeadPredIG));
+            emitLastAlignedIgNum = loopHeadPredIG->igNum;
         }
     }
 
@@ -5353,7 +5416,7 @@ void emitter::emitLoopAlignAdjustments()
 //    end of 'ig' so the loop that starts after 'ig' is aligned.
 //
 //  Arguments:
-//       ig              - The IG having 'align' instruction in the end.
+//       loopHeadIG      - The IG that has the loop head that need to be aligned.
 //       offset          - The offset at which the IG that follows 'ig' starts.
 //       isAlignAdjusted - Determine if adjustments are done to the align instructions or not.
 //                         During generating code, it is 'false' (because we haven't adjusted the size yet).
@@ -5381,15 +5444,15 @@ void emitter::emitLoopAlignAdjustments()
 //     3b. If the loop already fits in minimum alignmentBoundary blocks, then return 0. // already best aligned
 //     3c. return paddingNeeded.
 //
-unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offset DEBUG_ARG(bool isAlignAdjusted))
+unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* loopHeadIG,
+                                                       size_t offset DEBUG_ARG(bool isAlignAdjusted))
 {
-    assert(ig->isLoopAlign());
     unsigned alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary;
 
     // No padding if loop is already aligned
     if ((offset & (alignmentBoundary - 1)) == 0)
     {
-        JITDUMP(";; Skip alignment: 'Loop at %s already aligned at %dB boundary.'\n", emitLabelString(ig->igNext),
+        JITDUMP(";; Skip alignment: 'Loop at %s already aligned at %dB boundary.'\n", emitLabelString(loopHeadIG),
                 alignmentBoundary);
         return 0;
     }
@@ -5409,12 +5472,12 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs
         maxLoopSize = emitComp->opts.compJitAlignLoopMaxCodeSize;
     }
 
-    unsigned loopSize = getLoopSize(ig->igNext, maxLoopSize DEBUG_ARG(isAlignAdjusted));
+    unsigned loopSize = getLoopSize(loopHeadIG, maxLoopSize DEBUG_ARG(isAlignAdjusted));
 
     // No padding if loop is big
     if (loopSize > maxLoopSize)
     {
-        JITDUMP(";; Skip alignment: 'Loop at %s is big. LoopSize= %d, MaxLoopSize= %d.'\n", emitLabelString(ig->igNext),
+        JITDUMP(";; Skip alignment: 'Loop at %s is big. LoopSize= %d, MaxLoopSize= %d.'\n", emitLabelString(loopHeadIG),
                 loopSize, maxLoopSize);
         return 0;
     }
@@ -5451,7 +5514,7 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs
             {
                 skipPadding = true;
                 JITDUMP(";; Skip alignment: 'Loop at %s already aligned at %uB boundary.'\n",
-                        emitLabelString(ig->igNext), alignmentBoundary);
+                        emitLabelString(loopHeadIG), alignmentBoundary);
             }
             // Check if the alignment exceeds new maxPadding limit
             else if (nPaddingBytes > nMaxPaddingBytes)
@@ -5459,7 +5522,7 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs
                 skipPadding = true;
                 JITDUMP(";; Skip alignment: 'Loop at %s PaddingNeeded= %d, MaxPadding= %d, LoopSize= %d, "
                         "AlignmentBoundary= %dB.'\n",
-                        emitLabelString(ig->igNext), nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary);
+                        emitLabelString(loopHeadIG), nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary);
             }
         }
 
@@ -5482,7 +5545,7 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs
             {
                 // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment.
                 JITDUMP(";; Skip alignment: 'Loop at %s is aligned to fit in %d blocks of %d chunks.'\n",
-                        emitLabelString(ig->igNext), minBlocksNeededForLoop, alignmentBoundary);
+                        emitLabelString(loopHeadIG), minBlocksNeededForLoop, alignmentBoundary);
             }
         }
     }
@@ -5511,12 +5574,12 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs
         {
             // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment.
             JITDUMP(";; Skip alignment: 'Loop at %s is aligned to fit in %d blocks of %d chunks.'\n",
-                    emitLabelString(ig->igNext), minBlocksNeededForLoop, alignmentBoundary);
+                    emitLabelString(loopHeadIG), minBlocksNeededForLoop, alignmentBoundary);
         }
     }
 
     JITDUMP(";; Calculated padding to add %d bytes to align %s at %dB boundary.\n", paddingToAdd,
-            emitLabelString(ig->igNext), alignmentBoundary);
+            emitLabelString(loopHeadIG), alignmentBoundary);
 
     // Either no padding is added because it is too expensive or the offset gets aligned
     // to the alignment boundary
@@ -5525,6 +5588,25 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs
     return paddingToAdd;
 }
 
+// emitAlignInNextIG: On xarch, for adaptive alignment, this will usually return the next instruction in
+//                    'emitAlignList'. But for arm64 or non-adaptive alignment on xarch, where multiple
+//                    align instructions are emitted, this method will skip the 'align' instruction present
+//                    in the same IG and return the first instruction that is present in next IG.
+//  Arguments:
+//      alignInstr - Current 'align' instruction for which next IG's first 'align' should be returned.
+//
+emitter::instrDescAlign* emitter::emitAlignInNextIG(instrDescAlign* alignInstr)
+{
+    // If there are multiple align instructions, skip the align instructions after
+    // the first align instruction and fast forward to the next IG
+    insGroup* alignIG = alignInstr->idaIG;
+    while ((alignInstr != nullptr) && (alignInstr->idaNext != nullptr) && (alignInstr->idaNext->idaIG == alignIG))
+    {
+        alignInstr = alignInstr->idaNext;
+    }
+    return alignInstr != nullptr ? alignInstr->idaNext : nullptr;
+}
+
 #endif // FEATURE_LOOP_ALIGN
 
 void emitter::emitCheckFuncletBranch(instrDesc* jmp, insGroup* jmpIG)
diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
index 22e9cfd6c79e09..c851df9df80a1a 100644
--- a/src/coreclr/jit/emit.h
+++ b/src/coreclr/jit/emit.h
@@ -275,8 +275,8 @@ struct insGroup
 #define IGF_PLACEHOLDER 0x0100    // this is a placeholder group, to be filled in later
 #define IGF_EXTEND 0x0200         // this block is conceptually an extension of the previous block
                                   // and the emitter should continue to track GC info as if there was no new block.
-#define IGF_LOOP_ALIGN 0x0400     // this group contains alignment instruction(s) at the end; the next IG is the
-                                  // head of a loop that needs alignment.
+#define IGF_HAS_ALIGN 0x0400      // this group contains an alignment instruction(s) at the end to align either the next
+                                  // IG, or, if this IG contains with an unconditional branch, some subsequent IG.
 
 // Mask of IGF_* flags that should be propagated to new blocks when they are created.
 // This allows prologs and epilogs to be any number of IGs, but still be
@@ -349,9 +349,9 @@ struct insGroup
         return *(unsigned*)ptr;
     }
 
-    bool isLoopAlign()
+    bool endsWithAlignInstr() const
     {
-        return (igFlags & IGF_LOOP_ALIGN) != 0;
+        return (igFlags & IGF_HAS_ALIGN) != 0;
     }
 
 }; // end of struct insGroup
@@ -1383,14 +1383,29 @@ class emitter
 #if FEATURE_LOOP_ALIGN
     struct instrDescAlign : instrDesc
     {
-        instrDescAlign* idaNext; // next align in the group/method
-        insGroup*       idaIG;   // containing group
-    };
+        instrDescAlign* idaNext;           // next align in the group/method
+        insGroup*       idaIG;             // containing group
+        insGroup*       idaLoopHeadPredIG; // The IG before the loop IG.
+                                           // If no 'jmp' instructions were found until idaLoopHeadPredIG,
+                                           // then idaLoopHeadPredIG == idaIG.
+#ifdef DEBUG
+        bool isPlacedAfterJmp; // Is the 'align' instruction placed after jmp. Used to decide
+                               // if the instruction cost should be included in PerfScore
+                               // calculation or not.
+#endif
 
-    void emitCheckAlignFitInCurIG(unsigned short nAlignInstr);
-    void emitLoopAlign(unsigned short paddingBytes);
-    void emitLongLoopAlign(unsigned short alignmentBoundary);
+        inline insGroup* loopHeadIG()
+        {
+            assert(idaLoopHeadPredIG);
+            return idaLoopHeadPredIG->igNext;
+        }
 
+        void removeAlignFlags()
+        {
+            idaIG->igFlags &= ~IGF_HAS_ALIGN;
+        }
+    };
+    void emitCheckAlignFitInCurIG(unsigned nAlignInstr);
 #endif // FEATURE_LOOP_ALIGN
 
 #if !defined(TARGET_ARM64) // This shouldn't be needed for ARM32, either, but I don't want to touch the ARM32 JIT.
@@ -1780,15 +1795,26 @@ class emitter
     unsigned        emitLastLoopStart;    // Start IG of last inner loop
     unsigned        emitLastLoopEnd;      // End IG of last inner loop
     unsigned        emitLastAlignedIgNum; // last IG that has align instruction
-    instrDescAlign* emitAlignList;        // list of local align instructions in method
+    instrDescAlign* emitAlignList;        // list of all align instructions in method
     instrDescAlign* emitAlignLast;        // last align instruction in method
+
+    // Points to the most recent added align instruction. If there are multiple align instructions like in arm64 or
+    // non-adaptive alignment on xarch, this points to the first align instruction of the series of align instructions.
+    instrDescAlign* emitAlignLastGroup;
+
     unsigned getLoopSize(insGroup* igLoopHeader,
                          unsigned maxLoopSize DEBUG_ARG(bool isAlignAdjusted)); // Get the smallest loop size
-    void emitLoopAlignment();
+    void emitLoopAlignment(DEBUG_ARG1(bool isPlacedBehindJmp));
     bool emitEndsWithAlignInstr(); // Validate if newLabel is appropriate
     void emitSetLoopBackEdge(BasicBlock* loopTopBlock);
     void     emitLoopAlignAdjustments(); // Predict if loop alignment is needed and make appropriate adjustments
     unsigned emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offset DEBUG_ARG(bool isAlignAdjusted));
+
+    void emitLoopAlign(unsigned paddingBytes, bool isFirstAlign DEBUG_ARG(bool isPlacedBehindJmp));
+    void emitLongLoopAlign(unsigned alignmentBoundary DEBUG_ARG(bool isPlacedBehindJmp));
+    instrDescAlign* emitAlignInNextIG(instrDescAlign* alignInstr);
+    void emitConnectAlignInstrWithCurIG();
+
 #endif
 
     void emitCheckFuncletBranch(instrDesc* jmp, insGroup* jmpIG); // Check for illegal branches between funclets
diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp
index 1ae7aa873a0aef..3de8faafc13c6d 100644
--- a/src/coreclr/jit/emitarm64.cpp
+++ b/src/coreclr/jit/emitarm64.cpp
@@ -11439,13 +11439,13 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                 // IG can be marked as not needing alignment after emitting align instruction.
                 // Alternatively, there are fewer align instructions needed than emitted.
                 // If that is the case, skip outputting alignment.
-                if (!ig->isLoopAlign() || id->idIsEmptyAlign())
+                if (!ig->endsWithAlignInstr() || id->idIsEmptyAlign())
                 {
                     skipIns = true;
                 }
 
 #ifdef DEBUG
-                if (!ig->isLoopAlign())
+                if (!ig->endsWithAlignInstr())
                 {
                     // Validate if the state is correctly updated
                     assert(id->idIsEmptyAlign());
@@ -11453,6 +11453,23 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
 #endif
                 sz  = sizeof(instrDescAlign);
                 ins = INS_nop;
+
+#ifdef DEBUG
+                // Under STRESS_EMITTER, if this is the 'align' before the 'jmp' instruction,
+                // then add "bkpt" instruction.
+                instrDescAlign* alignInstr = (instrDescAlign*)id;
+
+                if (emitComp->compStressCompile(Compiler::STRESS_EMITTER, 50) &&
+                    (alignInstr->idaIG != alignInstr->idaLoopHeadPredIG) && !skipIns)
+                {
+                    // There is no good way to squeeze in "bkpt" as well as display it
+                    // in the disassembly because there is no corresponding instrDesc for
+                    // it. As such, leave it as is, the "0xD43E0000" bytecode will be seen
+                    // next to the nop instruction in disasm.
+                    // e.g. D43E0000          align   [4 bytes for IG07]
+                    ins = INS_BREAKPOINT;
+                }
+#endif
             }
 #endif // FEATURE_LOOP_ALIGN
 
@@ -13330,7 +13347,15 @@ void emitter::emitDispIns(
         case IF_SN_0A: // SN_0A   ................ ................
             if (ins == INS_align)
             {
-                printf("[%d bytes]", id->idIsEmptyAlign() ? 0 : INSTR_ENCODED_SIZE);
+                instrDescAlign* alignInstrId = (instrDescAlign*)id;
+                printf("[%d bytes", id->idIsEmptyAlign() ? 0 : INSTR_ENCODED_SIZE);
+
+                // targetIG is only set for 1st of the series of align instruction
+                if ((alignInstrId->idaLoopHeadPredIG != nullptr) && (alignInstrId->loopHeadIG() != nullptr))
+                {
+                    printf(" for IG%02u", alignInstrId->loopHeadIG()->igNum);
+                }
+                printf("]");
             }
             break;
 
@@ -14582,11 +14607,19 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
             break;
 
         case IF_SN_0A: // bkpt, brk, nop
-            if (id->idIsEmptyAlign())
+
+            if (id->idIns() == INS_align)
             {
-                // We're not going to generate any instruction, so it doesn't count for PerfScore.
-                result.insThroughput = PERFSCORE_THROUGHPUT_ZERO;
-                result.insLatency    = PERFSCORE_LATENCY_ZERO;
+                if ((id->idInsOpt() == INS_OPTS_NONE) || ((instrDescAlign*)id)->isPlacedAfterJmp)
+                {
+                    // Either we're not going to generate 'align' instruction, or the 'align'
+                    // instruction is placed immediately after unconditional jmp.
+                    // In both cases, don't count for PerfScore.
+
+                    result.insThroughput = PERFSCORE_THROUGHPUT_ZERO;
+                    result.insLatency    = PERFSCORE_LATENCY_ZERO;
+                    break;
+                }
             }
             else if (ins == INS_yield)
             {
@@ -14594,11 +14627,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
                 result.insThroughput = PERFSCORE_THROUGHPUT_140C;
                 result.insLatency    = PERFSCORE_LATENCY_140C;
             }
-            else
-            {
-                result.insThroughput = PERFSCORE_THROUGHPUT_2X;
-                result.insLatency    = PERFSCORE_LATENCY_ZERO;
-            }
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            result.insLatency    = PERFSCORE_LATENCY_ZERO;
             break;
 
         case IF_SI_0B: // dmb, dsb, isb
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 00c403d08768d9..8246f2de377fb2 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -9830,7 +9830,14 @@ void emitter::emitDispIns(
 #if FEATURE_LOOP_ALIGN
             if (ins == INS_align)
             {
-                printf("[%d bytes]", id->idCodeSize());
+                instrDescAlign* alignInstrId = (instrDescAlign*)id;
+                printf("[%d bytes", alignInstrId->idCodeSize());
+                // targetIG is only set for 1st of the series of align instruction
+                if ((alignInstrId->idaLoopHeadPredIG != nullptr) && (alignInstrId->loopHeadIG() != nullptr))
+                {
+                    printf(" for IG%02u", alignInstrId->loopHeadIG()->igNum);
+                }
+                printf("]");
             }
 #endif
             break;
@@ -10018,32 +10025,74 @@ static BYTE* emitOutputNOP(BYTE* dstRW, size_t nBytes)
 //
 BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst)
 {
+    instrDescAlign* alignInstr = (instrDescAlign*)id;
+
+#ifdef DEBUG
+    // For cases where 'align' was placed behing a 'jmp' in an IG that does not
+    // immediately preced the loop IG, we do not know in advance the offset of
+    // IG having loop. For such cases, skip the padding calculation validation.
+    bool validatePadding = alignInstr->idaIG == alignInstr->idaLoopHeadPredIG;
+#endif
+
     // Candidate for loop alignment
     assert(codeGen->ShouldAlignLoops());
-    assert(ig->isLoopAlign());
+    assert(ig->endsWithAlignInstr());
 
     unsigned paddingToAdd = id->idCodeSize();
 
     // Either things are already aligned or align them here.
-    assert((paddingToAdd == 0) || (((size_t)dst & (emitComp->opts.compJitAlignLoopBoundary - 1)) != 0));
+    assert(!validatePadding || (paddingToAdd == 0) ||
+           (((size_t)dst & (emitComp->opts.compJitAlignLoopBoundary - 1)) != 0));
 
     // Padding amount should not exceed the alignment boundary
     assert(0 <= paddingToAdd && paddingToAdd < emitComp->opts.compJitAlignLoopBoundary);
 
 #ifdef DEBUG
-    unsigned paddingNeeded = emitCalculatePaddingForLoopAlignment(ig, (size_t)dst, true);
-
-    // For non-adaptive, padding size is spread in multiple instructions, so don't bother checking
-    if (emitComp->opts.compJitAlignLoopAdaptive)
+    if (validatePadding)
     {
-        assert(paddingToAdd == paddingNeeded);
+        unsigned paddingNeeded =
+            emitCalculatePaddingForLoopAlignment(((instrDescAlign*)id)->idaIG->igNext, (size_t)dst, true);
+
+        // For non-adaptive, padding size is spread in multiple instructions, so don't bother checking
+        if (emitComp->opts.compJitAlignLoopAdaptive)
+        {
+            assert(paddingToAdd == paddingNeeded);
+        }
     }
 
     emitComp->loopsAligned++;
 #endif
 
     BYTE* dstRW = dst + writeableOffset;
-    dstRW       = emitOutputNOP(dstRW, paddingToAdd);
+
+#ifdef DEBUG
+    // Under STRESS_EMITTER, if this is the 'align' before the 'jmp' instruction,
+    // then add "int3" instruction. Since int3 takes 1 byte, we would only add
+    // it if paddingToAdd >= 1 byte.
+
+    if (emitComp->compStressCompile(Compiler::STRESS_EMITTER, 50) && !validatePadding && paddingToAdd >= 1)
+    {
+        size_t int3Code = insCodeMR(INS_BREAKPOINT);
+        // There is no good way to squeeze in "int3" as well as display it
+        // in the disassembly because there is no corresponding instrDesc for
+        // it. As such, leave it as is, the "0xCC" bytecode will be seen next
+        // to the nop instruction in disasm.
+        // e.g. CC                   align    [1 bytes for IG29]
+        //
+        // if (emitComp->opts.disAsm)
+        //{
+        //    emitDispInsAddr(dstRW);
+
+        //    emitDispInsOffs(0, false);
+
+        //    printf("                      %-9s  ; stress-mode injected interrupt\n", "int3");
+        //}
+        dstRW += emitOutputByte(dstRW, int3Code);
+        paddingToAdd -= 1;
+    }
+#endif
+
+    dstRW = emitOutputNOP(dstRW, paddingToAdd);
     return dstRW - writeableOffset;
 }
 
@@ -13373,7 +13422,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                 sz = sizeof(instrDescAlign);
                 // IG can be marked as not needing alignment after emitting align instruction
                 // In such case, skip outputting alignment.
-                if (ig->isLoopAlign())
+                if (ig->endsWithAlignInstr())
                 {
                     dst = emitOutputAlign(ig, id, dst);
                 }
@@ -14823,9 +14872,12 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
     {
         case INS_align:
 #if FEATURE_LOOP_ALIGN
-            if (id->idCodeSize() == 0)
+            if ((id->idCodeSize() == 0) || ((instrDescAlign*)id)->isPlacedAfterJmp)
             {
-                // We're not going to generate any instruction, so it doesn't count for PerfScore.
+                // Either we're not going to generate 'align' instruction, or the 'align'
+                // instruction is placed immediately after unconditional jmp.
+                // In both cases, don't count for PerfScore.
+
                 result.insThroughput = PERFSCORE_THROUGHPUT_ZERO;
                 result.insLatency    = PERFSCORE_LATENCY_ZERO;
                 break;
diff --git a/src/coreclr/jit/fgbasic.cpp b/src/coreclr/jit/fgbasic.cpp
index 0f4631afd11ad1..04cf8cdf3cec98 100644
--- a/src/coreclr/jit/fgbasic.cpp
+++ b/src/coreclr/jit/fgbasic.cpp
@@ -4649,6 +4649,7 @@ void Compiler::fgRemoveBlock(BasicBlock* block, bool unreachable)
 
             if (block->isLoopAlign())
             {
+                loopAlignCandidates++;
                 succBlock->bbFlags |= BBF_LOOP_ALIGN;
                 JITDUMP("Propagating LOOP_ALIGN flag from " FMT_BB " to " FMT_BB " for loop# %d.", block->bbNum,
                         succBlock->bbNum, block->bbNatLoopNum);
@@ -4801,6 +4802,9 @@ void Compiler::fgRemoveBlock(BasicBlock* block, bool unreachable)
         block->bbFlags |= BBF_REMOVED;
     }
 
+    // If this was marked for alignment, remove it
+    block->unmarkLoopAlign(this DEBUG_ARG("Removed block"));
+
     if (bPrev != nullptr)
     {
         switch (bPrev->bbJumpKind)
diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h
index 658ba2a69186aa..82a70f01e7a007 100644
--- a/src/coreclr/jit/jitconfigvalues.h
+++ b/src/coreclr/jit/jitconfigvalues.h
@@ -67,6 +67,11 @@ CONFIG_INTEGER(JitAlignLoopAdaptive,
                W("JitAlignLoopAdaptive"),
                1) // If set, perform adaptive loop alignment that limits number of padding based on loop size.
 
+CONFIG_INTEGER(JitHideAlignBehindJmp,
+               W("JitHideAlignBehindJmp"),
+               1) // If set, try to hide align instruction (if any) behind an unconditional jump instruction (if any)
+                  // that is present before the loop start.
+
 // Print the alignment boundaries in disassembly.
 CONFIG_INTEGER(JitDasmWithAlignmentBoundaries, W("JitDasmWithAlignmentBoundaries"), 0)
 
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index 4c18c7f9188222..3840aec9208268 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -15320,11 +15320,8 @@ bool Compiler::fgFoldConditional(BasicBlock* block)
                          * Remove the loop from the table */
 
                         optLoopTable[loopNum].lpFlags |= LPFLG_REMOVED;
-#if FEATURE_LOOP_ALIGN
-                        optLoopTable[loopNum].lpTop->bbFlags &= ~BBF_LOOP_ALIGN;
-                        JITDUMP("Removing LOOP_ALIGN flag from bogus loop in " FMT_BB "\n",
-                                optLoopTable[loopNum].lpTop->bbNum);
-#endif
+
+                        optLoopTable[loopNum].lpTop->unmarkLoopAlign(this DEBUG_ARG("Bogus loop"));
 
 #ifdef DEBUG
                         if (verbose)
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index e3f0218bd7ba8b..7378561146d561 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -20,16 +20,16 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 
 void Compiler::optInit()
 {
-    optLoopsMarked = false;
-    fgHasLoops     = false;
+    optLoopsMarked      = false;
+    fgHasLoops          = false;
+    loopAlignCandidates = 0;
 
     /* Initialize the # of tracked loops to 0 */
     optLoopCount = 0;
     optLoopTable = nullptr;
 
 #ifdef DEBUG
-    loopAlignCandidates = 0;
-    loopsAligned        = 0;
+    loopsAligned = 0;
 #endif
 
     /* Keep track of the number of calls and indirect calls made by this method */
@@ -379,14 +379,7 @@ void Compiler::optUnmarkLoopBlocks(BasicBlock* begBlk, BasicBlock* endBlk)
 
     JITDUMP("\n");
 
-#if FEATURE_LOOP_ALIGN
-    if (begBlk->isLoopAlign())
-    {
-        // Clear the loop alignment bit on the head of a loop, since it's no longer a loop.
-        begBlk->bbFlags &= ~BBF_LOOP_ALIGN;
-        JITDUMP("Removing LOOP_ALIGN flag from removed loop in " FMT_BB "\n", begBlk->bbNum);
-    }
-#endif
+    begBlk->unmarkLoopAlign(this DEBUG_ARG("Removed loop"));
 }
 
 /*****************************************************************************************************
@@ -2516,9 +2509,15 @@ void Compiler::optIdentifyLoopsForAlignment()
                 weight_t    topWeight = top->getBBWeight(this);
                 if (topWeight >= (opts.compJitAlignLoopMinBlockWeight * BB_UNITY_WEIGHT))
                 {
-                    top->bbFlags |= BBF_LOOP_ALIGN;
-                    JITDUMP(FMT_LP " that starts at " FMT_BB " needs alignment, weight=" FMT_WT ".\n", loopInd,
-                            top->bbNum, topWeight);
+                    // Sometimes with JitOptRepeat > 1, we might end up finding the loops twice. In such
+                    // cases, make sure to count them just once.
+                    if (!top->isLoopAlign())
+                    {
+                        loopAlignCandidates++;
+                        top->bbFlags |= BBF_LOOP_ALIGN;
+                        JITDUMP(FMT_LP " that starts at " FMT_BB " needs alignment, weight=" FMT_WT ".\n", loopInd,
+                                top->bbNum, top->getBBWeight(this));
+                    }
                 }
                 else
                 {
@@ -3779,11 +3778,7 @@ PhaseStatus Compiler::optUnrollLoops()
 #if FEATURE_LOOP_ALIGN
         for (block = head->bbNext;; block = block->bbNext)
         {
-            if (block->isLoopAlign())
-            {
-                block->bbFlags &= ~BBF_LOOP_ALIGN;
-                JITDUMP("Removing LOOP_ALIGN flag from unrolled loop in " FMT_BB "\n", block->bbNum);
-            }
+            block->unmarkLoopAlign(this DEBUG_ARG("Unrolled loop"));
 
             if (block == bottom)
             {
@@ -7617,9 +7612,8 @@ void Compiler::AddContainsCallAllContainingLoops(unsigned lnum)
     if (optLoopTable[lnum].lpChild == BasicBlock::NOT_IN_LOOP)
     {
         BasicBlock* top = optLoopTable[lnum].lpTop;
-        top->bbFlags &= ~BBF_LOOP_ALIGN;
-        JITDUMP("Removing LOOP_ALIGN flag for " FMT_LP " that starts at " FMT_BB " because loop has a call.\n", lnum,
-                top->bbNum);
+
+        top->unmarkLoopAlign(this DEBUG_ARG("Loop with call"));
     }
 #endif