From ba3088ac45f6d3e4ba0411c885f72cd39675d828 Mon Sep 17 00:00:00 2001
From: Qiao Pengcheng <qiaopengcheng@loongson.cn>
Date: Thu, 18 Apr 2024 16:31:50 +0800
Subject: [PATCH 1/4] Adjust the calleeSavedRegs on top frame for
 LoongArch64/RISCV64 to support the GSCookie.

The frame layout:

    |                       |
    |-----------------------|
    |  incoming arguments   |
    +=======================+ <---- Caller's SP
    |  Varargs regs space   | // Only for varargs main functions; not used for LA64.
    |-----------------------|
    |    MonitorAcquired    | // 8 bytes; for synchronized methods
    |-----------------------|
    |        PSP slot       | // 8 bytes (omitted in NativeAOT ABI)
    |-----------------------|
    |Callee saved registers | // multiple of 8 bytes, not including FP/RA
    |-----------------------|
    |      Saved RA         | // 8 bytes
    |-----------------------|
    |      Saved FP         | // 8 bytes
    |-----------------------|
    |  possible GS cookie   |
    |-----------------------|
    | locals, temps, etc.   |
    |-----------------------|
    |  possible GS cookie   |
    |-----------------------|
    |   Outgoing arg space  | // multiple of 8 bytes; if required (i.e., #outsz != 0)
    |-----------------------| <---- Ambient SP
    |       |               |
    ~       | Stack grows   ~
    |       | downward      |

---
 src/coreclr/jit/codegen.h              |  23 +-
 src/coreclr/jit/codegencommon.cpp      |  51 ++-
 src/coreclr/jit/codegenloongarch64.cpp | 587 +++++++++----------------
 src/coreclr/jit/codegenriscv64.cpp     | 485 ++++++++------------
 src/coreclr/jit/compiler.cpp           |   8 +-
 src/coreclr/jit/lclvars.cpp            |  59 ++-
 src/coreclr/jit/regset.h               |   3 +-
 7 files changed, 487 insertions(+), 729 deletions(-)

diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h
index 3511935a062b0a..f5e0dc857a7d4c 100644
--- a/src/coreclr/jit/codegen.h
+++ b/src/coreclr/jit/codegen.h
@@ -437,7 +437,7 @@ class CodeGen final : public CodeGenInterface
 
     FuncletFrameInfoDsc genFuncletInfo;
 
-#elif defined(TARGET_LOONGARCH64)
+#elif defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
 
     // A set of information that is used by funclet prolog and epilog generation.
     // It is collected once, before funclet prologs and epilogs are generated,
@@ -448,26 +448,6 @@ class CodeGen final : public CodeGenInterface
         int fiFunction_CallerSP_to_FP_delta; // Delta between caller SP and the frame pointer in the parent function
                                              // (negative)
         int fiSP_to_CalleeSaved_delta;       // CalleeSaved register save offset from SP (positive)
-        int fiCalleeSavedPadding;            // CalleeSaved offset padding (positive)
-        int fiSP_to_PSP_slot_delta;          // PSP slot offset from SP (positive)
-        int fiCallerSP_to_PSP_slot_delta;    // PSP slot offset from Caller SP (negative)
-        int fiSpDelta;                       // Stack pointer delta (negative)
-    };
-
-    FuncletFrameInfoDsc genFuncletInfo;
-
-#elif defined(TARGET_RISCV64)
-
-    // A set of information that is used by funclet prolog and epilog generation.
-    // It is collected once, before funclet prologs and epilogs are generated,
-    // and used by all funclet prologs and epilogs, which must all be the same.
-    struct FuncletFrameInfoDsc
-    {
-        regMaskTP fiSaveRegs;                // Set of callee-saved registers saved in the funclet prolog (includes RA)
-        int fiFunction_CallerSP_to_FP_delta; // Delta between caller SP and the frame pointer in the parent function
-                                             // (negative)
-        int fiSP_to_CalleeSaved_delta;       // CalleeSaved register save offset from SP (positive)
-        int fiCalleeSavedPadding;            // CalleeSaved offset padding (positive)
         int fiSP_to_PSP_slot_delta;          // PSP slot offset from SP (positive)
         int fiCallerSP_to_PSP_slot_delta;    // PSP slot offset from Caller SP (negative)
         int fiSpDelta;                       // Stack pointer delta (negative)
@@ -1272,7 +1252,6 @@ class CodeGen final : public CodeGenInterface
     void        genJmpMethod(GenTree* jmp);
     BasicBlock* genCallFinally(BasicBlock* block);
 #if defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
-    // TODO: refactor for LA.
     void genCodeForJumpCompare(GenTreeOpCC* tree);
 #endif
 #if defined(TARGET_ARM64)
diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp
index 417a7e6f31695d..98634c890530b2 100644
--- a/src/coreclr/jit/codegencommon.cpp
+++ b/src/coreclr/jit/codegencommon.cpp
@@ -4086,7 +4086,7 @@ void CodeGen::genEnregisterOSRArgsAndLocals()
 
         GetEmitter()->emitIns_R_AR(ins_Load(lclTyp), size, varDsc->GetRegNum(), genFramePointerReg(), offset);
 
-#elif defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
+#elif defined(TARGET_ARM64)
 
         // Patchpoint offset is from top of Tier0 frame
         //
@@ -4118,7 +4118,37 @@ void CodeGen::genEnregisterOSRArgsAndLocals()
 
         genInstrWithConstant(ins_Load(lclTyp), size, varDsc->GetRegNum(), genFramePointerReg(), offset, initReg);
         *pInitRegZeroed = false;
-#endif // TARGET_ARM64 || TARGET_LOONGARCH64 || TARGET_RISCV64
+#elif defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
+
+        // Patchpoint offset is from top of Tier0 frame
+        //
+        // We need to determine the frame-pointer relative
+        // offset for this variable in the osr frame.
+        //
+        // First, there is no need to adjust stkOffs,
+        // as it is relative to sp within the Tier0 frame
+
+        // then add the OSR frame size
+        //
+        const int osrFrameSize = genTotalFrameSize();
+
+        // then subtract OSR SP-FP delta
+        //
+        const int osrSpToFpDelta = genSPtoFPdelta();
+
+        //                 | => tier0 top of frame relative
+        //                 |         + => osr bottom of frame (sp) relative
+        //                 |         |              - => osr fp relative
+        //                 |         |              |
+        const int offset = stkOffs + osrFrameSize - osrSpToFpDelta;
+
+        JITDUMP("---OSR--- V%02u (reg) Tier0 virtual offset %d OSR frame size %d OSR sp-fp "
+                "delta %d total offset %d (0x%x)\n",
+                varNum, stkOffs, osrFrameSize, osrSpToFpDelta, offset, offset);
+
+        genInstrWithConstant(ins_Load(lclTyp), size, varDsc->GetRegNum(), genFramePointerReg(), offset, initReg);
+        *pInitRegZeroed = false;
+#endif // TARGET_LOONGARCH64 || TARGET_RISCV64
     }
 }
 
@@ -4744,20 +4774,13 @@ void CodeGen::genFinalizeFrame()
 #endif // defined(TARGET_XARCH)
 
 #if defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
-    if (isFramePointerUsed())
-    {
-        // For a FP based frame we have to push/pop the FP register
-        //
-        maskCalleeRegsPushed |= RBM_FPBASE;
+    // This assert checks that we are not using REG_FP as a codegen register
+    assert(!regSet.rsRegsModified(RBM_FPBASE));
 
-        // This assert check that we are not using REG_FP
-        // as both the frame pointer and as a codegen register
-        //
-        assert(!regSet.rsRegsModified(RBM_FPBASE));
-    }
+    assert(isFramePointerUsed());
+    // we always push FP/RA.  See genPushCalleeSavedRegisters
+    maskCalleeRegsPushed |= (RBM_FPBASE | RBM_RA);
 
-    // we always push RA.  See genPushCalleeSavedRegisters
-    maskCalleeRegsPushed |= RBM_RA;
 #endif // TARGET_LOONGARCH64 || TARGET_RISCV64
 
     compiler->compCalleeRegsPushed = genCountBits(maskCalleeRegsPushed);
diff --git a/src/coreclr/jit/codegenloongarch64.cpp b/src/coreclr/jit/codegenloongarch64.cpp
index 2954a989c74668..fd5114ce06153e 100644
--- a/src/coreclr/jit/codegenloongarch64.cpp
+++ b/src/coreclr/jit/codegenloongarch64.cpp
@@ -190,8 +190,7 @@ void CodeGen::genStackPointerAdjustment(ssize_t spDelta, regNumber tmpReg, bool*
 //    reg1                     - First register of pair to save.
 //    reg2                     - Second register of pair to save.
 //    spOffset                 - The offset from SP to store reg1 (must be positive or zero).
-//    spDelta                  - If non-zero, the amount to add to SP before the register saves (must be negative or
-//                               zero).
+//    spDelta                  - Always zero for LoongArch64 now.
 //    useSaveNextPair          - True if the last prolog instruction was to save the previous register pair. This
 //                               allows us to emit the "save_next" unwind code.
 //    tmpReg                   - An available temporary register. Needed for the case of large frames.
@@ -210,8 +209,7 @@ void CodeGen::genPrologSaveRegPair(regNumber reg1,
                                    bool*     pTmpRegIsZero)
 {
     assert(spOffset >= 0);
-    assert(spDelta <= 0);
-    assert((spDelta % 16) == 0);                                  // SP changes must be 16-byte aligned
+    assert(spDelta == 0);
     assert(genIsValidFloatReg(reg1) == genIsValidFloatReg(reg2)); // registers must be both general-purpose, or both
                                                                   // FP/SIMD
 
@@ -221,16 +219,6 @@ void CodeGen::genPrologSaveRegPair(regNumber reg1,
         ins = INS_fst_d;
     }
 
-    if (spDelta != 0)
-    {
-        // generate addi.d SP,SP,-imm
-        genStackPointerAdjustment(spDelta, tmpReg, pTmpRegIsZero, /* reportUnwindData */ true);
-
-        assert((spDelta + spOffset + 16) <= 0);
-
-        assert(spOffset <= 2031); // 2047-16
-    }
-
     GetEmitter()->emitIns_R_R_I(ins, EA_PTRSIZE, reg1, REG_SPBASE, spOffset);
     compiler->unwindSaveReg(reg1, spOffset);
 
@@ -249,8 +237,7 @@ void CodeGen::genPrologSaveRegPair(regNumber reg1,
 // Arguments:
 //    reg1                     - Register to save.
 //    spOffset                 - The offset from SP to store reg1 (must be positive or zero).
-//    spDelta                  - If non-zero, the amount to add to SP before the register saves (must be negative or
-//                               zero).
+//    spDelta                  - Always zero for LoongArch64 now.
 //    tmpReg                   - An available temporary register. Needed for the case of large frames.
 //    pTmpRegIsZero            - If we use tmpReg, and pTmpRegIsZero is non-null, we set *pTmpRegIsZero to 'false'.
 //                               Otherwise, we don't touch it.
@@ -261,8 +248,7 @@ void CodeGen::genPrologSaveRegPair(regNumber reg1,
 void CodeGen::genPrologSaveReg(regNumber reg1, int spOffset, int spDelta, regNumber tmpReg, bool* pTmpRegIsZero)
 {
     assert(spOffset >= 0);
-    assert(spDelta <= 0);
-    assert((spDelta % 16) == 0); // SP changes must be 16-byte aligned
+    assert(spDelta == 0);
 
     instruction ins = INS_st_d;
     if (genIsValidFloatReg(reg1))
@@ -270,12 +256,6 @@ void CodeGen::genPrologSaveReg(regNumber reg1, int spOffset, int spDelta, regNum
         ins = INS_fst_d;
     }
 
-    if (spDelta != 0)
-    {
-        // generate addi.d SP,SP,-imm
-        genStackPointerAdjustment(spDelta, tmpReg, pTmpRegIsZero, /* reportUnwindData */ true);
-    }
-
     GetEmitter()->emitIns_R_R_I(ins, EA_PTRSIZE, reg1, REG_SPBASE, spOffset);
     compiler->unwindSaveReg(reg1, spOffset);
 }
@@ -290,8 +270,7 @@ void CodeGen::genPrologSaveReg(regNumber reg1, int spOffset, int spDelta, regNum
 //    reg1                     - First register of pair to restore.
 //    reg2                     - Second register of pair to restore.
 //    spOffset                 - The offset from SP to load reg1 (must be positive or zero).
-//    spDelta                  - If non-zero, the amount to add to SP after the register restores (must be positive or
-//                               zero).
+//    spDelta                  - Always zero for LoongArch64 now.
 //    useSaveNextPair          - True if the last prolog instruction was to save the previous register pair. This
 //                               allows us to emit the "save_next" unwind code.
 //    tmpReg                   - An available temporary register. Needed for the case of large frames.
@@ -310,8 +289,7 @@ void CodeGen::genEpilogRestoreRegPair(regNumber reg1,
                                       bool*     pTmpRegIsZero)
 {
     assert(spOffset >= 0);
-    assert(spDelta >= 0);
-    assert((spDelta % 16) == 0);                                  // SP changes must be 16-byte aligned
+    assert(spDelta == 0);
     assert(genIsValidFloatReg(reg1) == genIsValidFloatReg(reg2)); // registers must be both general-purpose, or both
                                                                   // FP/SIMD
 
@@ -321,27 +299,11 @@ void CodeGen::genEpilogRestoreRegPair(regNumber reg1,
         ins = INS_fld_d;
     }
 
-    if (spDelta != 0)
-    {
-        assert(!useSaveNextPair);
-
-        GetEmitter()->emitIns_R_R_I(ins, EA_PTRSIZE, reg2, REG_SPBASE, spOffset + 8);
-        compiler->unwindSaveReg(reg2, spOffset + 8);
-
-        GetEmitter()->emitIns_R_R_I(ins, EA_PTRSIZE, reg1, REG_SPBASE, spOffset);
-        compiler->unwindSaveReg(reg1, spOffset);
-
-        // generate addi.d SP,SP,imm
-        genStackPointerAdjustment(spDelta, tmpReg, pTmpRegIsZero, /* reportUnwindData */ true);
-    }
-    else
-    {
-        GetEmitter()->emitIns_R_R_I(ins, EA_PTRSIZE, reg2, REG_SPBASE, spOffset + 8);
-        compiler->unwindSaveReg(reg2, spOffset + 8);
+    GetEmitter()->emitIns_R_R_I(ins, EA_PTRSIZE, reg2, REG_SPBASE, spOffset + 8);
+    compiler->unwindSaveReg(reg2, spOffset + 8);
 
-        GetEmitter()->emitIns_R_R_I(ins, EA_PTRSIZE, reg1, REG_SPBASE, spOffset);
-        compiler->unwindSaveReg(reg1, spOffset);
-    }
+    GetEmitter()->emitIns_R_R_I(ins, EA_PTRSIZE, reg1, REG_SPBASE, spOffset);
+    compiler->unwindSaveReg(reg1, spOffset);
 }
 
 //------------------------------------------------------------------------
@@ -350,8 +312,7 @@ void CodeGen::genEpilogRestoreRegPair(regNumber reg1,
 // Arguments:
 //    reg1                     - Register to restore.
 //    spOffset                 - The offset from SP to restore reg1 (must be positive or zero).
-//    spDelta                  - If non-zero, the amount to add to SP after the register restores (must be positive or
-//                               zero).
+//    spDelta                  - Always zero for LoongArch64 now.
 //    tmpReg                   - An available temporary register. Needed for the case of large frames.
 //    pTmpRegIsZero            - If we use tmpReg, and pTmpRegIsZero is non-null, we set *pTmpRegIsZero to 'false'.
 //                               Otherwise, we don't touch it.
@@ -362,8 +323,7 @@ void CodeGen::genEpilogRestoreRegPair(regNumber reg1,
 void CodeGen::genEpilogRestoreReg(regNumber reg1, int spOffset, int spDelta, regNumber tmpReg, bool* pTmpRegIsZero)
 {
     assert(spOffset >= 0);
-    assert(spDelta >= 0);
-    assert((spDelta % 16) == 0); // SP changes must be 16-byte aligned
+    assert(spDelta == 0);
 
     instruction ins = INS_ld_d;
     if (genIsValidFloatReg(reg1))
@@ -371,20 +331,8 @@ void CodeGen::genEpilogRestoreReg(regNumber reg1, int spOffset, int spDelta, reg
         ins = INS_fld_d;
     }
 
-    if (spDelta != 0)
-    {
-        // ld.d reg1,SP,offset
-        GetEmitter()->emitIns_R_R_I(ins, EA_PTRSIZE, reg1, REG_SPBASE, spOffset);
-        compiler->unwindSaveReg(reg1, spOffset);
-
-        // generate addi.d SP,SP,imm
-        genStackPointerAdjustment(spDelta, tmpReg, pTmpRegIsZero, /* reportUnwindData */ true);
-    }
-    else
-    {
-        GetEmitter()->emitIns_R_R_I(ins, EA_PTRSIZE, reg1, REG_SPBASE, spOffset);
-        compiler->unwindSaveReg(reg1, spOffset);
-    }
+    GetEmitter()->emitIns_R_R_I(ins, EA_PTRSIZE, reg1, REG_SPBASE, spOffset);
+    compiler->unwindSaveReg(reg1, spOffset);
 }
 
 //------------------------------------------------------------------------
@@ -519,12 +467,13 @@ int CodeGen::genGetSlotSizeForRegsInMask(regMaskTP regsMask)
 // genSaveCalleeSavedRegisterGroup: Saves the group of registers described by the mask.
 //
 // Arguments:
-//   regsMask             - a mask of registers for prolog generation;
-//   spDelta              - if non-zero, the amount to add to SP before the first register save (or together with it);
-//   spOffset             - the offset from SP that is the beginning of the callee-saved register area;
+//   regsMask  - a mask of registers for prolog generation;
+//   spDelta   - Always zero for LoongArch64 now.
+//   spOffset  - the offset from SP that is the beginning of the callee-saved register area;
 //
 void CodeGen::genSaveCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta, int spOffset)
 {
+    assert(spDelta == 0);
     const int slotSize = genGetSlotSizeForRegsInMask(regsMask);
 
     ArrayStack<RegPair> regStack(compiler->getAllocator(CMK_Codegen));
@@ -536,19 +485,16 @@ void CodeGen::genSaveCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta, i
         if (regPair.reg2 != REG_NA)
         {
             // We can use two SD instructions.
-            genPrologSaveRegPair(regPair.reg1, regPair.reg2, spOffset, spDelta, regPair.useSaveNextPair, REG_R21,
-                                 nullptr);
+            genPrologSaveRegPair(regPair.reg1, regPair.reg2, spOffset, 0, regPair.useSaveNextPair, REG_R21, nullptr);
 
             spOffset += 2 * slotSize;
         }
         else
         {
             // No register pair; we use a SD instruction.
-            genPrologSaveReg(regPair.reg1, spOffset, spDelta, REG_R21, nullptr);
+            genPrologSaveReg(regPair.reg1, spOffset, 0, REG_R21, nullptr);
             spOffset += slotSize;
         }
-
-        spDelta = 0; // We've now changed SP already, if necessary; don't do it again.
     }
 }
 
@@ -574,34 +520,22 @@ void CodeGen::genSaveCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta, i
 //
 // Arguments:
 //    regsToSaveMask          - The mask of callee-saved registers to save. If empty, this function does nothing.
-//    lowestCalleeSavedOffset - The offset from SP that is the beginning of the callee-saved register area. Note that
-//                              if non-zero spDelta, then this is the offset of the first save *after* that
-//                              SP adjustment.
-//    spDelta                 - If non-zero, the amount to add to SP before the register saves (must be negative or
-//                              zero).
+//    lowestCalleeSavedOffset - The offset from SP that is the beginning of the callee-saved register area.
+//    spDelta                 - Always zero for LoongArch64 now.
 //
 // Notes:
 //    The save set can not contain FP/RA in which case FP/RA is saved along with the other callee-saved registers.
 //
 void CodeGen::genSaveCalleeSavedRegistersHelp(regMaskTP regsToSaveMask, int lowestCalleeSavedOffset, int spDelta)
 {
-    assert(spDelta <= 0);
+    assert(spDelta == 0);
 
-    unsigned regsToSaveCount = genCountBits(regsToSaveMask);
-    if (regsToSaveCount == 0)
+    if (regsToSaveMask == 0)
     {
-        if (spDelta != 0)
-        {
-            // Currently this is the case for varargs only
-            // whose size is MAX_REG_ARG * REGSIZE_BYTES = 64 bytes.
-            genStackPointerAdjustment(spDelta, REG_R21, nullptr, /* reportUnwindData */ true);
-        }
         return;
     }
 
-    assert((spDelta % 16) == 0);
-
-    assert(regsToSaveCount <= genCountBits(RBM_CALLEE_SAVED));
+    assert(genCountBits(regsToSaveMask) <= genCountBits(RBM_CALLEE_SAVED));
 
     // Save integer registers at higher addresses than floating-point registers.
 
@@ -610,15 +544,14 @@ void CodeGen::genSaveCalleeSavedRegistersHelp(regMaskTP regsToSaveMask, int lowe
 
     if (maskSaveRegsFloat != RBM_NONE)
     {
-        genSaveCalleeSavedRegisterGroup(maskSaveRegsFloat, spDelta, lowestCalleeSavedOffset);
-        spDelta = 0;
+        genSaveCalleeSavedRegisterGroup(maskSaveRegsFloat, 0, lowestCalleeSavedOffset);
         lowestCalleeSavedOffset += genCountBits(maskSaveRegsFloat) * FPSAVE_REGSIZE_BYTES;
     }
 
     if (maskSaveRegsInt != RBM_NONE)
     {
-        genSaveCalleeSavedRegisterGroup(maskSaveRegsInt, spDelta, lowestCalleeSavedOffset);
-        // No need to update spDelta, lowestCalleeSavedOffset since they're not used after this.
+        // No need to update spDelta.
+        genSaveCalleeSavedRegisterGroup(maskSaveRegsInt, 0, lowestCalleeSavedOffset);
     }
 }
 
@@ -627,11 +560,12 @@ void CodeGen::genSaveCalleeSavedRegistersHelp(regMaskTP regsToSaveMask, int lowe
 //
 // Arguments:
 //   regsMask             - a mask of registers for epilog generation;
-//   spDelta              - if non-zero, the amount to add to SP after the last register restore (or together with it);
+//   spDelta              - Always zero for LoongArch64 now.
 //   spOffset             - the offset from SP that is the beginning of the callee-saved register area;
 //
 void CodeGen::genRestoreCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta, int spOffset)
 {
+    assert(spDelta == 0);
     const int slotSize = genGetSlotSizeForRegsInMask(regsMask);
 
     ArrayStack<RegPair> regStack(compiler->getAllocator(CMK_Codegen));
@@ -640,15 +574,6 @@ void CodeGen::genRestoreCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta
     int stackDelta = 0;
     for (int i = 0; i < regStack.Height(); ++i)
     {
-        bool lastRestoreInTheGroup = (i == regStack.Height() - 1);
-        bool updateStackDelta      = lastRestoreInTheGroup && (spDelta != 0);
-        if (updateStackDelta)
-        {
-            // Update stack delta only if it is the last restore (the first save).
-            assert(stackDelta == 0);
-            stackDelta = spDelta;
-        }
-
         RegPair regPair = regStack.Top(i);
         if (regPair.reg2 != REG_NA)
         {
@@ -670,10 +595,9 @@ void CodeGen::genRestoreCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta
 // in the function or funclet epilog. This exactly reverses the actions of genSaveCalleeSavedRegistersHelp().
 //
 // Arguments:
-//    regsToRestoreMask       - The mask of callee-saved registers to restore. If empty, this function does nothing.
-//    lowestCalleeSavedOffset - The offset from SP that is the beginning of the callee-saved register area.
-//    spDelta                 - If non-zero, the amount to add to SP after the register restores (must be positive or
-//                              zero).
+//   regsToRestoreMask       - The mask of callee-saved registers to restore. If empty, this function does nothing.
+//   lowestCalleeSavedOffset - The offset from SP that is the beginning of the callee-saved register area.
+//   spDelta                 - Always zero for LoongArch64 now.
 //
 // Here's an example restore sequence:
 //      ld.d    s8,sp,#xxx
@@ -694,23 +618,15 @@ void CodeGen::genRestoreCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta
 
 void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, int lowestCalleeSavedOffset, int spDelta)
 {
-    assert(spDelta >= 0);
-    unsigned regsToRestoreCount = genCountBits(regsToRestoreMask);
-    if (regsToRestoreCount == 0)
+    assert(spDelta == 0);
+    if (regsToRestoreMask == 0)
     {
-        if (spDelta != 0)
-        {
-            // Currently this is the case for varargs only
-            // whose size is MAX_REG_ARG * REGSIZE_BYTES = 64 bytes.
-            genStackPointerAdjustment(spDelta, REG_R21, nullptr, /* reportUnwindData */ true);
-        }
         return;
     }
 
-    assert((spDelta % 16) == 0);
-
-    // We also can restore FP and RA, even though they are not in RBM_CALLEE_SAVED.
-    assert(regsToRestoreCount <= genCountBits(RBM_CALLEE_SAVED | RBM_FP | RBM_RA));
+    unsigned regsToRestoreCount = genCountBits(regsToRestoreMask);
+    // The FP and RA are not in RBM_CALLEE_SAVED.
+    assert(regsToRestoreCount <= genCountBits(RBM_CALLEE_SAVED));
 
     // Point past the end, to start. We predecrement to find the offset to load from.
     static_assert_no_msg(REGSIZE_BYTES == FPSAVE_REGSIZE_BYTES);
@@ -725,15 +641,13 @@ void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, in
 
     if (maskRestoreRegsInt != RBM_NONE)
     {
-        int spIntDelta = (maskRestoreRegsFloat != RBM_NONE) ? 0 : spDelta; // should we delay the SP adjustment?
-        genRestoreCalleeSavedRegisterGroup(maskRestoreRegsInt, spIntDelta, spOffset);
+        genRestoreCalleeSavedRegisterGroup(maskRestoreRegsInt, 0, spOffset);
         spOffset -= genCountBits(maskRestoreRegsInt) * REGSIZE_BYTES;
     }
 
     if (maskRestoreRegsFloat != RBM_NONE)
     {
-        // If there is any spDelta, it must be used here.
-        genRestoreCalleeSavedRegisterGroup(maskRestoreRegsFloat, spDelta, spOffset);
+        genRestoreCalleeSavedRegisterGroup(maskRestoreRegsFloat, 0, spOffset);
         // No need to update spOffset since it's not used after this.
     }
 }
@@ -755,7 +669,7 @@ void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, in
  *     filter:         a0 = non-zero if the handler should handle the exception, zero otherwise (see GT_RETFILT)
  *     finally/fault:  none
  *
- *  The LOONGARCH64 funclet prolog is the following (Note: #framesz is total funclet frame size,
+ *  The LoongArch64 funclet prolog is the following (Note: #framesz is total funclet frame size,
  *  including everything; #outsz is outgoing argument space. #framesz must be a multiple of 16):
  *
  *  Frame type liking:
@@ -771,19 +685,17 @@ void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, in
  *      |-----------------------|
  *      |  incoming arguments   |
  *      +=======================+ <---- Caller's SP
- *      |      OSR padding      | // If required
- *      |-----------------------|
- *      |  Varargs regs space   | // Only for varargs main functions; 64 bytes
+ *      |  Varargs regs space   | // Only for varargs main functions; not used for LA64.
  *      |-----------------------|
  *      |    MonitorAcquired    | // 8 bytes; for synchronized methods
  *      |-----------------------|
  *      |        PSP slot       | // 8 bytes (omitted in NativeAOT ABI)
  *      |-----------------------|
- *      ~  alignment padding    ~ // To make the whole frame 16 byte aligned
+ *      |Callee saved registers | // multiple of 8 bytes, not including FP/RA
  *      |-----------------------|
  *      |      Saved FP, RA     | // 16 bytes
  *      |-----------------------|
- *      |Callee saved registers | // multiple of 8 bytes, not includting FP/RA
+ *      ~  alignment padding    ~ // To make the whole frame 16 byte aligned
  *      |-----------------------|
  *      |   Outgoing arg space  | // multiple of 8 bytes; if required (i.e., #outsz != 0)
  *      |-----------------------| <---- Ambient SP
@@ -793,38 +705,22 @@ void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, in
  *              V
  *
  *
- * Both #1 and #2 only change SP once. That means that there will be a maximum of one alignment slot needed. For the general case, #3,
- * it is possible that we will need to add alignment to both changes to SP, leading to 16 bytes of alignment. Remember that the stack
- * pointer needs to be 16 byte aligned at all times. The size of the PSP slot plus callee-saved registers space is a maximum of 232 bytes:
- *
- *     FP,RA registers
- *     9 int callee-saved register s0-s8
- *     8 float callee-saved registers f24-f31
- *     8 saved integer argument registers a0-a7, if varargs function support.
- *     1 PSP slot
- *     == 20 slots * 8 bytes = 160 bytes.
- *
  * The outgoing argument size, however, can be very large, if we call a function that takes a large number of
  * arguments (note that we currently use the same outgoing argument space size in the funclet as for the main
  * function, even if the funclet doesn't have any calls, or has a much smaller, or larger, maximum number of
- * outgoing arguments for any call). In that case, we need to 16-byte align the initial change to SP, before
- * saving off the callee-saved registers and establishing the PSPsym, so we can use the limited immediate offset
- * encodings we have available, before doing another 16-byte aligned SP adjustment to create the outgoing argument
- * space. Both changes to SP might need to add alignment padding.
- *
- * In addition to the above "standard" frames, we also need to support a frame where the saved FP/RA are at the
- * highest addresses. This is to match the frame layout (specifically, callee-saved registers including FP/RA
- * and the PSPSym) that is used in the main function when a GS cookie is required due to the use of localloc.
- * (Note that localloc cannot be used in a funclet.) In these variants, not only has the position of FP/RA
- * changed, but where the alignment padding is placed has also changed.
+ * outgoing arguments for any call).
  *
- *
- * Note that in all cases, the PSPSym is in exactly the same position with respect to Caller-SP, and that location is the same relative to Caller-SP
- * as in the main function.
+ * Note that in all cases, the PSPSym is in exactly the same position with respect to Caller-SP,
+ * and that location is the same relative to Caller-SP as in the main function, where it is placed
+ * above the callee-saved registers.
+ * That is to say, the PSPSym's offset relative to Caller-SP does not depend on the callee-saved registers.
+ * TODO-LoongArch64: the funclet's callee-saved registers should not be shared with the main function.
  *
  * Funclets do not have varargs arguments. However, because the PSPSym must exist at the same offset from Caller-SP as in the main function, we
  * must add buffer space for the saved varargs/argument registers here, if the main function did the same.
  *
+ * Note that localloc cannot be used in a funclet.
+ *
  *     ; After this header, fill the PSP slot, for use by the VM (it gets reported with the GC info), or by code generation of nested filters.
  *     ; This is not part of the "OS prolog"; it has no associated unwind data, and is not reversed in the funclet epilog.
  *
@@ -880,7 +776,9 @@ void CodeGen::genFuncletProlog(BasicBlock* block)
 {
 #ifdef DEBUG
     if (verbose)
+    {
         printf("*************** In genFuncletProlog()\n");
+    }
 #endif
 
     assert(block != NULL);
@@ -910,42 +808,39 @@ void CodeGen::genFuncletProlog(BasicBlock* block)
         maskArgRegsLiveIn = RBM_A0;
     }
 
-    regMaskTP maskSaveRegs  = genFuncletInfo.fiSaveRegs & RBM_CALLEE_SAVED;
-    int       regsSavedSize = (compiler->compCalleeRegsPushed - 2) << 3;
+    regMaskTP maskSaveRegs = genFuncletInfo.fiSaveRegs & RBM_CALLEE_SAVED;
+    int       FP_offset    = genFuncletInfo.fiSP_to_CalleeSaved_delta;
 
-    int SP_to_CalleeSaved_delta = genFuncletInfo.fiSP_to_CalleeSaved_delta;
-    if ((SP_to_CalleeSaved_delta + regsSavedSize + genFuncletInfo.fiCalleeSavedPadding) <= 2040)
+    if ((FP_offset + (genCountBits(maskSaveRegs) << 3)) <= (2040 - 16)) // no FP/RA.
     {
-        SP_to_CalleeSaved_delta += genFuncletInfo.fiCalleeSavedPadding;
-
         genStackPointerAdjustment(frameSize, REG_R21, nullptr, /* reportUnwindData */ true);
 
-        genSaveCalleeSavedRegistersHelp(maskSaveRegs, SP_to_CalleeSaved_delta, 0);
-        SP_to_CalleeSaved_delta += regsSavedSize;
+        GetEmitter()->emitIns_R_R_I(INS_st_d, EA_PTRSIZE, REG_FP, REG_SPBASE, FP_offset);
+        compiler->unwindSaveReg(REG_FP, FP_offset);
 
-        GetEmitter()->emitIns_R_R_I(INS_st_d, EA_PTRSIZE, REG_RA, REG_SPBASE, SP_to_CalleeSaved_delta);
-        compiler->unwindSaveReg(REG_RA, SP_to_CalleeSaved_delta);
+        GetEmitter()->emitIns_R_R_I(INS_st_d, EA_PTRSIZE, REG_RA, REG_SPBASE, FP_offset + 8);
+        compiler->unwindSaveReg(REG_RA, FP_offset + 8);
 
-        GetEmitter()->emitIns_R_R_I(INS_st_d, EA_PTRSIZE, REG_FP, REG_SPBASE, SP_to_CalleeSaved_delta + 8);
-        compiler->unwindSaveReg(REG_FP, SP_to_CalleeSaved_delta + 8);
+        genSaveCalleeSavedRegistersHelp(maskSaveRegs, FP_offset + 16, 0);
     }
     else
     {
         assert(frameSize < -2040);
 
-        int SP_delta = frameSize + SP_to_CalleeSaved_delta;
-        genStackPointerAdjustment(SP_delta, REG_R21, nullptr, /* reportUnwindData */ true);
+        genStackPointerAdjustment(frameSize + (FP_offset & -16), REG_R21, nullptr, true);
+
+        frameSize = -(FP_offset & -16);
+        FP_offset &= 0xf;
 
-        genSaveCalleeSavedRegistersHelp(maskSaveRegs, genFuncletInfo.fiCalleeSavedPadding, 0);
-        regsSavedSize += genFuncletInfo.fiCalleeSavedPadding;
+        GetEmitter()->emitIns_R_R_I(INS_st_d, EA_PTRSIZE, REG_FP, REG_SPBASE, FP_offset);
+        compiler->unwindSaveReg(REG_FP, FP_offset);
 
-        GetEmitter()->emitIns_R_R_I(INS_st_d, EA_PTRSIZE, REG_RA, REG_SPBASE, regsSavedSize);
-        compiler->unwindSaveReg(REG_RA, regsSavedSize);
+        GetEmitter()->emitIns_R_R_I(INS_st_d, EA_PTRSIZE, REG_RA, REG_SPBASE, FP_offset + 8);
+        compiler->unwindSaveReg(REG_RA, FP_offset + 8);
 
-        GetEmitter()->emitIns_R_R_I(INS_st_d, EA_PTRSIZE, REG_FP, REG_SPBASE, regsSavedSize + 8);
-        compiler->unwindSaveReg(REG_FP, regsSavedSize + 8);
+        genSaveCalleeSavedRegistersHelp(maskSaveRegs, FP_offset + 16, 0);
 
-        genStackPointerAdjustment(-SP_to_CalleeSaved_delta, REG_R21, nullptr, /* reportUnwindData */ true);
+        genStackPointerAdjustment(frameSize, REG_R21, nullptr, true);
     }
 
     // This is the end of the OS-reported prolog for purposes of unwinding
@@ -1012,41 +907,28 @@ void CodeGen::genFuncletEpilog()
     int frameSize = genFuncletInfo.fiSpDelta;
     assert(frameSize < 0);
 
-    regMaskTP regsToRestoreMask = genFuncletInfo.fiSaveRegs & RBM_CALLEE_SAVED;
-    int       regsRestoreSize   = (compiler->compCalleeRegsPushed - 2) << 3;
+    regMaskTP maskSaveRegs = genFuncletInfo.fiSaveRegs & RBM_CALLEE_SAVED;
+    int       FP_offset    = genFuncletInfo.fiSP_to_CalleeSaved_delta;
 
-    int SP_to_CalleeSaved_delta = genFuncletInfo.fiSP_to_CalleeSaved_delta;
-    if ((SP_to_CalleeSaved_delta + regsRestoreSize + genFuncletInfo.fiCalleeSavedPadding) <= 2040)
+    if ((FP_offset + (genCountBits(maskSaveRegs) << 3)) > (2040 - 16)) // no FP/RA.
     {
-        SP_to_CalleeSaved_delta += genFuncletInfo.fiCalleeSavedPadding;
-        genRestoreCalleeSavedRegistersHelp(regsToRestoreMask, SP_to_CalleeSaved_delta, 0);
-        SP_to_CalleeSaved_delta += regsRestoreSize;
-
-        GetEmitter()->emitIns_R_R_I(INS_ld_d, EA_PTRSIZE, REG_RA, REG_SPBASE, SP_to_CalleeSaved_delta);
-        compiler->unwindSaveReg(REG_RA, SP_to_CalleeSaved_delta);
+        assert(frameSize < -2040);
 
-        GetEmitter()->emitIns_R_R_I(INS_ld_d, EA_PTRSIZE, REG_FP, REG_SPBASE, SP_to_CalleeSaved_delta + 8);
-        compiler->unwindSaveReg(REG_FP, SP_to_CalleeSaved_delta + 8);
+        genStackPointerAdjustment(FP_offset & -16, REG_R21, nullptr, /* reportUnwindData */ true);
 
-        genStackPointerAdjustment(-frameSize, REG_R21, nullptr, /* reportUnwindData */ true);
+        frameSize += FP_offset & -16;
+        FP_offset = FP_offset & 0xf;
     }
-    else
-    {
-        assert(frameSize < -2040);
 
-        genStackPointerAdjustment(SP_to_CalleeSaved_delta, REG_R21, nullptr, /* reportUnwindData */ true);
+    genRestoreCalleeSavedRegistersHelp(maskSaveRegs, FP_offset + 16, 0);
 
-        genRestoreCalleeSavedRegistersHelp(regsToRestoreMask, genFuncletInfo.fiCalleeSavedPadding, 0);
-        regsRestoreSize += genFuncletInfo.fiCalleeSavedPadding;
+    GetEmitter()->emitIns_R_R_I(INS_ld_d, EA_PTRSIZE, REG_RA, REG_SPBASE, FP_offset + 8);
+    compiler->unwindSaveReg(REG_RA, FP_offset + 8);
 
-        GetEmitter()->emitIns_R_R_I(INS_ld_d, EA_PTRSIZE, REG_RA, REG_SPBASE, regsRestoreSize);
-        compiler->unwindSaveReg(REG_RA, regsRestoreSize);
+    GetEmitter()->emitIns_R_R_I(INS_ld_d, EA_PTRSIZE, REG_FP, REG_SPBASE, FP_offset);
+    compiler->unwindSaveReg(REG_FP, FP_offset);
 
-        GetEmitter()->emitIns_R_R_I(INS_ld_d, EA_PTRSIZE, REG_FP, REG_SPBASE, regsRestoreSize + 8);
-        compiler->unwindSaveReg(REG_FP, regsRestoreSize + 8);
-
-        genStackPointerAdjustment(-frameSize - SP_to_CalleeSaved_delta, REG_R21, nullptr, /* reportUnwindData */ true);
-    }
+    genStackPointerAdjustment(-frameSize, REG_R21, nullptr, /* reportUnwindData */ true);
 
     GetEmitter()->emitIns_R_R_I(INS_jirl, emitActualTypeSize(TYP_I_IMPL), REG_R0, REG_RA, 0);
     compiler->unwindReturn(REG_RA);
@@ -1072,7 +954,6 @@ void CodeGen::genCaptureFuncletPrologEpilogInfo()
     }
 
     assert(isFramePointerUsed());
-
     // The frame size and offsets must be finalized
     assert(compiler->lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT);
 
@@ -1080,58 +961,40 @@ void CodeGen::genCaptureFuncletPrologEpilogInfo()
     assert((rsMaskSaveRegs & RBM_RA) != 0);
     assert((rsMaskSaveRegs & RBM_FP) != 0);
 
-    unsigned PSPSize = (compiler->lvaPSPSym != BAD_VAR_NUM) ? 8 : 0;
-
     // Because a method and funclets must have the same caller-relative PSPSym offset,
     // if there is a PSPSym, we have to pad the funclet frame size for OSR.
     //
-    unsigned osrPad = 0;
-    if (compiler->opts.IsOSR() && (PSPSize > 0))
+    int osrPad = 0;
+    if (compiler->opts.IsOSR())
     {
-        osrPad = compiler->info.compPatchpointInfo->TotalFrameSize();
+        osrPad -= compiler->info.compPatchpointInfo->TotalFrameSize();
 
         // OSR pad must be already aligned to stack size.
         assert((osrPad % STACK_ALIGN) == 0);
     }
 
-    genFuncletInfo.fiCalleeSavedPadding            = 0;
-    genFuncletInfo.fiFunction_CallerSP_to_FP_delta = genCallerSPtoFPdelta() - osrPad;
-
-    unsigned regsSavedSize = genCountBits(rsMaskSaveRegs) << 3;
-    assert(genCountBits(rsMaskSaveRegs) == compiler->compCalleeRegsPushed);
+    /* Now save it for future use */
+    genFuncletInfo.fiFunction_CallerSP_to_FP_delta = genCallerSPtoFPdelta() + osrPad;
 
-    unsigned saveRegsPlusPSPSize = regsSavedSize + PSPSize;
+    int funcletFrameSize = compiler->lvaOutgoingArgSpaceSize;
 
-    assert(compiler->lvaOutgoingArgSpaceSize % REGSIZE_BYTES == 0);
-    unsigned outgoingArgSpaceAligned = roundUp(compiler->lvaOutgoingArgSpaceSize, STACK_ALIGN);
+    genFuncletInfo.fiSP_to_CalleeSaved_delta = funcletFrameSize;
 
-    unsigned funcletFrameSize        = osrPad + saveRegsPlusPSPSize + compiler->lvaOutgoingArgSpaceSize;
-    unsigned funcletFrameSizeAligned = roundUp(funcletFrameSize, STACK_ALIGN);
+    funcletFrameSize += genCountBits(rsMaskSaveRegs) * REGSIZE_BYTES;
 
-    int SP_to_CalleeSaved_delta = compiler->lvaOutgoingArgSpaceSize;
-    if ((SP_to_CalleeSaved_delta + regsSavedSize) >= 2040)
+    int delta_PSP = -TARGET_POINTER_SIZE;
+    if ((compiler->lvaMonAcquired != BAD_VAR_NUM) && !compiler->opts.IsOSR())
     {
-        int offset              = funcletFrameSizeAligned - SP_to_CalleeSaved_delta;
-        SP_to_CalleeSaved_delta = AlignUp((UINT)offset, STACK_ALIGN);
-
-        genFuncletInfo.fiCalleeSavedPadding = SP_to_CalleeSaved_delta - offset;
+        delta_PSP -= TARGET_POINTER_SIZE;
     }
 
-    if (compiler->lvaMonAcquired != BAD_VAR_NUM && !compiler->opts.IsOSR())
-    {
-        // We furthermore allocate the "monitor acquired" bool between PSP and
-        // the saved registers because this is part of the EnC header.
-        // Note that OSR methods reuse the monitor bool created by tier 0.
-        osrPad += compiler->lvaLclSize(compiler->lvaMonAcquired);
-    }
+    funcletFrameSize = funcletFrameSize - delta_PSP - osrPad;
+    funcletFrameSize = roundUp((unsigned)funcletFrameSize, STACK_ALIGN);
 
-    /* Now save it for future use */
-    genFuncletInfo.fiSpDelta                 = -(int)funcletFrameSizeAligned;
-    genFuncletInfo.fiSaveRegs                = rsMaskSaveRegs;
-    genFuncletInfo.fiSP_to_CalleeSaved_delta = SP_to_CalleeSaved_delta;
-
-    genFuncletInfo.fiSP_to_PSP_slot_delta       = funcletFrameSizeAligned - osrPad - 8;
-    genFuncletInfo.fiCallerSP_to_PSP_slot_delta = -(int)osrPad - 8;
+    genFuncletInfo.fiSpDelta                    = -funcletFrameSize;
+    genFuncletInfo.fiSaveRegs                   = rsMaskSaveRegs;
+    genFuncletInfo.fiSP_to_PSP_slot_delta       = funcletFrameSize + delta_PSP + osrPad;
+    genFuncletInfo.fiCallerSP_to_PSP_slot_delta = osrPad + delta_PSP;
 
 #ifdef DEBUG
     if (verbose)
@@ -4278,9 +4141,17 @@ void CodeGen::genCodeForJumpCompare(GenTreeOpCC* tree)
 int CodeGenInterface::genSPtoFPdelta() const
 {
     assert(isFramePointerUsed());
-    assert(compiler->compCalleeRegsPushed >= 2);
+    assert(compiler->compCalleeRegsPushed >= 2); // always FP/RA.
 
-    int delta = compiler->lvaOutgoingArgSpaceSize + (compiler->compCalleeRegsPushed << 3) - 8;
+    int delta = compiler->compLclFrameSize;
+    if (compiler->lvaPSPSym != BAD_VAR_NUM)
+    {
+        delta -= TARGET_POINTER_SIZE;
+    }
+    if ((compiler->lvaMonAcquired != BAD_VAR_NUM) && !compiler->opts.IsOSR())
+    {
+        delta -= TARGET_POINTER_SIZE;
+    }
 
     assert(delta >= 0);
     return delta;
@@ -7660,8 +7531,8 @@ void CodeGen::instGen_MemoryBarrier(BarrierKind barrierKind)
  *      ...
  *      st.d s8,sp,off2+8*8
  *
- *      st.d ra,sp,off3
- *      st.d fp,sp,off3+8
+ *      st.d ra,sp,off3+8
+ *      st.d fp,sp,off3
  *
  * Notes:
  * 1. FP is always saved, and the first store is FP, RA.
@@ -7669,37 +7540,41 @@ void CodeGen::instGen_MemoryBarrier(BarrierKind barrierKind)
  * 3. For frames with varargs, not implemented completely and not tested !
  * 4. We allocate the frame here; no further changes to SP are allowed (except in the body, for localloc).
  *
- * For functions with GS and localloc, we change the frame so the frame pointer and RA are saved at the top
- * of the frame, just under the varargs registers (if any). Note that the funclet frames must follow the same
- * rule, and both main frame and funclet frames (if any) must put PSPSym in the same offset from Caller-SP.
+ * For functions with GS and localloc, we save the frame pointer and RA at the top
+ * of the frame. Note that the funclet frames must follow the same rule,
+ * and both main frame and funclet frames (if any) must put PSPSym at the same offset from Caller-SP.
  * Since this frame type is relatively rare, we force using it via stress modes, for additional coverage.
  *
  * The frames look like the following (simplified to only include components that matter for establishing the
  * frames). See also Compiler::lvaAssignFrameOffsets().
  *
- *
  * The LoongArch64's frame layout is liking:
  *
+ *    If we need to generate a GS cookie, we need to make sure the saved frame pointer and return address
+ *    (FP and RA) are protected from buffer overrun by the GS cookie.
+ *    So we always save the FP/RA along with the rest of the callee-saved registers, above the GS cookie.
+ *
  *      |                       |
  *      |-----------------------|
  *      |  incoming arguments   |
  *      +=======================+ <---- Caller's SP
- *      |     Arguments  Or     | // if needed.
  *      |  Varargs regs space   | // Only for varargs functions; (varargs not implemented for LoongArch64)
  *      |-----------------------|
  *      |    MonitorAcquired    | // 8 bytes; for synchronized methods
  *      |-----------------------|
- *      |        PSP slot       | // 8 bytes (omitted in NativeAOT ABI)
+ *      |        PSPSym         | // 8 bytes; only for frames with EH (omitted in NativeAOT ABI)
  *      |-----------------------|
- *      | locals, temps, etc.   |
+ *      |Callee saved registers | // not including FP/RA; multiple of 8 bytes
  *      |-----------------------|
- *      |  possible GS cookie   |
+ *      |      Saved RA         | // 8 bytes
  *      |-----------------------|
  *      |      Saved FP         | // 8 bytes
  *      |-----------------------|
- *      |      Saved RA         | // 8 bytes
+ *      |  possible GS cookie   |
  *      |-----------------------|
- *      |Callee saved registers | // not including FP/RA; multiple of 8 bytes
+ *      | locals, temps, etc.   |
+ *      |-----------------------|
+ *      |  possible GS cookie   |
  *      |-----------------------|
  *      |   Outgoing arg space  | // multiple of 8 bytes; if required (i.e., #outsz != 0)
  *      |-----------------------| <---- Ambient SP
@@ -7748,6 +7623,9 @@ void CodeGen::genPushCalleeSavedRegisters(regNumber initReg, bool* pInitRegZeroe
     regSet.rsMaskCalleeSaved = rsPushRegs | RBM_FPBASE | RBM_RA;
 
 #ifdef DEBUG
+    JITDUMP("Frame info. #outsz=%d; #framesz=%d; LclFrameSize=%d;\n", unsigned(compiler->lvaOutgoingArgSpaceSize),
+            genTotalFrameSize(), compiler->compLclFrameSize);
+
     if (compiler->compCalleeRegsPushed != genCountBits(regSet.rsMaskCalleeSaved))
     {
         printf("Error: unexpected number of callee-saved registers to push. Expected: %d. Got: %d ",
@@ -7770,84 +7648,52 @@ void CodeGen::genPushCalleeSavedRegisters(regNumber initReg, bool* pInitRegZeroe
     }
 #endif // DEBUG
 
-    // The frameType number is arbitrary, is defined below, and corresponds to one of the frame styles we
-    // generate based on various sizes.
-    int frameType = 0;
-
-    // The amount to add from SP before starting to store the callee-saved registers.
-    int calleeSaveSPDelta = 0;
-
-    // If we need to generate a GS cookie, we need to make sure the saved frame pointer and return address
-    // (FP and RA) are protected from buffer overrun by the GS cookie. If FP/RA are at the lowest addresses,
-    // then they are safe, since they are lower than any unsafe buffers. And the GS cookie we add will
-    // protect our caller's frame. If we have a localloc, however, that is dynamically placed lower than our
-    // saved FP/RA. In that case, we save FP/RA along with the rest of the callee-saved registers, above
-    // the GS cookie.
-    //
-    // After the frame is allocated, the frame pointer is established, pointing at the saved frame pointer to
-    // create a frame pointer chain.
-    //
+    int totalFrameSize = genTotalFrameSize();
+    int leftFrameSize  = 0;
+    int localFrameSize = compiler->compLclFrameSize;
+    if (compiler->lvaPSPSym != BAD_VAR_NUM)
+    {
+        localFrameSize -= TARGET_POINTER_SIZE;
+    }
+    if ((compiler->lvaMonAcquired != BAD_VAR_NUM) && !compiler->opts.IsOSR())
+    {
+        localFrameSize -= TARGET_POINTER_SIZE;
+    }
 
-    // This will be the starting place for saving the callee-saved registers, in increasing order.
-    int offset = compiler->lvaOutgoingArgSpaceSize;
+#ifdef DEBUG
+    if (compiler->opts.disAsm)
+    {
+        printf("Frame info. #outsz=%d; #framesz=%d; lcl=%d\n", unsigned(compiler->lvaOutgoingArgSpaceSize),
+               genTotalFrameSize(), localFrameSize);
+    }
+#endif
 
-    int totalFrameSize = genTotalFrameSize();
-    // The (totalFrameSize <= 2040) condition ensures the offsets of st.d/ld.d.
+    int FP_offset = localFrameSize;
     if (totalFrameSize <= 2040)
     {
         GetEmitter()->emitIns_R_R_I(INS_addi_d, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, -totalFrameSize);
         compiler->unwindAllocStack(totalFrameSize);
-
-        // Case #1.
-        //
-        // Generate:
-        //      addi.d sp, sp, -framesz
-        //      st.d callee_saved_registers   ### not including the fp and ra.
-        //      st.d ra,sp,outsz
-        //      st.d fp,sp,outsz+8
-        //
-        // After saving callee-saved registers, ra and fp, we establish the frame pointer with:
-        //      addi.d fp, sp, (the offset of saving fp)
-        // We do this *after* saving callee-saved registers, so the prolog/epilog unwind codes mostly match.
-
-        JITDUMP("Frame type 1. #outsz=%d; #framesz=%d; LclFrameSize=%d\n", unsigned(compiler->lvaOutgoingArgSpaceSize),
-                totalFrameSize, compiler->compLclFrameSize);
-
-        frameType = 1;
     }
     else
     {
-        JITDUMP("Frame type 2. #outsz=%d; #framesz=%d; LclFrameSize=%d\n", unsigned(compiler->lvaOutgoingArgSpaceSize),
-                totalFrameSize, compiler->compLclFrameSize);
-
-        frameType = 2;
-
-        if ((offset + (compiler->compCalleeRegsPushed << 3)) >= 2040)
-        {
-            offset            = totalFrameSize - compiler->lvaOutgoingArgSpaceSize;
-            calleeSaveSPDelta = AlignUp((UINT)offset, STACK_ALIGN);
-            offset            = calleeSaveSPDelta - offset;
-
-            genStackPointerAdjustment(-calleeSaveSPDelta, initReg, pInitRegZeroed, /* reportUnwindData */ true);
-        }
-        else
+        if ((localFrameSize + (compiler->compCalleeRegsPushed << 3)) > 2040)
         {
-            genStackPointerAdjustment(-totalFrameSize, initReg, pInitRegZeroed, /* reportUnwindData */ true);
+            leftFrameSize  = localFrameSize & -16;
+            totalFrameSize = totalFrameSize - (localFrameSize & -16);
+            FP_offset      = localFrameSize & 0xf;
         }
+        genStackPointerAdjustment(-totalFrameSize, initReg, pInitRegZeroed, /* reportUnwindData */ true);
     }
+    GetEmitter()->emitIns_R_R_I(INS_st_d, EA_PTRSIZE, REG_FP, REG_SPBASE, FP_offset);
+    compiler->unwindSaveReg(REG_FP, FP_offset);
 
-    JITDUMP("    offset=%d, calleeSaveSPDelta=%d\n", offset, calleeSaveSPDelta);
-    genSaveCalleeSavedRegistersHelp(rsPushRegs, offset, 0);
-    offset += (int)(genCountBits(rsPushRegs) << 3);
+    GetEmitter()->emitIns_R_R_I(INS_st_d, EA_PTRSIZE, REG_RA, REG_SPBASE, FP_offset + 8);
+    compiler->unwindSaveReg(REG_RA, FP_offset + 8);
 
-    GetEmitter()->emitIns_R_R_I(INS_st_d, EA_PTRSIZE, REG_RA, REG_SPBASE, offset);
-    compiler->unwindSaveReg(REG_RA, offset);
+    genSaveCalleeSavedRegistersHelp(rsPushRegs, FP_offset + 16, 0);
 
-    GetEmitter()->emitIns_R_R_I(INS_st_d, EA_PTRSIZE, REG_FP, REG_SPBASE, offset + 8);
-    compiler->unwindSaveReg(REG_FP, offset + 8);
-
-    JITDUMP("    offsetSpToSavedFp=%d\n", offset + 8);
-    genEstablishFramePointer(offset + 8, /* reportUnwindData */ true);
+    JITDUMP("    offsetSpToSavedFp=%d\n", FP_offset);
+    genEstablishFramePointer(FP_offset, /* reportUnwindData */ true);
 
     // For varargs, home the incoming arg registers last. Note that there is nothing to unwind here,
     // so we just report "NOP" unwind codes. If there's no more frame setup after this, we don't
@@ -7858,19 +7704,9 @@ void CodeGen::genPushCalleeSavedRegisters(regNumber initReg, bool* pInitRegZeroe
         NYI_LOONGARCH64("genPushCalleeSavedRegisters unsupports compIsVarArgs");
     }
 
-#ifdef DEBUG
-    if (compiler->opts.disAsm)
-    {
-        assert(frameType != 0);
-        printf("DEBUG: LOONGARCH64, frameType:%d\n\n", frameType);
-    }
-#endif
-
-    if (calleeSaveSPDelta != 0)
+    if (leftFrameSize != 0)
     {
-        assert(frameType == 2);
-        calleeSaveSPDelta = totalFrameSize - calleeSaveSPDelta;
-        genStackPointerAdjustment(-calleeSaveSPDelta, initReg, pInitRegZeroed, /* reportUnwindData */ true);
+        genStackPointerAdjustment(-leftFrameSize, initReg, pInitRegZeroed, /* reportUnwindData */ true);
     }
 }
 
@@ -7882,85 +7718,78 @@ void CodeGen::genPopCalleeSavedRegisters(bool jmpEpilog)
 
     assert(isFramePointerUsed());
 
-    // This will be the starting place for restoring the callee-saved registers, in decreasing order.
-    int calleeSaveSPOffset = 0;
-    int remainingSPSize    = 0;
-
     int totalFrameSize = genTotalFrameSize();
-    if (totalFrameSize <= 2040)
+    int localFrameSize = compiler->compLclFrameSize;
+    if (compiler->lvaPSPSym != BAD_VAR_NUM)
     {
-        JITDUMP("Frame type 1. #outsz=%d; #framesz=%d; localloc? %s\n", unsigned(compiler->lvaOutgoingArgSpaceSize),
-                totalFrameSize, dspBool(compiler->compLocallocUsed));
+        localFrameSize -= TARGET_POINTER_SIZE;
+    }
+    if ((compiler->lvaMonAcquired != BAD_VAR_NUM) && !compiler->opts.IsOSR())
+    {
+        localFrameSize -= TARGET_POINTER_SIZE;
+    }
 
+    JITDUMP("Frame type. #outsz=%d; #framesz=%d; #calleeSaveRegsPushed:%d; "
+            "localloc? %s\n",
+            unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compCalleeRegsPushed,
+            dspBool(compiler->compLocallocUsed));
+
+    emitter* emit            = GetEmitter();
+    int      FP_offset       = localFrameSize;
+    int      remainingSPSize = totalFrameSize;
+    if (totalFrameSize <= 2040)
+    {
         if (compiler->compLocallocUsed)
         {
-            int SPtoFPdelta = (compiler->compCalleeRegsPushed << 3) - 8 + compiler->lvaOutgoingArgSpaceSize;
-
+            int SPtoFPdelta = genSPtoFPdelta();
             // Restore sp from fp
-            GetEmitter()->emitIns_R_R_I(INS_addi_d, EA_PTRSIZE, REG_SPBASE, REG_FPBASE, -SPtoFPdelta);
+            emit->emitIns_R_R_I(INS_addi_d, EA_PTRSIZE, REG_SPBASE, REG_FPBASE, -SPtoFPdelta);
             compiler->unwindSetFrameReg(REG_FPBASE, SPtoFPdelta);
         }
-        calleeSaveSPOffset = compiler->lvaOutgoingArgSpaceSize;
-        remainingSPSize    = totalFrameSize;
     }
     else
     {
-        JITDUMP("Frame type 2. #outsz=%d; #framesz=%d; #calleeSaveRegsPushed:%d; "
-                "localloc? %s\n",
-                unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compCalleeRegsPushed,
-                dspBool(compiler->compLocallocUsed));
-
-        if ((compiler->lvaOutgoingArgSpaceSize + (compiler->compCalleeRegsPushed << 3)) > 2047)
+        if (compiler->compLocallocUsed)
         {
-            calleeSaveSPOffset = compiler->lvaOutgoingArgSpaceSize & -16;
-            if (compiler->compLocallocUsed)
+            int SPtoFPdelta = genSPtoFPdelta();
+            // Restore sp from fp
+            if (emitter::isValidSimm12(SPtoFPdelta))
             {
-                int SPtoFPdelta = (compiler->compCalleeRegsPushed << 3) - 8;
-
-                // Restore sp from fp
-                GetEmitter()->emitIns_R_R_I(INS_addi_d, EA_PTRSIZE, REG_SPBASE, REG_FPBASE, -SPtoFPdelta);
-                compiler->unwindSetFrameReg(REG_FPBASE, SPtoFPdelta);
+                emit->emitIns_R_R_I(INS_addi_d, EA_PTRSIZE, REG_SPBASE, REG_FPBASE, -SPtoFPdelta);
             }
             else
             {
-                genStackPointerAdjustment(calleeSaveSPOffset, REG_RA, nullptr, /* reportUnwindData */ true);
+                emit->emitIns_I_la(EA_PTRSIZE, REG_RA, SPtoFPdelta);
+                emit->emitIns_R_R_R(INS_sub_d, EA_PTRSIZE, REG_SPBASE, REG_FPBASE, REG_RA);
             }
-            remainingSPSize    = totalFrameSize - calleeSaveSPOffset;
-            calleeSaveSPOffset = compiler->lvaOutgoingArgSpaceSize - calleeSaveSPOffset;
         }
-        else
+        if ((localFrameSize + (compiler->compCalleeRegsPushed << 3)) > 2040)
         {
-            if (compiler->compLocallocUsed)
-            {
-                int SPtoFPdelta = (compiler->compCalleeRegsPushed << 3) - 8 + compiler->lvaOutgoingArgSpaceSize;
+            remainingSPSize = localFrameSize & -16;
+            genStackPointerAdjustment(remainingSPSize, REG_RA, nullptr, /* reportUnwindData */ true);
 
-                // Restore sp from fp
-                GetEmitter()->emitIns_R_R_I(INS_addi_d, EA_PTRSIZE, REG_SPBASE, REG_FPBASE, -SPtoFPdelta);
-                compiler->unwindSetFrameReg(REG_FPBASE, SPtoFPdelta);
-            }
-            calleeSaveSPOffset = compiler->lvaOutgoingArgSpaceSize;
-            remainingSPSize    = totalFrameSize;
+            remainingSPSize = totalFrameSize - remainingSPSize;
+            FP_offset       = localFrameSize & 0xf;
         }
     }
 
-    JITDUMP("    calleeSaveSPOffset=%d\n", calleeSaveSPOffset);
-    genRestoreCalleeSavedRegistersHelp(regsToRestoreMask, calleeSaveSPOffset, 0);
-    calleeSaveSPOffset += (compiler->compCalleeRegsPushed - 2) << 3;
+    JITDUMP("    calleeSaveSPOffset=%d\n", FP_offset + 16);
+    genRestoreCalleeSavedRegistersHelp(regsToRestoreMask, FP_offset + 16, 0);
 
-    GetEmitter()->emitIns_R_R_I(INS_ld_d, EA_PTRSIZE, REG_RA, REG_SPBASE, calleeSaveSPOffset);
-    compiler->unwindSaveReg(REG_RA, calleeSaveSPOffset);
+    emit->emitIns_R_R_I(INS_ld_d, EA_PTRSIZE, REG_RA, REG_SPBASE, FP_offset + 8);
+    compiler->unwindSaveReg(REG_RA, FP_offset + 8);
 
-    GetEmitter()->emitIns_R_R_I(INS_ld_d, EA_PTRSIZE, REG_FP, REG_SPBASE, calleeSaveSPOffset + 8);
-    compiler->unwindSaveReg(REG_FP, calleeSaveSPOffset + 8);
+    emit->emitIns_R_R_I(INS_ld_d, EA_PTRSIZE, REG_FP, REG_SPBASE, FP_offset);
+    compiler->unwindSaveReg(REG_FP, FP_offset);
 
     if (emitter::isValidUimm11(remainingSPSize))
     {
-        GetEmitter()->emitIns_R_R_I(INS_addi_d, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, remainingSPSize);
+        emit->emitIns_R_R_I(INS_addi_d, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, remainingSPSize);
     }
     else
     {
-        GetEmitter()->emitIns_I_la(EA_PTRSIZE, REG_R21, remainingSPSize);
-        GetEmitter()->emitIns_R_R_R(INS_add_d, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, REG_R21);
+        emit->emitIns_I_la(EA_PTRSIZE, REG_R21, remainingSPSize);
+        emit->emitIns_R_R_R(INS_add_d, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, REG_R21);
     }
     compiler->unwindAllocStack(remainingSPSize);
 
@@ -7972,12 +7801,12 @@ void CodeGen::genPopCalleeSavedRegisters(bool jmpEpilog)
 
         if (emitter::isValidUimm11(tier0FrameSize))
         {
-            GetEmitter()->emitIns_R_R_I(INS_addi_d, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, tier0FrameSize);
+            emit->emitIns_R_R_I(INS_addi_d, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, tier0FrameSize);
         }
         else
         {
-            GetEmitter()->emitIns_I_la(EA_PTRSIZE, REG_R21, tier0FrameSize);
-            GetEmitter()->emitIns_R_R_R(INS_add_d, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, REG_R21);
+            emit->emitIns_I_la(EA_PTRSIZE, REG_R21, tier0FrameSize);
+            emit->emitIns_R_R_R(INS_add_d, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, REG_R21);
         }
         compiler->unwindAllocStack(tier0FrameSize);
     }
diff --git a/src/coreclr/jit/codegenriscv64.cpp b/src/coreclr/jit/codegenriscv64.cpp
index 546ba7b3180899..bfd79cb04e5de8 100644
--- a/src/coreclr/jit/codegenriscv64.cpp
+++ b/src/coreclr/jit/codegenriscv64.cpp
@@ -770,8 +770,8 @@ void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, in
  *     addi sp, sp, -#framesz    ; establish the frame
  *     sd s1, #outsz(sp)         ; save callee-saved registers, as necessary
  *     sd s2, #(outsz+8)(sp)
- *     sd ra, #(outsz+?)(sp)     ; save RA (8 bytes)
- *     sd fp, #(outsz+?+8)(sp)   ; save FP (8 bytes)
+ *     sd ra, #(outsz+?+8)(sp)   ; save RA (8 bytes)
+ *     sd fp, #(outsz+?)(sp)     ; save FP (8 bytes)
  *
  *  The funclet frame layout:
  *
@@ -779,8 +779,7 @@ void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, in
  *      |-----------------------|
  *      |  incoming arguments   |
  *      +=======================+ <---- Caller's SP
- *      |     Arguments  Or     | // if needed
- *      |  Varargs regs space   | // Only for varargs functions; NYI on RV64
+ *      |  Varargs regs space   | // Only for varargs main functions; not used for RV64.
  *      |-----------------------|
  *      |    MonitorAcquired    | // 8 bytes; for synchronized methods
  *      |-----------------------|
@@ -788,11 +787,9 @@ void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, in
  *      |-----------------------|
  *      ~  alignment padding    ~ // To make the whole frame 16 byte aligned
  *      |-----------------------|
- *      |      Saved FP         | // 8 bytes
+ *      |Callee saved registers | // multiple of 8 bytes, not including FP/RA
  *      |-----------------------|
- *      |      Saved RA         | // 8 bytes
- *      |-----------------------|
- *      |Callee saved registers | // multiple of 8 bytes, not includting RA/FP
+ *      |      Saved FP, RA     | // 16 bytes
  *      |-----------------------|
  *      |   Outgoing arg space  | // multiple of 8 bytes; if required (i.e., #outsz != 0)
  *      |-----------------------| <---- Ambient SP
@@ -801,31 +798,27 @@ void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, in
  *      |       | downward      |
  *              V
  *
- * Note, that SP only change once. That means, there will be a maximum of one alignment slot needed.
- * Also remember, the stack oiubter needs to be 16 byte aligned at all times.
- * The size of the PSP slot plus callee-saved registers space is a maximum of 280 bytes:
- *
- *     RA,FP registers
- *     11 int callee-saved register s1-s11
- *     12 float callee-saved registers f8-f9, f18-f27
- *     8 saved integer argument registers a0-a7, if varargs function support.
- *     1 PSP slot
- *     1 alignment slot or monitor acquired slot
- *     == 35 slots * 8 bytes = 280 bytes.
  *
  * The outgoing argument size, however, can be very large, if we call a function that takes a large number of
  * arguments (note that we currently use the same outgoing argument space size in the funclet as for the main
  * function, even if the funclet doesn't have any calls, or has a much smaller, or larger, maximum number of
- * outgoing arguments for any call). In that case, we need to 16-byte align the initial change to SP, before
- * saving off the callee-saved registers and establishing the PSPsym, so we can use the limited immediate offset
- * encodings we have available, before doing another 16-byte aligned SP adjustment to create the outgoing argument
- * space. Both changes to SP might need to add alignment padding.
+ * outgoing arguments for any call).
+ *
+ * Note that in all cases, the PSPSym is in exactly the same position with respect to Caller-SP,
+ * and that location is the same relative to Caller-SP as in the main function, where it sits higher
+ * than the callee-saved registers.
+ * That is to say, the PSPSym's offset relative to Caller-SP does not depend on the callee-saved registers.
+ *
+ * Funclets do not have varargs arguments. However, because the PSPSym must exist at the same offset from Caller-SP as in the main function, we
+ * must add buffer space for the saved varargs/argument registers here, if the main function did the same.
+ *
+ * Note that localloc cannot be used in a funclet.
  *
  *  An example epilog sequence:
  *     addi sp, sp, #outsz       ; if any outgoing argument space
  *     ld s1, #(xxx-8)(sp)       ; restore callee-saved registers
  *     ld s2, #xxx(sp)
- *     ld ra, #(xxx+?-8)(sp)     ; restore RA
+ *     ld ra, #(xxx+?+8)(sp)     ; restore RA
  *     ld fp, #(xxx+?)(sp)       ; restore FP
  *     addi sp, sp, #framesz
  *     jarl zero, ra
@@ -840,8 +833,8 @@ void CodeGen::genFuncletProlog(BasicBlock* block)
         printf("*************** In genFuncletProlog()\n");
     }
 #endif
+
     // TODO-RISCV64: Implement varargs (NYI_RISCV64)
-    // TODO-RISCV64-CQ: We can use C extension for optimization
 
     assert(block != NULL);
     assert(block->HasFlag(BBF_FUNCLET_BEG));
@@ -852,9 +845,8 @@ void CodeGen::genFuncletProlog(BasicBlock* block)
 
     compiler->unwindBegProlog();
 
-    const bool isFilter  = (block->bbCatchTyp == BBCT_FILTER);
-    const int  frameSize = genFuncletInfo.fiSpDelta;
-
+    bool isFilter  = (block->bbCatchTyp == BBCT_FILTER);
+    int  frameSize = genFuncletInfo.fiSpDelta;
     assert(frameSize < 0);
 
     regMaskTP maskArgRegsLiveIn;
@@ -871,53 +863,39 @@ void CodeGen::genFuncletProlog(BasicBlock* block)
         maskArgRegsLiveIn = RBM_A0;
     }
 
-    regMaskTP maskSaveRegs  = genFuncletInfo.fiSaveRegs & RBM_CALLEE_SAVED;
-    int       regsSavedSize = (compiler->compCalleeRegsPushed - 2) << 3;
-
-    int calleeSavedDelta = genFuncletInfo.fiSP_to_CalleeSaved_delta;
-
-    emitter* emit = GetEmitter();
+    regMaskTP maskSaveRegs = genFuncletInfo.fiSaveRegs & RBM_CALLEE_SAVED;
+    int       FP_offset    = genFuncletInfo.fiSP_to_CalleeSaved_delta;
 
-    if (calleeSavedDelta + regsSavedSize + genFuncletInfo.fiCalleeSavedPadding <= 2040)
+    if ((FP_offset + (genCountBits(maskSaveRegs) << 3)) <= (2040 - 16)) // no FP/RA.
     {
-        calleeSavedDelta += genFuncletInfo.fiCalleeSavedPadding;
-
-        // addi sp, sp, #frameSize
         genStackPointerAdjustment(frameSize, REG_SCRATCH, nullptr, /* reportUnwindData */ true);
 
-        genSaveCalleeSavedRegistersHelp(maskSaveRegs, calleeSavedDelta, 0);
-        calleeSavedDelta += regsSavedSize;
+        GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_FP, REG_SPBASE, FP_offset);
+        compiler->unwindSaveReg(REG_FP, FP_offset);
 
-        // sd ra, #calleeSavedDelta(sp)
-        emit->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_RA, REG_SPBASE, calleeSavedDelta);
-        compiler->unwindSaveReg(REG_RA, calleeSavedDelta);
+        GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_RA, REG_SPBASE, FP_offset + 8);
+        compiler->unwindSaveReg(REG_RA, FP_offset + 8);
 
-        // sd fp, #(calleeSavedDelta+8)(sp)
-        emit->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_FP, REG_SPBASE, calleeSavedDelta + 8);
-        compiler->unwindSaveReg(REG_FP, calleeSavedDelta + 8);
+        genSaveCalleeSavedRegistersHelp(maskSaveRegs, FP_offset + 16, 0);
     }
     else
     {
         assert(frameSize < -2040);
 
-        int spDelta = frameSize + calleeSavedDelta;
+        genStackPointerAdjustment(frameSize + (FP_offset & -16), REG_SCRATCH, nullptr, true);
 
-        // addi sp, sp, #spDelta
-        genStackPointerAdjustment(spDelta, REG_SCRATCH, nullptr, /* reportUnwindData */ true);
+        frameSize = -(FP_offset & -16);
+        FP_offset &= 0xf;
 
-        genSaveCalleeSavedRegistersHelp(maskSaveRegs, genFuncletInfo.fiCalleeSavedPadding, 0);
-        regsSavedSize += genFuncletInfo.fiCalleeSavedPadding;
+        GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_FP, REG_SPBASE, FP_offset);
+        compiler->unwindSaveReg(REG_FP, FP_offset);
 
-        // sd ra, #regsSavedSize(sp)
-        emit->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_RA, REG_SPBASE, regsSavedSize);
-        compiler->unwindSaveReg(REG_RA, regsSavedSize);
+        GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_RA, REG_SPBASE, FP_offset + 8);
+        compiler->unwindSaveReg(REG_RA, FP_offset + 8);
 
-        // sd fp, #(regsSavedSize+8)(sp)
-        emit->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_FP, REG_SPBASE, regsSavedSize + 8);
-        compiler->unwindSaveReg(REG_FP, regsSavedSize + 8);
+        genSaveCalleeSavedRegistersHelp(maskSaveRegs, FP_offset + 16, 0);
 
-        // addi sp, sp -#calleeSavedDelta
-        genStackPointerAdjustment(-calleeSavedDelta, REG_SCRATCH, nullptr, /* reportUnwindData */ true);
+        genStackPointerAdjustment(frameSize, REG_SCRATCH, nullptr, true);
     }
 
     // This is the end of the OS-reported prolog for purposes of unwinding
@@ -977,66 +955,38 @@ void CodeGen::genFuncletEpilog()
         printf("*************** In genFuncletEpilog()\n");
     }
 #endif
-    // TODO-RISCV64: Implement varargs (NYI_RISCV64)
-    // TODO-RISCV64-CQ: We can use C extension for optimization
 
     ScopedSetVariable<bool> _setGeneratingEpilog(&compiler->compGeneratingEpilog, true);
 
     compiler->unwindBegEpilog();
 
-    const int frameSize = genFuncletInfo.fiSpDelta;
-
+    int frameSize = genFuncletInfo.fiSpDelta;
     assert(frameSize < 0);
 
-    regMaskTP maskRestoreRegs = genFuncletInfo.fiSaveRegs & RBM_CALLEE_SAVED;
-    int       regsRestoreSize = (compiler->compCalleeRegsPushed - 2) << 3;
-
-    int calleeSavedDelta = genFuncletInfo.fiSP_to_CalleeSaved_delta;
+    regMaskTP maskSaveRegs = genFuncletInfo.fiSaveRegs & RBM_CALLEE_SAVED;
+    int       FP_offset    = genFuncletInfo.fiSP_to_CalleeSaved_delta;
 
-    emitter*  emit    = GetEmitter();
-    regNumber tempReg = rsGetRsvdReg();
-
-    if (calleeSavedDelta + regsRestoreSize + genFuncletInfo.fiCalleeSavedPadding <= 2040)
+    if ((FP_offset + (genCountBits(maskSaveRegs) << 3)) > (2040 - 16)) // no FP/RA.
     {
-        calleeSavedDelta += genFuncletInfo.fiCalleeSavedPadding;
-        genRestoreCalleeSavedRegistersHelp(maskRestoreRegs, calleeSavedDelta, 0);
-        calleeSavedDelta += regsRestoreSize;
-
-        // ld ra, #calleeSavedDelta(sp)
-        emit->emitIns_R_R_I(INS_ld, EA_PTRSIZE, REG_RA, REG_SPBASE, calleeSavedDelta);
-        compiler->unwindSaveReg(REG_RA, calleeSavedDelta);
+        assert(frameSize < -2040);
 
-        // ld fp, #(calleeSavedDelta+8)(sp)
-        emit->emitIns_R_R_I(INS_ld, EA_PTRSIZE, REG_FP, REG_SPBASE, calleeSavedDelta + 8);
-        compiler->unwindSaveReg(REG_FP, calleeSavedDelta + 8);
+        genStackPointerAdjustment(FP_offset & -16, REG_SCRATCH, nullptr, /* reportUnwindData */ true);
 
-        // addi sp, sp, -#frameSize
-        genStackPointerAdjustment(-frameSize, tempReg, nullptr, /* reportUnwindData */ true);
+        frameSize += FP_offset & -16;
+        FP_offset = FP_offset & 0xf;
     }
-    else
-    {
-        assert(frameSize < -2040);
-
-        // addi sp, sp, #calleeSavedDelta
-        genStackPointerAdjustment(calleeSavedDelta, tempReg, nullptr, /* reportUnwindData */ true);
 
-        genRestoreCalleeSavedRegistersHelp(maskRestoreRegs, genFuncletInfo.fiCalleeSavedPadding, 0);
-        regsRestoreSize += genFuncletInfo.fiCalleeSavedPadding;
+    genRestoreCalleeSavedRegistersHelp(maskSaveRegs, FP_offset + 16, 0);
 
-        // ld ra, #regsRestoreSize(sp)
-        emit->emitIns_R_R_I(INS_ld, EA_PTRSIZE, REG_RA, REG_SPBASE, regsRestoreSize);
-        compiler->unwindSaveReg(REG_RA, regsRestoreSize);
+    GetEmitter()->emitIns_R_R_I(INS_ld, EA_PTRSIZE, REG_RA, REG_SPBASE, FP_offset + 8);
+    compiler->unwindSaveReg(REG_RA, FP_offset + 8);
 
-        // ld fp, #(regsRestoreSize+8)(sp)
-        emit->emitIns_R_R_I(INS_ld, EA_PTRSIZE, REG_FP, REG_SPBASE, regsRestoreSize + 8);
-        compiler->unwindSaveReg(REG_FP, regsRestoreSize + 8);
+    GetEmitter()->emitIns_R_R_I(INS_ld, EA_PTRSIZE, REG_FP, REG_SPBASE, FP_offset);
+    compiler->unwindSaveReg(REG_FP, FP_offset);
 
-        // addi sp, sp, -#(frameSize + calleeSavedDelta)
-        genStackPointerAdjustment(-(frameSize + calleeSavedDelta), tempReg, nullptr, /* reportUnwindData */ true);
-    }
+    genStackPointerAdjustment(-frameSize, REG_SCRATCH, nullptr, /* reportUnwindData */ true);
 
-    // jarl zero, ra
-    emit->emitIns_R_R_I(INS_jalr, emitActualTypeSize(TYP_I_IMPL), REG_R0, REG_RA, 0);
+    GetEmitter()->emitIns_R_R_I(INS_jalr, emitActualTypeSize(TYP_I_IMPL), REG_R0, REG_RA, 0);
     compiler->unwindReturn(REG_RA);
 
     compiler->unwindEndEpilog();
@@ -1059,7 +1009,6 @@ void CodeGen::genCaptureFuncletPrologEpilogInfo()
     }
 
     assert(isFramePointerUsed());
-
     // The frame size and offsets must be finalized
     assert(compiler->lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT);
 
@@ -1067,74 +1016,56 @@ void CodeGen::genCaptureFuncletPrologEpilogInfo()
     assert((rsMaskSaveRegs & RBM_RA) != 0);
     assert((rsMaskSaveRegs & RBM_FP) != 0);
 
-    unsigned pspSize = (compiler->lvaPSPSym != BAD_VAR_NUM) ? 8 : 0;
-
-    // If there is a PSP slot, we have to pad the funclet frame size for OSR.
-    // For more details see CodeGen::genFuncletProlog
+    // Because a method and funclets must have the same caller-relative PSPSym offset,
+    // if there is a PSPSym, we have to pad the funclet frame size for OSR.
     //
-    unsigned osrPad = 0;
-    if (compiler->opts.IsOSR() && (pspSize != 0))
+    int osrPad = 0;
+    if (compiler->opts.IsOSR())
     {
-        osrPad = compiler->info.compPatchpointInfo->TotalFrameSize();
+        osrPad -= compiler->info.compPatchpointInfo->TotalFrameSize();
 
-        // osrPad must be aligned to stackSize
-        assert(osrPad % STACK_ALIGN == 0);
+        // The OSR pad must already be a multiple of STACK_ALIGN.
+        assert((osrPad % STACK_ALIGN) == 0);
     }
 
-    genFuncletInfo.fiCalleeSavedPadding            = 0;
-    genFuncletInfo.fiFunction_CallerSP_to_FP_delta = genCallerSPtoFPdelta() - osrPad;
-
-    unsigned savedRegsSize = genCountBits(rsMaskSaveRegs);
-    assert(savedRegsSize == compiler->compCalleeRegsPushed);
-    savedRegsSize <<= 3;
+    /* Now save it for future use */
+    genFuncletInfo.fiFunction_CallerSP_to_FP_delta = genCallerSPtoFPdelta() + osrPad;
 
-    unsigned saveRegsPlusPSPSize = savedRegsSize + pspSize;
+    int funcletFrameSize = compiler->lvaOutgoingArgSpaceSize;
 
-    assert(compiler->lvaOutgoingArgSpaceSize % REGSIZE_BYTES == 0);
-    unsigned outgoingArgSpaceAligned = roundUp(compiler->lvaOutgoingArgSpaceSize, STACK_ALIGN);
+    genFuncletInfo.fiSP_to_CalleeSaved_delta = funcletFrameSize;
 
-    unsigned funcletFrameSize        = osrPad + saveRegsPlusPSPSize + compiler->lvaOutgoingArgSpaceSize;
-    unsigned funcletFrameSizeAligned = roundUp(funcletFrameSize, STACK_ALIGN);
+    funcletFrameSize += genCountBits(rsMaskSaveRegs) * REGSIZE_BYTES;
 
-    int SP_to_CalleeSaved_delta = compiler->lvaOutgoingArgSpaceSize;
-    if ((SP_to_CalleeSaved_delta + savedRegsSize) >= 2040)
+    int delta_PSP = -TARGET_POINTER_SIZE;
+    if ((compiler->lvaMonAcquired != BAD_VAR_NUM) && !compiler->opts.IsOSR())
     {
-        int offset              = funcletFrameSizeAligned - SP_to_CalleeSaved_delta;
-        SP_to_CalleeSaved_delta = AlignUp((UINT)offset, STACK_ALIGN);
-
-        genFuncletInfo.fiCalleeSavedPadding = SP_to_CalleeSaved_delta - offset;
+        delta_PSP -= TARGET_POINTER_SIZE;
     }
 
-    if (compiler->lvaMonAcquired != BAD_VAR_NUM && !compiler->opts.IsOSR())
-    {
-        // We furthermore allocate the "monitor acquired" bool between PSP and
-        // the saved registers because this is part of the EnC header.
-        // Note that OSR methods reuse the monitor bool created by tier 0.
-        osrPad += compiler->lvaLclSize(compiler->lvaMonAcquired);
-    }
+    funcletFrameSize = funcletFrameSize - delta_PSP - osrPad;
+    funcletFrameSize = roundUp((unsigned)funcletFrameSize, STACK_ALIGN);
 
-    /* Now save it for future use */
-    genFuncletInfo.fiSpDelta                    = -(int)funcletFrameSizeAligned;
+    genFuncletInfo.fiSpDelta                    = -funcletFrameSize;
     genFuncletInfo.fiSaveRegs                   = rsMaskSaveRegs;
-    genFuncletInfo.fiSP_to_CalleeSaved_delta    = SP_to_CalleeSaved_delta;
-    genFuncletInfo.fiSP_to_PSP_slot_delta       = funcletFrameSizeAligned - osrPad - 8;
-    genFuncletInfo.fiCallerSP_to_PSP_slot_delta = -(int)osrPad - 8;
+    genFuncletInfo.fiSP_to_PSP_slot_delta       = funcletFrameSize + delta_PSP + osrPad;
+    genFuncletInfo.fiCallerSP_to_PSP_slot_delta = osrPad + delta_PSP;
 
 #ifdef DEBUG
     if (verbose)
     {
         printf("\n");
         printf("Funclet prolog / epilog info\n");
-        printf("                 Save regs: ");
+        printf("                        Save regs: ");
         dspRegMask(genFuncletInfo.fiSaveRegs);
         printf("\n");
         if (compiler->opts.IsOSR())
         {
-            printf("                           OSR Pad: %d\n", osrPad);
+            printf("                          OSR Pad: %d\n", osrPad);
         }
-        printf("     Function CallerSP-to-FP delta: %d\n", genFuncletInfo.fiFunction_CallerSP_to_FP_delta);
+        printf("    Function CallerSP-to-FP delta: %d\n", genFuncletInfo.fiFunction_CallerSP_to_FP_delta);
         printf("  SP to CalleeSaved location delta: %d\n", genFuncletInfo.fiSP_to_CalleeSaved_delta);
-        printf("                          SP delta: %d\n", genFuncletInfo.fiSpDelta);
+        printf("                       SP delta: %d\n", genFuncletInfo.fiSpDelta);
     }
     assert(genFuncletInfo.fiSP_to_CalleeSaved_delta >= 0);
 
@@ -4224,9 +4155,17 @@ void CodeGen::genCodeForJumpCompare(GenTreeOpCC* tree)
 int CodeGenInterface::genSPtoFPdelta() const
 {
     assert(isFramePointerUsed());
-    assert(compiler->compCalleeRegsPushed >= 2);
+    assert(compiler->compCalleeRegsPushed >= 2); // always FP/RA.
 
-    int delta = compiler->lvaOutgoingArgSpaceSize + (compiler->compCalleeRegsPushed << 3) - 8;
+    int delta = compiler->compLclFrameSize;
+    if (compiler->lvaPSPSym != BAD_VAR_NUM)
+    {
+        delta -= TARGET_POINTER_SIZE;
+    }
+    if ((compiler->lvaMonAcquired != BAD_VAR_NUM) && !compiler->opts.IsOSR())
+    {
+        delta -= TARGET_POINTER_SIZE;
+    }
 
     assert(delta >= 0);
     return delta;
@@ -7733,8 +7672,8 @@ void CodeGen::instGen_MemoryBarrier(BarrierKind barrierKind)
  *      sd s11, #(offset+8*10)(sp)
  *
  *      ; save ra, fp
- *      sd ra, #offset3(sp)         ; save RA (8 bytes)
- *      sd fp, #(offset3+8)(sp)     ; save FP (8 bytes)
+ *      sd ra, #(offset3+8)(sp)   ; save RA (8 bytes)
+ *      sd fp, #offset3(sp)       ; save FP (8 bytes)
  *
  * Notes:
  * 1. FP is always saved, and the first store is FP, RA.
@@ -7742,9 +7681,9 @@ void CodeGen::instGen_MemoryBarrier(BarrierKind barrierKind)
  * 3. For frames with varargs, not implemented completely and not tested !
  * 4. We allocate the frame here; no further changes to SP are allowed (except in the body, for localloc).
  *
- * For functions with GS and localloc, we change the frame so the frame pointer and RA are saved at the top
- * of the frame, just under the varargs registers (if any). Note that the funclet frames must follow the same
- * rule, and both main frame and funclet frames (if any) must put PSPSym in the same offset from Caller-SP.
+ * For functions with GS and localloc, we save the frame pointer and RA at the top
+ * of the frame. Note that the funclet frames must follow the same rule,
+ * and both main frame and funclet frames (if any) must put PSPSym at the same offset from Caller-SP.
  * Since this frame type is relatively rare, we force using it via stress modes, for additional coverage.
  *
  * The frames look like the following (simplified to only include components that matter for establishing the
@@ -7752,6 +7691,10 @@ void CodeGen::instGen_MemoryBarrier(BarrierKind barrierKind)
  *
  * The RISC-V's frame layout is liking:
  *
+ *    If we need to generate a GS cookie, we need to make sure the saved frame pointer and return address
+ *    (FP and RA) are protected from buffer overrun by the GS cookie.
+ *    So we always save FP/RA along with the rest of the callee-saved registers, above the GS cookie.
+ *
  *      |                       |
  *      |-----------------------|
  *      |  incoming arguments   |
@@ -7763,15 +7706,17 @@ void CodeGen::instGen_MemoryBarrier(BarrierKind barrierKind)
  *      |-----------------------|
  *      |        PSP slot       | // 8 bytes (omitted in NativeAOT ABI)
  *      |-----------------------|
- *      | locals, temps, etc.   |
- *      |-----------------------|
- *      |  possible GS cookie   |
+ *      |Callee saved registers | // not including FP/RA; multiple of 8 bytes
  *      |-----------------------|
  *      |      Saved FP         | // 8 bytes
  *      |-----------------------|
  *      |      Saved RA         | // 8 bytes
  *      |-----------------------|
- *      |Callee saved registers | // not including FP/RA; multiple of 8 bytes
+ *      |  possible GS cookie   |
+ *      |-----------------------|
+ *      | locals, temps, etc.   |
+ *      |-----------------------|
+ *      |  possible GS cookie   |
  *      |-----------------------|
  *      |   Outgoing arg space  | // multiple of 8 bytes; if required (i.e., #outsz != 0)
  *      |-----------------------| <---- Ambient SP
@@ -7785,12 +7730,6 @@ void CodeGen::genPushCalleeSavedRegisters(regNumber initReg, bool* pInitRegZeroe
 {
     assert(compiler->compGeneratingProlog);
 
-    // The 'initReg' could have been calculated as one of the callee-saved registers (let's say T0, T1 and T2 are in
-    // use, so the next possible register is S1, which should be callee-save register). This is fine, as long as we
-    // save callee-saved registers before using 'initReg' for the first time. Instead, we can use REG_SCRATCH
-    // beforehand. We don't care if REG_SCRATCH will be overwritten, so we'll skip 'RegZeroed check'.
-    //
-    // Unlike on x86/x64, we can also push float registers to stack
     regMaskTP rsPushRegs = regSet.rsGetModifiedCalleeSavedRegsMask();
 
 #if ETW_EBP_FRAMED
@@ -7800,7 +7739,7 @@ void CodeGen::genPushCalleeSavedRegisters(regNumber initReg, bool* pInitRegZeroe
     }
 #endif
 
-    // On RV64 we always use the FP (frame-pointer)
+    // We always use the FP (frame-pointer).
     assert(isFramePointerUsed());
 
     //
@@ -7823,25 +7762,25 @@ void CodeGen::genPushCalleeSavedRegisters(regNumber initReg, bool* pInitRegZeroe
     // is not worth it.
     //
 
-    // we will push callee-saved registers along with fp and ra registers to stack
-    regMaskTP rsPushRegsMask = rsPushRegs | RBM_FP | RBM_RA;
-    regSet.rsMaskCalleeSaved = rsPushRegsMask;
+    regSet.rsMaskCalleeSaved = rsPushRegs | RBM_FPBASE | RBM_RA;
 
 #ifdef DEBUG
-    if (compiler->compCalleeRegsPushed != genCountBits(rsPushRegsMask))
+    JITDUMP("Frame info. #outsz=%d; #framesz=%d; LclFrameSize=%d;\n", unsigned(compiler->lvaOutgoingArgSpaceSize),
+            genTotalFrameSize(), compiler->compLclFrameSize);
+
+    if (compiler->compCalleeRegsPushed != genCountBits(regSet.rsMaskCalleeSaved))
     {
         printf("Error: unexpected number of callee-saved registers to push. Expected: %d. Got: %d ",
-               compiler->compCalleeRegsPushed, genCountBits(rsPushRegsMask));
-        dspRegMask(rsPushRegsMask);
+               compiler->compCalleeRegsPushed, genCountBits(rsPushRegs | RBM_FPBASE | RBM_RA));
+        dspRegMask(rsPushRegs | RBM_FPBASE | RBM_RA);
         printf("\n");
-        assert(compiler->compCalleeRegsPushed == genCountBits(rsPushRegsMask));
+        assert(compiler->compCalleeRegsPushed == genCountBits(rsPushRegs | RBM_FPBASE | RBM_RA));
     }
 
     if (verbose)
     {
-        regMaskTP maskSaveRegsFloat = rsPushRegs & RBM_FLT_CALLEE_SAVED;
-        regMaskTP maskSaveRegsInt   = rsPushRegs & RBM_INT_CALLEE_SAVED;
-
+        regMaskTP maskSaveRegsFloat = rsPushRegs & RBM_ALLFLOAT;
+        regMaskTP maskSaveRegsInt   = rsPushRegs & ~maskSaveRegsFloat;
         printf("Save float regs: ");
         dspRegMask(maskSaveRegsFloat);
         printf("\n");
@@ -7851,80 +7790,57 @@ void CodeGen::genPushCalleeSavedRegisters(regNumber initReg, bool* pInitRegZeroe
     }
 #endif // DEBUG
 
-    // The frameType number is arbitrary, is defined below, and corresponds to one of the frame styles we
-    // generate based on various sizes.
-    int frameType = 0;
-
-    // The amount to subtract from SP before starting to store the callee-saved registers. It might be folded into the
-    // first save instruction as a "predecrement" amount, if possible.
-    int calleeSaveSPDelta = 0;
-
-    // If we need to generate a GS cookie, we need to make sure the saved frame pointer and return address
-    // (FP and RA) are protected from buffer overrun by the GS cookie. If FP/RA are at the lowest addresses,
-    // then they are safe, since they are lower than any unsafe buffers. And the GS cookie we add will
-    // protect our caller's frame. If we have a localloc, however, that is dynamically placed lower than our
-    // saved FP/RA. In that case, we save FP/RA along with the rest of the callee-saved registers, above
-    // the GS cookie.
-    //
-    // After the frame is allocated, the frame pointer is established, pointing at the saved frame pointer to
-    // create a frame pointer chain.
-    //
-
-    // This will be the starting place for saving the callee-saved registers, in increasing order.
-    int offset = compiler->lvaOutgoingArgSpaceSize;
-
     int totalFrameSize = genTotalFrameSize();
+    int leftFrameSize  = 0;
+    int localFrameSize = compiler->compLclFrameSize;
+    if (compiler->lvaPSPSym != BAD_VAR_NUM)
+    {
+        localFrameSize -= TARGET_POINTER_SIZE;
+    }
+    if ((compiler->lvaMonAcquired != BAD_VAR_NUM) && !compiler->opts.IsOSR())
+    {
+        localFrameSize -= TARGET_POINTER_SIZE;
+    }
 
-    emitter* emit = GetEmitter();
+#ifdef DEBUG
+    if (compiler->opts.disAsm)
+    {
+        printf("Frame info. #outsz=%d; #framesz=%d; lcl=%d\n", unsigned(compiler->lvaOutgoingArgSpaceSize),
+               genTotalFrameSize(), localFrameSize);
+    }
+#endif
 
-    // ensure offset of sd/ld
+    int FP_offset = localFrameSize;
     if (totalFrameSize <= 2040)
     {
-        frameType = 1;
-
-        emit->emitIns_R_R_I(INS_addi, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, -totalFrameSize);
+        GetEmitter()->emitIns_R_R_I(INS_addi, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, -totalFrameSize);
         compiler->unwindAllocStack(totalFrameSize);
-
-        JITDUMP("Frame type 1. #outsz=%d; #framesz=%d; LclFrameSize=%d\n", unsigned(compiler->lvaOutgoingArgSpaceSize),
-                totalFrameSize, compiler->compLclFrameSize);
     }
     else
     {
-        frameType = 2;
-        // we have to adjust stack pointer; probably using add instead of addi
-
-        JITDUMP("Frame type 2. #outsz=%d; #framesz=%d; LclFrameSize=%d\n", unsigned(compiler->lvaOutgoingArgSpaceSize),
-                totalFrameSize, compiler->compLclFrameSize);
-
-        if ((offset + (compiler->compCalleeRegsPushed << 3)) >= 2040)
-        {
-            offset            = totalFrameSize - compiler->lvaOutgoingArgSpaceSize;
-            calleeSaveSPDelta = AlignUp((UINT)offset, STACK_ALIGN);
-            offset            = calleeSaveSPDelta - offset;
-
-            genStackPointerAdjustment(-calleeSaveSPDelta, REG_SCRATCH, nullptr, /* reportUnwindData */ true);
-        }
-        else
+        if ((localFrameSize + (compiler->compCalleeRegsPushed << 3)) > 2040)
         {
-            genStackPointerAdjustment(-totalFrameSize, REG_SCRATCH, nullptr, /* reportUnwindData */ true);
+            leftFrameSize  = localFrameSize & -16;
+            totalFrameSize = totalFrameSize - (localFrameSize & -16);
+            FP_offset      = localFrameSize & 0xf;
         }
+        // The 'initReg' could have been calculated as one of the callee-saved registers (let's say T0, T1 and T2 are in
+        // use, so the next possible register is S1, which should be callee-save register). This is fine, as long as we
+        // save callee-saved registers before using 'initReg' for the first time. Instead, we can use REG_SCRATCH
+        // beforehand. We don't care if REG_SCRATCH will be overwritten, so we'll skip 'RegZeroed check'.
+        // TODO-RV64: this should be resolved before calling `genPushCalleeSavedRegisters`.
+        genStackPointerAdjustment(-totalFrameSize, REG_SCRATCH, pInitRegZeroed, /* reportUnwindData */ true);
     }
+    GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_FP, REG_SPBASE, FP_offset);
+    compiler->unwindSaveReg(REG_FP, FP_offset);
 
-    JITDUMP("    offset=%d, calleeSaveSPDelta=%d\n", offset, calleeSaveSPDelta);
-
-    genSaveCalleeSavedRegistersHelp(rsPushRegs, offset, 0);
-    offset += (int)(genCountBits(rsPushRegs) << 3); // each reg has 8 bytes
+    GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_RA, REG_SPBASE, FP_offset + 8);
+    compiler->unwindSaveReg(REG_RA, FP_offset + 8);
 
-    // From now on, we can safely use initReg.
+    genSaveCalleeSavedRegistersHelp(rsPushRegs, FP_offset + 16, 0);
 
-    emit->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_RA, REG_SPBASE, offset);
-    compiler->unwindSaveReg(REG_RA, offset);
-
-    emit->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_FP, REG_SPBASE, offset + 8);
-    compiler->unwindSaveReg(REG_FP, offset + 8);
-
-    JITDUMP("    offsetSpToSavedFp=%d\n", offset + 8);
-    genEstablishFramePointer(offset + 8, /* reportUnwindData */ true);
+    JITDUMP("    offsetSpToSavedFp=%d\n", FP_offset);
+    genEstablishFramePointer(FP_offset, /* reportUnwindData */ true);
 
     // For varargs, home the incoming arg registers last. Note that there is nothing to unwind here,
     // so we just report "NOP" unwind codes. If there's no more frame setup after this, we don't
@@ -7935,18 +7851,9 @@ void CodeGen::genPushCalleeSavedRegisters(regNumber initReg, bool* pInitRegZeroe
         NYI_RISCV64("genPushCalleeSavedRegisters unsupports compIsVarArgs");
     }
 
-#ifdef DEBUG
-    if (compiler->opts.disAsm)
-    {
-        printf("DEBUG: RISCV64, frameType:%d\n\n", frameType);
-    }
-#endif
-
-    if (calleeSaveSPDelta != 0)
+    if (leftFrameSize != 0)
     {
-        assert(frameType == 2);
-        calleeSaveSPDelta = totalFrameSize - calleeSaveSPDelta;
-        genStackPointerAdjustment(-calleeSaveSPDelta, initReg, pInitRegZeroed, /* reportUnwindData */ true);
+        genStackPointerAdjustment(-leftFrameSize, REG_SCRATCH, pInitRegZeroed, /* reportUnwindData */ true);
     }
 }
 
@@ -7956,80 +7863,72 @@ void CodeGen::genPopCalleeSavedRegisters(bool jmpEpilog)
 
     regMaskTP regsToRestoreMask = regSet.rsGetModifiedCalleeSavedRegsMask();
 
-    // On RV64 we always use the FP (frame-pointer)
     assert(isFramePointerUsed());
 
-    int totalFrameSize     = genTotalFrameSize();
-    int remainingSPSize    = totalFrameSize;
-    int callerSPtoFPdelta  = 0;
-    int calleeSaveSPOffset = 0; // This will be the starting place for restoring
-                                // the callee-saved registers, in decreasing order.
+    int totalFrameSize = genTotalFrameSize();
+    int localFrameSize = compiler->compLclFrameSize;
+    if (compiler->lvaPSPSym != BAD_VAR_NUM)
+    {
+        localFrameSize -= TARGET_POINTER_SIZE;
+    }
+    if ((compiler->lvaMonAcquired != BAD_VAR_NUM) && !compiler->opts.IsOSR())
+    {
+        localFrameSize -= TARGET_POINTER_SIZE;
+    }
 
-    emitter* emit = GetEmitter();
+    JITDUMP("Frame type. #outsz=%d; #framesz=%d; #calleeSaveRegsPushed:%d; "
+            "localloc? %s\n",
+            unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compCalleeRegsPushed,
+            dspBool(compiler->compLocallocUsed));
 
-    // ensure offset of sd/ld
+    emitter* emit            = GetEmitter();
+    int      FP_offset       = localFrameSize;
+    int      remainingSPSize = totalFrameSize;
     if (totalFrameSize <= 2040)
     {
-        JITDUMP("Frame type 1. #outsz=%d; #framesz=%d; localloc? %s\n", unsigned(compiler->lvaOutgoingArgSpaceSize),
-                totalFrameSize, dspBool(compiler->compLocallocUsed));
-
         if (compiler->compLocallocUsed)
         {
-            callerSPtoFPdelta = (compiler->compCalleeRegsPushed << 3) - 8 + compiler->lvaOutgoingArgSpaceSize;
+            int SPtoFPdelta = genSPtoFPdelta();
+            // Restore sp from fp
+            emit->emitIns_R_R_I(INS_addi, EA_PTRSIZE, REG_SPBASE, REG_FPBASE, -SPtoFPdelta);
+            compiler->unwindSetFrameReg(REG_FPBASE, SPtoFPdelta);
         }
-        calleeSaveSPOffset = compiler->lvaOutgoingArgSpaceSize;
-        // remainingSPSize = totalFrameSize;
     }
     else
     {
-        JITDUMP("Frame type 2. #outsz=%d; #framesz=%d; calleeSaveRegsPushed: %d; localloc? %s\n",
-                unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compCalleeRegsPushed,
-                dspBool(compiler->compLocallocUsed));
-
-        if ((compiler->lvaOutgoingArgSpaceSize + (compiler->compCalleeRegsPushed << 3)) > 2047)
+        if (compiler->compLocallocUsed)
         {
-            calleeSaveSPOffset = compiler->lvaOutgoingArgSpaceSize & 0xfffffff0;
-
-            if (compiler->compLocallocUsed)
+            int SPtoFPdelta = genSPtoFPdelta();
+            // Restore sp from fp
+            if (emitter::isValidSimm12(SPtoFPdelta))
             {
-                callerSPtoFPdelta = (compiler->compCalleeRegsPushed << 3) - 8;
+                emit->emitIns_R_R_I(INS_addi, EA_PTRSIZE, REG_SPBASE, REG_FPBASE, -SPtoFPdelta);
             }
             else
             {
-                genStackPointerAdjustment(calleeSaveSPOffset, REG_RA, nullptr, /* reportUnwindData */ true);
+                regNumber tempReg = rsGetRsvdReg();
+                emit->emitLoadImmediate(EA_PTRSIZE, tempReg, SPtoFPdelta);
+                emit->emitIns_R_R_R(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_FPBASE, tempReg);
             }
-            remainingSPSize    = totalFrameSize - calleeSaveSPOffset;
-            calleeSaveSPOffset = compiler->lvaOutgoingArgSpaceSize - calleeSaveSPOffset;
         }
-        else
+        if ((localFrameSize + (compiler->compCalleeRegsPushed << 3)) > 2040)
         {
-            if (compiler->compLocallocUsed)
-            {
-                callerSPtoFPdelta = (compiler->compCalleeRegsPushed << 3) - 8 + compiler->lvaOutgoingArgSpaceSize;
-            }
-            calleeSaveSPOffset = compiler->lvaOutgoingArgSpaceSize;
-            // remainingSPSize = totalFrameSize;
-        }
-    }
+            remainingSPSize = localFrameSize & -16;
+            genStackPointerAdjustment(remainingSPSize, REG_RA, nullptr, /* reportUnwindData */ true);
 
-    if (compiler->compLocallocUsed)
-    {
-        // restore sp form fp: addi sp, -#callerSPtoFPdelta(fp)
-        emit->emitIns_R_R_I(INS_addi, EA_PTRSIZE, REG_SPBASE, REG_FPBASE, -callerSPtoFPdelta);
-        compiler->unwindSetFrameReg(REG_FPBASE, callerSPtoFPdelta);
+            remainingSPSize = totalFrameSize - remainingSPSize;
+            FP_offset       = localFrameSize & 0xf;
+        }
     }
 
-    JITDUMP("    calleeSaveSPOffset=%d, callerSPtoFPdelta=%d\n", calleeSaveSPOffset, callerSPtoFPdelta);
-    genRestoreCalleeSavedRegistersHelp(regsToRestoreMask, calleeSaveSPOffset, 0);
-
-    // restore ra/fp regs
-    calleeSaveSPOffset += (compiler->compCalleeRegsPushed - 2) << 3;
+    JITDUMP("    calleeSaveSPOffset=%d\n", FP_offset + 16);
+    genRestoreCalleeSavedRegistersHelp(regsToRestoreMask, FP_offset + 16, 0);
 
-    emit->emitIns_R_R_I(INS_ld, EA_PTRSIZE, REG_RA, REG_SPBASE, calleeSaveSPOffset);
-    compiler->unwindSaveReg(REG_RA, calleeSaveSPOffset);
+    emit->emitIns_R_R_I(INS_ld, EA_PTRSIZE, REG_RA, REG_SPBASE, FP_offset + 8);
+    compiler->unwindSaveReg(REG_RA, FP_offset + 8);
 
-    emit->emitIns_R_R_I(INS_ld, EA_PTRSIZE, REG_FP, REG_SPBASE, calleeSaveSPOffset + 8);
-    compiler->unwindSaveReg(REG_FP, calleeSaveSPOffset + 8);
+    emit->emitIns_R_R_I(INS_ld, EA_PTRSIZE, REG_FP, REG_SPBASE, FP_offset);
+    compiler->unwindSaveReg(REG_FP, FP_offset);
 
     if (emitter::isValidUimm11(remainingSPSize))
     {
@@ -8043,7 +7942,7 @@ void CodeGen::genPopCalleeSavedRegisters(bool jmpEpilog)
     }
     compiler->unwindAllocStack(remainingSPSize);
 
-    // for OSR we have to adjust SP to remove tier0 frame
+    // For OSR, we must also adjust the SP to remove the Tier0 frame.
     if (compiler->opts.IsOSR())
     {
         const int tier0FrameSize = compiler->info.compPatchpointInfo->TotalFrameSize();
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index a32867660a01c3..5abb0c5dcd169e 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -5834,12 +5834,18 @@ void Compiler::generatePatchpointInfo()
     //
     const int totalFrameSize = codeGen->genTotalFrameSize() + TARGET_POINTER_SIZE;
     const int offsetAdjust   = 0;
-#elif defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
+#elif defined(TARGET_ARM64)
     // SP is not manipulated by calls so no frame size adjustment needed.
     // Local Offsets may need adjusting, if FP is at bottom of frame.
     //
     const int totalFrameSize = codeGen->genTotalFrameSize();
     const int offsetAdjust   = codeGen->genSPtoFPdelta() - totalFrameSize;
+#elif defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
+    // SP is not manipulated by calls so no frame size adjustment needed.
+    // Local Offsets are adjusted which relative to SP.
+    //
+    const int totalFrameSize = codeGen->genTotalFrameSize();
+    const int offsetAdjust   = codeGen->genSPtoFPdelta();
 #else
     NYI("patchpoint info generation");
     const int offsetAdjust   = 0;
diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index c101955fdea7ed..935cf233530350 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -5669,23 +5669,52 @@ void Compiler::lvaFixVirtualFrameOffsets()
         // We set FP to be after LR, FP
         delta += 2 * REGSIZE_BYTES;
     }
-#elif defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
+#elif defined(TARGET_AMD64) || defined(TARGET_ARM64)
     else
     {
         // FP is used.
         JITDUMP("--- delta bump %d for FP frame\n", codeGen->genTotalFrameSize() - codeGen->genSPtoFPdelta());
         delta += codeGen->genTotalFrameSize() - codeGen->genSPtoFPdelta();
     }
-#endif // TARGET_AMD64 || TARGET_ARM64 || TARGET_LOONGARCH64 || TARGET_RISCV64
+#elif defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
+    else
+    {
+        // FP is used.
+        delta += (compCalleeRegsPushed << 3);
+
+        if ((lvaMonAcquired != BAD_VAR_NUM) && !opts.IsOSR())
+        {
+            int offset = lvaTable[lvaMonAcquired].GetStackOffset() + delta;
+            lvaTable[lvaMonAcquired].SetStackOffset(offset);
+
+            if (lvaPSPSym != BAD_VAR_NUM)
+            {
+                int offset = lvaTable[lvaPSPSym].GetStackOffset() + delta;
+                lvaTable[lvaPSPSym].SetStackOffset(offset);
+                delta += TARGET_POINTER_SIZE;
+            }
+
+            delta += lvaLclSize(lvaMonAcquired);
+        }
+        else if (lvaPSPSym != BAD_VAR_NUM)
+        {
+            int offset = lvaTable[lvaPSPSym].GetStackOffset() + delta;
+            lvaTable[lvaPSPSym].SetStackOffset(offset);
+            delta += TARGET_POINTER_SIZE;
+        }
+
+        JITDUMP("--- delta bump %d for FP frame\n", delta);
+    }
+#endif // TARGET_LOONGARCH64 || TARGET_RISCV64
 
     if (opts.IsOSR())
     {
-#if defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
+#if defined(TARGET_AMD64) || defined(TARGET_ARM64)
         // Stack offset includes Tier0 frame.
         //
         JITDUMP("--- delta bump %d for OSR + Tier0 frame\n", info.compPatchpointInfo->TotalFrameSize());
         delta += info.compPatchpointInfo->TotalFrameSize();
-#endif
+#endif // !TARGET_LOONGARCH64 || !TARGET_RISCV64
     }
 
     JITDUMP("--- virtual stack offset to actual stack offset delta is %d\n", delta);
@@ -5775,26 +5804,20 @@ void Compiler::lvaFixVirtualFrameOffsets()
 
 #endif // FEATURE_FIXED_OUT_ARGS
 
-#if defined(TARGET_ARM64)
+#if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
     // We normally add alignment below the locals between them and the outgoing
     // arg space area. When we store fp/lr(ra) at the bottom, however, this will
     // be below the alignment. So we should not apply the alignment adjustment to
     // them. It turns out we always store these at +0 and +8 of the FP,
     // so instead of dealing with skipping adjustment just for them we just set
     // them here always.
+    // For LoongArch64 and RISCV64, the RA is always at fp+8.
     assert(codeGen->isFramePointerUsed());
     if (lvaRetAddrVar != BAD_VAR_NUM)
     {
         lvaTable[lvaRetAddrVar].SetStackOffset(REGSIZE_BYTES);
     }
-#elif defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
-    assert(codeGen->isFramePointerUsed());
-    if (lvaRetAddrVar != BAD_VAR_NUM)
-    {
-        // For LoongArch64 and RISCV64, the RA is below the fp. see the `genPushCalleeSavedRegisters`
-        lvaTable[lvaRetAddrVar].SetStackOffset(-REGSIZE_BYTES);
-    }
-#endif // !TARGET_LOONGARCH64
+#endif // TARGET_ARM64 || TARGET_LOONGARCH64 || TARGET_RISCV64
 }
 
 #ifdef TARGET_ARM
@@ -6548,9 +6571,11 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
     //
     if (opts.IsOSR())
     {
+#if !defined(TARGET_LOONGARCH64) && !defined(TARGET_RISCV64)
         originalFrameSize    = info.compPatchpointInfo->TotalFrameSize();
         originalFrameStkOffs = stkOffs;
         stkOffs -= originalFrameSize;
+#endif
     }
 
 #ifdef TARGET_XARCH
@@ -6606,7 +6631,8 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
 
 #elif defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
 
-    assert(compCalleeRegsPushed >= 2);
+    assert(compCalleeRegsPushed >= 2); // always FP/RA.
+    stkOffs -= (compCalleeRegsPushed << 3);
 
 #else // !TARGET_LOONGARCH64 && !TARGET_RISCV64
 #ifdef TARGET_ARM
@@ -7331,14 +7357,9 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
     }
 #endif // FEATURE_FIXED_OUT_ARGS
 
-#if defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
-    // For LoongArch64 and RISCV64, CalleeSavedRegs are at bottom.
-    int pushedCount = 0;
-#else
     // compLclFrameSize equals our negated virtual stack offset minus the pushed registers and return address
     // and the pushed frame pointer register which for some strange reason isn't part of 'compCalleeRegsPushed'.
     int pushedCount = compCalleeRegsPushed;
-#endif
 
 #ifdef TARGET_ARM64
     if (info.compIsVarArgs)
diff --git a/src/coreclr/jit/regset.h b/src/coreclr/jit/regset.h
index dae93baebad306..20b55610594fc6 100644
--- a/src/coreclr/jit/regset.h
+++ b/src/coreclr/jit/regset.h
@@ -158,8 +158,9 @@ class RegSet
     regMaskTP _rsMaskVars; // backing store for rsMaskVars property
 
 #if defined(TARGET_ARMARCH) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
+    // TODO: the funclet's callee-saved registers should not be shared with the main function.
     regMaskTP rsMaskCalleeSaved; // mask of the registers pushed/popped in the prolog/epilog
-#endif                           // TARGET_ARMARCH || TARGET_LOONGARCH64
+#endif                           // TARGET_ARMARCH || TARGET_LOONGARCH64 || TARGET_RISCV64
 
 public:                    // TODO-Cleanup: Should be private, but Compiler uses it
     regMaskTP rsMaskResvd; // mask of the registers that are reserved for special purposes (typically empty)

From a9dfea9274eb0d3b18dc3769f5b7bd30a4fb059c Mon Sep 17 00:00:00 2001
From: Qiao Pengcheng <qiaopengcheng@loongson.cn>
Date: Thu, 18 Apr 2024 19:44:59 +0800
Subject: [PATCH 2/4] update code annotation.

---
 src/coreclr/jit/lclvars.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index 935cf233530350..7f9f47ff17f0ec 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -5714,7 +5714,7 @@ void Compiler::lvaFixVirtualFrameOffsets()
         //
         JITDUMP("--- delta bump %d for OSR + Tier0 frame\n", info.compPatchpointInfo->TotalFrameSize());
         delta += info.compPatchpointInfo->TotalFrameSize();
-#endif // !TARGET_LOONGARCH64 || !TARGET_RISCV64
+#endif // TARGET_AMD64 || TARGET_ARM64
     }
 
     JITDUMP("--- virtual stack offset to actual stack offset delta is %d\n", delta);

From 964df7b412920d2dd99d4cc1d1348d284beb79f7 Mon Sep 17 00:00:00 2001
From: Qiao Pengcheng <qiaopengcheng@loongson.cn>
Date: Fri, 19 Apr 2024 14:28:26 +0800
Subject: [PATCH 3/4] revert the OSR offset's changing which will be pushed by
 a new PR.

---
 src/coreclr/jit/codegencommon.cpp      | 34 ++------------------------
 src/coreclr/jit/codegenloongarch64.cpp |  2 +-
 src/coreclr/jit/codegenriscv64.cpp     |  2 +-
 src/coreclr/jit/compiler.cpp           |  8 +-----
 src/coreclr/jit/lclvars.cpp            |  4 ++-
 5 files changed, 8 insertions(+), 42 deletions(-)

diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp
index 98634c890530b2..c8211a7e731440 100644
--- a/src/coreclr/jit/codegencommon.cpp
+++ b/src/coreclr/jit/codegencommon.cpp
@@ -4086,7 +4086,7 @@ void CodeGen::genEnregisterOSRArgsAndLocals()
 
         GetEmitter()->emitIns_R_AR(ins_Load(lclTyp), size, varDsc->GetRegNum(), genFramePointerReg(), offset);
 
-#elif defined(TARGET_ARM64)
+#elif defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
 
         // Patchpoint offset is from top of Tier0 frame
         //
@@ -4118,37 +4118,7 @@ void CodeGen::genEnregisterOSRArgsAndLocals()
 
         genInstrWithConstant(ins_Load(lclTyp), size, varDsc->GetRegNum(), genFramePointerReg(), offset, initReg);
         *pInitRegZeroed = false;
-#elif defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
-
-        // Patchpoint offset is from top of Tier0 frame
-        //
-        // We need to determine the frame-pointer relative
-        // offset for this variable in the osr frame.
-        //
-        // First there is no need to ajust stkOffs
-        // as it relative to sp within Tier0 frame
-
-        // then add the OSR frame size
-        //
-        const int osrFrameSize = genTotalFrameSize();
-
-        // then subtract OSR SP-FP delta
-        //
-        const int osrSpToFpDelta = genSPtoFPdelta();
-
-        //                 | => tier0 top of frame relative
-        //                 |         + => osr bottom of frame (sp) relative
-        //                 |         |              - => osr fp relative
-        //                 |         |              |
-        const int offset = stkOffs + osrFrameSize - osrSpToFpDelta;
-
-        JITDUMP("---OSR--- V%02u (reg) Tier0 virtual offset %d OSR frame size %d OSR sp-fp "
-                "delta %d total offset %d (0x%x)\n",
-                varNum, stkOffs, osrFrameSize, osrSpToFpDelta, offset, offset);
-
-        genInstrWithConstant(ins_Load(lclTyp), size, varDsc->GetRegNum(), genFramePointerReg(), offset, initReg);
-        *pInitRegZeroed = false;
-#endif // TARGET_LOONGARCH64 || TARGET_RISCV64
+#endif // TARGET_ARM64 || TARGET_LOONGARCH64 || TARGET_RISCV64
     }
 }
 
diff --git a/src/coreclr/jit/codegenloongarch64.cpp b/src/coreclr/jit/codegenloongarch64.cpp
index fd5114ce06153e..1bd29a432ce170 100644
--- a/src/coreclr/jit/codegenloongarch64.cpp
+++ b/src/coreclr/jit/codegenloongarch64.cpp
@@ -691,7 +691,7 @@ void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, in
  *      |-----------------------|
  *      |        PSP slot       | // 8 bytes (omitted in NativeAOT ABI)
  *      |-----------------------|
- *      |Callee saved registers | // multiple of 8 bytes, not includting FP/RA
+ *      |Callee saved registers | // multiple of 8 bytes, not including FP/RA
  *      |-----------------------|
  *      |      Saved FP, RA     | // 16 bytes
  *      |-----------------------|
diff --git a/src/coreclr/jit/codegenriscv64.cpp b/src/coreclr/jit/codegenriscv64.cpp
index bfd79cb04e5de8..203a4480345028 100644
--- a/src/coreclr/jit/codegenriscv64.cpp
+++ b/src/coreclr/jit/codegenriscv64.cpp
@@ -787,7 +787,7 @@ void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, in
  *      |-----------------------|
  *      ~  alignment padding    ~ // To make the whole frame 16 byte aligned
  *      |-----------------------|
- *      |Callee saved registers | // multiple of 8 bytes, not includting FP/RA
+ *      |Callee saved registers | // multiple of 8 bytes, not including FP/RA
  *      |-----------------------|
  *      |      Saved FP, RA     | // 16 bytes
  *      |-----------------------|
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index 5abb0c5dcd169e..a32867660a01c3 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -5834,18 +5834,12 @@ void Compiler::generatePatchpointInfo()
     //
     const int totalFrameSize = codeGen->genTotalFrameSize() + TARGET_POINTER_SIZE;
     const int offsetAdjust   = 0;
-#elif defined(TARGET_ARM64)
+#elif defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
     // SP is not manipulated by calls so no frame size adjustment needed.
     // Local Offsets may need adjusting, if FP is at bottom of frame.
     //
     const int totalFrameSize = codeGen->genTotalFrameSize();
     const int offsetAdjust   = codeGen->genSPtoFPdelta() - totalFrameSize;
-#elif defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
-    // SP is not manipulated by calls so no frame size adjustment needed.
-    // Local Offsets are adjusted which relative to SP.
-    //
-    const int totalFrameSize = codeGen->genTotalFrameSize();
-    const int offsetAdjust   = codeGen->genSPtoFPdelta();
 #else
     NYI("patchpoint info generation");
     const int offsetAdjust   = 0;
diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index 7f9f47ff17f0ec..e6645e1f03d8f4 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -6571,7 +6571,9 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
     //
     if (opts.IsOSR())
     {
-#if !defined(TARGET_LOONGARCH64) && !defined(TARGET_RISCV64)
+#if defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
+        originalFrameStkOffs = info.compPatchpointInfo->TotalFrameSize();
+#else
         originalFrameSize    = info.compPatchpointInfo->TotalFrameSize();
         originalFrameStkOffs = stkOffs;
         stkOffs -= originalFrameSize;

From e254dd6ce87f02fc385e118977d8c358a090995b Mon Sep 17 00:00:00 2001
From: Qiao Pengcheng <qiaopengcheng@loongson.cn>
Date: Mon, 22 Apr 2024 15:25:20 +0800
Subject: [PATCH 4/4] amend codegenriscv64.cpp for CRs.

---
 src/coreclr/jit/codegenriscv64.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/coreclr/jit/codegenriscv64.cpp b/src/coreclr/jit/codegenriscv64.cpp
index 203a4480345028..b7c119be94987c 100644
--- a/src/coreclr/jit/codegenriscv64.cpp
+++ b/src/coreclr/jit/codegenriscv64.cpp
@@ -7829,7 +7829,7 @@ void CodeGen::genPushCalleeSavedRegisters(regNumber initReg, bool* pInitRegZeroe
         // save callee-saved registers before using 'initReg' for the first time. Instead, we can use REG_SCRATCH
         // beforehand. We don't care if REG_SCRATCH will be overwritten, so we'll skip 'RegZeroed check'.
         // TODO-RV64: this should be resolved before calling `genPushCalleeSavedRegisters`.
-        genStackPointerAdjustment(-totalFrameSize, REG_SCRATCH, pInitRegZeroed, /* reportUnwindData */ true);
+        genStackPointerAdjustment(-totalFrameSize, REG_SCRATCH, nullptr, /* reportUnwindData */ true);
     }
     GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_FP, REG_SPBASE, FP_offset);
     compiler->unwindSaveReg(REG_FP, FP_offset);
@@ -7853,7 +7853,7 @@ void CodeGen::genPushCalleeSavedRegisters(regNumber initReg, bool* pInitRegZeroe
 
     if (leftFrameSize != 0)
     {
-        genStackPointerAdjustment(-leftFrameSize, REG_SCRATCH, pInitRegZeroed, /* reportUnwindData */ true);
+        genStackPointerAdjustment(-leftFrameSize, REG_SCRATCH, nullptr, /* reportUnwindData */ true);
     }
 }