[AArch64] Async unwind - Fix MTE codegen emitting frame adjustments in a loop

When untagging the stack, the compiler may emit a sequence like:

```
.LBB0_1:
  st2g sp, [sp], #32
  sub x8, x8, #32
  cbnz x8, .LBB0_1
  stg sp, [sp], #16
```

These stack adjustments cannot be described by CFI instructions.

This patch disables merging of SP update with untagging, i.e. makes the compiler use an additional scratch register (there should be plenty available at this point as we are in the epilogue) and generate:

```
  mov x9, sp
  mov x8, #256
  stg x9, [x9], #16
.LBB0_1:
  sub x8, x8, #32
  st2g x9, [x9], #32
  cbnz x8, .LBB0_1
  add sp, sp, #272
```

Merging is disabled only when we need to generate asynchronous unwind tables.

Reviewed By: eugenis

Differential Revision: https://reviews.llvm.org/D114548
swiftlang · Apr 15, 2022 · 24c84bd · 24c84bd
1 parent 5865a74
commit 24c84bd
Show file tree

Hide file tree

Showing 3 changed files with 58 additions and 22 deletions.
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -3235,7 +3235,7 @@ class TagStoreEdit {
   // instructions. May skip if the replacement is not profitable. May invalidate
   // the input iterator and replace it with a valid one.
   void emitCode(MachineBasicBlock::iterator &InsertI,
-                const AArch64FrameLowering *TFI, bool IsLast);
+                const AArch64FrameLowering *TFI, bool TryMergeSPUpdate);
 };
 
 void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
@@ -3374,7 +3374,8 @@ void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
 }
 
 void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
-                            const AArch64FrameLowering *TFI, bool IsLast) {
+                            const AArch64FrameLowering *TFI,
+                            bool TryMergeSPUpdate) {
   if (TagStores.empty())
     return;
   TagStoreInstr &FirstTagStore = TagStores[0];
@@ -3404,8 +3405,8 @@ void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
     emitUnrolled(InsertI);
   } else {
     MachineInstr *UpdateInstr = nullptr;
-    int64_t TotalOffset;
-    if (IsLast) {
+    int64_t TotalOffset = 0;
+    if (TryMergeSPUpdate) {
       // See if we can merge base register update into the STGloop.
       // This is done in AArch64LoadStoreOptimizer for "normal" stores,
       // but STGloop is way too unusual for that, and also it only
@@ -3550,15 +3551,19 @@ MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
   for (auto &Instr : Instrs) {
     if (EndOffset && *EndOffset != Instr.Offset) {
       // Found a gap.
-      TSE.emitCode(InsertI, TFI, /*IsLast = */ false);
+      TSE.emitCode(InsertI, TFI, /*TryMergeSPUpdate = */ false);
       TSE.clear();
     }
 
     TSE.addInstruction(Instr);
     EndOffset = Instr.Offset + Instr.Size;
   }
 
-  TSE.emitCode(InsertI, TFI, /*IsLast = */ true);
+  // Multiple FP/SP updates in a loop cannot be described by CFI instructions.
+  TSE.emitCode(InsertI, TFI, /*TryMergeSPUpdate = */
+               !MBB->getParent()
+                    ->getInfo<AArch64FunctionInfo>()
+                    ->needsAsyncDwarfUnwindInfo());
 
   return InsertI;
 }

diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -589,23 +589,31 @@ void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
 
 // Create a scratch register for the frame index elimination in an instruction.
 // This function has special handling of stack tagging loop pseudos, in which
-// case it can also change the instruction opcode (but not the operands).
+// case it can also change the instruction opcode.
 static Register
-createScratchRegisterForInstruction(MachineInstr &MI,
+createScratchRegisterForInstruction(MachineInstr &MI, unsigned FIOperandNum,
                                     const AArch64InstrInfo *TII) {
   // ST*Gloop have a reserved scratch register in operand 1. Use it, and also
   // replace the instruction with the writeback variant because it will now
   // satisfy the operand constraints for it.
-  if (MI.getOpcode() == AArch64::STGloop) {
-    MI.setDesc(TII->get(AArch64::STGloop_wback));
-    return MI.getOperand(1).getReg();
-  } else if (MI.getOpcode() == AArch64::STZGloop) {
-    MI.setDesc(TII->get(AArch64::STZGloop_wback));
-    return MI.getOperand(1).getReg();
+  Register ScratchReg;
+  if (MI.getOpcode() == AArch64::STGloop ||
+      MI.getOpcode() == AArch64::STZGloop) {
+    assert(FIOperandNum == 3 &&
+           "Wrong frame index operand for STGloop/STZGloop");
+    unsigned Op = MI.getOpcode() == AArch64::STGloop ? AArch64::STGloop_wback
+                                                     : AArch64::STZGloop_wback;
+    ScratchReg = MI.getOperand(1).getReg();
+    MI.getOperand(3).ChangeToRegister(ScratchReg, false, false, true);
+    MI.setDesc(TII->get(Op));
+    MI.tieOperands(1, 3);
   } else {
-    return MI.getMF()->getRegInfo().createVirtualRegister(
-        &AArch64::GPR64RegClass);
+    ScratchReg =
+        MI.getMF()->getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+    MI.getOperand(FIOperandNum)
+        .ChangeToRegister(ScratchReg, false, false, true);
   }
+  return ScratchReg;
 }
 
 void AArch64RegisterInfo::getOffsetOpcodes(
@@ -722,9 +730,9 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   // If we get here, the immediate doesn't fit into the instruction.  We folded
   // as much as possible above.  Handle the rest, providing a register that is
   // SP+LargeImm.
-  Register ScratchReg = createScratchRegisterForInstruction(MI, TII);
+  Register ScratchReg =
+      createScratchRegisterForInstruction(MI, FIOperandNum, TII);
   emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII);
-  MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true);
 }
 
 unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,

diff --git a/llvm/test/CodeGen/AArch64/settag.ll b/llvm/test/CodeGen/AArch64/settag.ll
@@ -146,21 +146,44 @@ entry:
   ret void
 }
 
-define void @stg_alloca17() uwtable {
+define void @stg_alloca17() nounwind {
 ; CHECK-LABEL: stg_alloca17:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    sub sp, sp, #288
-; CHECK-NEXT:    .cfi_def_cfa_offset 288
-; CHECK-NEXT:    str x29, [sp, #272] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    mov x8, #256
+; CHECK-NEXT:    str x29, [sp, #272] // 8-byte Folded Spill
 ; CHECK-NEXT:  .LBB11_1: // %entry
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    st2g sp, [sp], #32
 ; CHECK-NEXT:    sub x8, x8, #32
 ; CHECK-NEXT:    cbnz x8, .LBB11_1
 ; CHECK-NEXT:  // %bb.2: // %entry
 ; CHECK-NEXT:    stg sp, [sp], #16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %a = alloca i8, i32 272, align 16
+  call void @llvm.aarch64.settag(i8* %a, i64 272)
+  ret void
+}
+
+define void @stg_alloca18() uwtable {
+; CHECK-LABEL: stg_alloca18:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #288
+; CHECK-NEXT:    .cfi_def_cfa_offset 288
+; CHECK-NEXT:    str x29, [sp, #272] // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    mov x8, #256
+; CHECK-NEXT:    stg x9, [x9], #16
+; CHECK-NEXT:  .LBB12_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub x8, x8, #32
+; CHECK-NEXT:    st2g x9, [x9], #32
+; CHECK-NEXT:    cbnz x8, .LBB12_1
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    add sp, sp, #272
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0