From 410a5e3aef181c54038da3889dc623a2c762b722 Mon Sep 17 00:00:00 2001 From: Emma Pilkington Date: Thu, 9 May 2024 13:07:35 -0400 Subject: [PATCH 1/3] [AMDGPU] Fix broken MIR generated by gfx11 simulated trap lowering This was breaking the CFG connection between uses of virtual registers after the trap and their definitions before it. Fixes SWDEV-460384. --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 18 ++- .../AMDGPU/GlobalISel/legalize-trap-gfx11.mir | 59 ++++++- llvm/test/CodeGen/AMDGPU/trap-abis.ll | 150 ++++++++++++++++++ 3 files changed, 222 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 6599d0abd135c..17e235291e37b 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2065,14 +2065,26 @@ MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI, .addImm(AMDGPU::SendMsg::ID_INTERRUPT); BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0) .addUse(AMDGPU::TTMP2); - BuildMI(MBB, MI, DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoop); + + if (MBB.succ_empty()) { + BuildMI(MBB, MI, DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoop); + } else { + // HACK: There are some instructions following the trap. Since uses of + // virtual registers in SplitBB (or beyond) that were defined before the + // trap must be dominated by their definitions, we need SplitBB to be a + // successor (even though it's unreachable in practice). This needs to be + // represented by a dummy cmp_eq and cbranch to convince analyzeBranch that + // SplitBB should indeed be considered a successor. 
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_EQ_U32)) + .addUse(SetWaveAbortBit) + .addUse(SetWaveAbortBit); + BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_SCC1)).addMBB(HaltLoop); + } BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5); BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_BRANCH)) .addMBB(HaltLoop); - if (SplitBB != &MBB) - MBB.removeSuccessor(SplitBB); MBB.addSuccessor(HaltLoop); HaltLoop->addSuccessor(HaltLoop); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir index ac98dca00be3d..e217a8be9597d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir @@ -8,7 +8,7 @@ name: test_trap body: | bb.0: ; GFX1100-LABEL: name: test_trap - ; GFX1100: successors: %bb.2(0x80000000) + ; GFX1100: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; GFX1100-NEXT: {{ $}} ; GFX1100-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX1100-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0 @@ -21,7 +21,8 @@ body: | ; GFX1100-NEXT: $m0 = S_MOV_B32 [[S_OR_B32_]] ; GFX1100-NEXT: S_SENDMSG 1, implicit $exec, implicit $m0 ; GFX1100-NEXT: $m0 = S_MOV_B32 $ttmp2 - ; GFX1100-NEXT: S_BRANCH %bb.2 + ; GFX1100-NEXT: S_CMP_EQ_U32 [[S_OR_B32_]], [[S_OR_B32_]], implicit-def $scc + ; GFX1100-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc ; GFX1100-NEXT: {{ $}} ; GFX1100-NEXT: .1: ; GFX1100-NEXT: successors: @@ -45,5 +46,59 @@ body: | G_STORE %0, %1 :: (store 1, addrspace 1) G_TRAP G_STORE %0, %1 :: (store 1, addrspace 1) +... 
+ +--- +name: test_fallthrough_trap +body: | + ; GFX1100-LABEL: name: test_fallthrough_trap + ; GFX1100: bb.0: + ; GFX1100-NEXT: successors: %bb.1(0x80000000), %bb.2(0x00000000) + ; GFX1100-NEXT: {{ $}} + ; GFX1100-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX1100-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0 + ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1) + ; GFX1100-NEXT: S_TRAP 2 + ; GFX1100-NEXT: [[S_SENDMSG_RTN_B32_:%[0-9]+]]:sreg_32 = S_SENDMSG_RTN_B32 128 + ; GFX1100-NEXT: $ttmp2 = S_MOV_B32 $m0 + ; GFX1100-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_SENDMSG_RTN_B32_]], 1023, implicit-def $scc + ; GFX1100-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], 1024, implicit-def $scc + ; GFX1100-NEXT: $m0 = S_MOV_B32 [[S_OR_B32_]] + ; GFX1100-NEXT: S_SENDMSG 1, implicit $exec, implicit $m0 + ; GFX1100-NEXT: $m0 = S_MOV_B32 $ttmp2 + ; GFX1100-NEXT: S_CMP_EQ_U32 [[S_OR_B32_]], [[S_OR_B32_]], implicit-def $scc + ; GFX1100-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GFX1100-NEXT: {{ $}} + ; GFX1100-NEXT: bb.1: + ; GFX1100-NEXT: successors: + ; GFX1100-NEXT: {{ $}} + ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1) + ; GFX1100-NEXT: {{ $}} + ; GFX1100-NEXT: bb.2: + ; GFX1100-NEXT: successors: %bb.2(0x80000000) + ; GFX1100-NEXT: {{ $}} + ; GFX1100-NEXT: S_SETHALT 5 + ; GFX1100-NEXT: S_BRANCH %bb.2 + ; + ; GFX1150-LABEL: name: test_fallthrough_trap + ; GFX1150: bb.0: + ; GFX1150-NEXT: successors: %bb.1(0x80000000) + ; GFX1150-NEXT: {{ $}} + ; GFX1150-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX1150-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0 + ; GFX1150-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1) + ; GFX1150-NEXT: S_TRAP 2 + ; GFX1150-NEXT: {{ $}} + ; GFX1150-NEXT: bb.1: + ; GFX1150-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1) + bb.0: + successors: %bb.1 + + %0:_(s8) = G_CONSTANT i8 0 + %1:_(p1) = G_CONSTANT i64 0 + G_STORE %0, %1 
:: (store 1, addrspace 1) + G_TRAP + bb.1: + G_STORE %0, %1 :: (store 1, addrspace 1) ... diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll index dcc5fbd142c42..cfc1b47671b07 100644 --- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ -264,6 +264,142 @@ ret: ret void } +define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; NOHSA-TRAP-GFX900-LABEL: trap_with_use_after: +; NOHSA-TRAP-GFX900: ; %bb.0: +; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 +; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) +; NOHSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc +; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0) +; NOHSA-TRAP-GFX900-NEXT: s_cbranch_execnz .LBB2_2 +; NOHSA-TRAP-GFX900-NEXT: ; %bb.1: +; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3] +; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0) +; NOHSA-TRAP-GFX900-NEXT: .LBB2_2: +; NOHSA-TRAP-GFX900-NEXT: s_endpgm +; +; HSA-TRAP-GFX803-LABEL: trap_with_use_after: +; HSA-TRAP-GFX803: ; %bb.0: +; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[4:5] +; HSA-TRAP-GFX803-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s4 +; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s5 +; HSA-TRAP-GFX803-NEXT: flat_load_dword v2, v[0:1] glc +; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s6 +; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s7 +; HSA-TRAP-GFX803-NEXT: s_trap 2 +; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2 +; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX803-NEXT: s_endpgm +; +; HSA-TRAP-GFX900-LABEL: trap_with_use_after: +; HSA-TRAP-GFX900: ; %bb.0: +; HSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 +; HSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) +; 
HSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc +; HSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX900-NEXT: s_trap 2 +; HSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3] +; HSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX900-NEXT: s_endpgm +; +; HSA-NOTRAP-GFX900-LABEL: trap_with_use_after: +; HSA-NOTRAP-GFX900: ; %bb.0: +; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 +; HSA-NOTRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) +; HSA-NOTRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc +; HSA-NOTRAP-GFX900-NEXT: s_waitcnt vmcnt(0) +; HSA-NOTRAP-GFX900-NEXT: s_cbranch_execnz .LBB2_2 +; HSA-NOTRAP-GFX900-NEXT: ; %bb.1: +; HSA-NOTRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3] +; HSA-NOTRAP-GFX900-NEXT: s_waitcnt vmcnt(0) +; HSA-NOTRAP-GFX900-NEXT: .LBB2_2: +; HSA-NOTRAP-GFX900-NEXT: s_endpgm +; +; HSA-TRAP-GFX1100-LABEL: trap_with_use_after: +; HSA-TRAP-GFX1100: ; %bb.0: +; HSA-TRAP-GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; HSA-TRAP-GFX1100-NEXT: v_mov_b32_e32 v0, 0 +; HSA-TRAP-GFX1100-NEXT: s_mov_b32 ttmp2, m0 +; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX1100-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; HSA-TRAP-GFX1100-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX1100-NEXT: s_trap 2 +; HSA-TRAP-GFX1100-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL) +; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX1100-NEXT: s_and_b32 s0, s0, 0x3ff +; HSA-TRAP-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; HSA-TRAP-GFX1100-NEXT: s_bitset1_b32 s0, 10 +; HSA-TRAP-GFX1100-NEXT: s_cmp_eq_u32 s0, s0 +; HSA-TRAP-GFX1100-NEXT: s_mov_b32 m0, s0 +; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_INTERRUPT) +; HSA-TRAP-GFX1100-NEXT: s_mov_b32 m0, ttmp2 +; HSA-TRAP-GFX1100-NEXT: s_cbranch_scc1 .LBB2_2 +; HSA-TRAP-GFX1100-NEXT: ; %bb.1: +; HSA-TRAP-GFX1100-NEXT: global_store_b32 v0, v1, s[2:3] dlc +; 
HSA-TRAP-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; HSA-TRAP-GFX1100-NEXT: s_nop 0 +; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; HSA-TRAP-GFX1100-NEXT: s_endpgm +; HSA-TRAP-GFX1100-NEXT: .LBB2_2: ; =>This Inner Loop Header: Depth=1 +; HSA-TRAP-GFX1100-NEXT: s_sethalt 5 +; HSA-TRAP-GFX1100-NEXT: s_branch .LBB2_2 +; +; HSA-TRAP-GFX1100-O0-LABEL: trap_with_use_after: +; HSA-TRAP-GFX1100-O0: ; %bb.0: +; HSA-TRAP-GFX1100-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane +; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v0, 0 +; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v0, off offset:8 ; 4-byte Folded Spill +; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX1100-O0-NEXT: v_writelane_b32 v1, s2, 0 +; HSA-TRAP-GFX1100-O0-NEXT: v_writelane_b32 v1, s3, 1 +; HSA-TRAP-GFX1100-O0-NEXT: s_or_saveexec_b32 s6, -1 +; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v1, off offset:4 ; 4-byte Folded Spill +; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 exec_lo, s6 +; HSA-TRAP-GFX1100-O0-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v0, off ; 4-byte Folded Spill +; HSA-TRAP-GFX1100-O0-NEXT: s_trap 2 +; HSA-TRAP-GFX1100-O0-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL) +; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 ttmp2, m0 +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX1100-O0-NEXT: s_and_b32 s0, s0, 0x3ff +; HSA-TRAP-GFX1100-O0-NEXT: s_or_b32 s0, s0, 0x400 +; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 m0, s0 +; HSA-TRAP-GFX1100-O0-NEXT: s_sendmsg sendmsg(MSG_INTERRUPT) +; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 m0, ttmp2 +; HSA-TRAP-GFX1100-O0-NEXT: s_cmp_eq_u32 s0, s0 +; HSA-TRAP-GFX1100-O0-NEXT: s_cbranch_scc1 .LBB2_2 +; HSA-TRAP-GFX1100-O0-NEXT: ; %bb.1: +; HSA-TRAP-GFX1100-O0-NEXT: s_or_saveexec_b32 s6, -1 +; 
HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v0, off, off offset:4 ; 4-byte Folded Reload +; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 exec_lo, s6 +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX1100-O0-NEXT: v_readlane_b32 s0, v0, 0 +; HSA-TRAP-GFX1100-O0-NEXT: v_readlane_b32 s1, v0, 1 +; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v1, off, off offset:8 ; 4-byte Folded Reload +; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v2, off, off ; 4-byte Folded Reload +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v1, v2, s[0:1] dlc +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0 +; HSA-TRAP-GFX1100-O0-NEXT: ; kill: killed $vgpr0 +; HSA-TRAP-GFX1100-O0-NEXT: s_endpgm +; HSA-TRAP-GFX1100-O0-NEXT: .LBB2_2: ; =>This Inner Loop Header: Depth=1 +; HSA-TRAP-GFX1100-O0-NEXT: s_sethalt 5 +; HSA-TRAP-GFX1100-O0-NEXT: s_branch .LBB2_2 + %tmp = load volatile i32, ptr addrspace(1) %arg0 + call void @llvm.trap() + store volatile i32 %tmp, ptr addrspace(1) %arg1 + ret void +} + define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) { ; NOHSA-TRAP-GFX900-LABEL: debugtrap: ; NOHSA-TRAP-GFX900: ; %bb.0: @@ -334,6 +470,20 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) ; HSA-TRAP-GFX1100-NEXT: s_nop 0 ; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; HSA-TRAP-GFX1100-NEXT: s_endpgm +; +; HSA-TRAP-GFX1100-O0-LABEL: debugtrap: +; HSA-TRAP-GFX1100-O0: ; %bb.0: +; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v0, 0 +; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v1, 1 +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0 +; HSA-TRAP-GFX1100-O0-NEXT: s_trap 3 +; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v1, 2 +; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; HSA-TRAP-GFX1100-O0-NEXT: 
s_waitcnt_vscnt null, 0x0 +; HSA-TRAP-GFX1100-O0-NEXT: s_endpgm store volatile i32 1, ptr addrspace(1) %arg0 call void @llvm.debugtrap() store volatile i32 2, ptr addrspace(1) %arg0 From d8bf2096de6b96f633f93b3fe2f05c4e534bb619 Mon Sep 17 00:00:00 2001 From: Emma Pilkington Date: Fri, 10 May 2024 11:55:05 -0400 Subject: [PATCH 2/3] Address review comments --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 39 ++++++++++--------- .../AMDGPU/GlobalISel/legalize-trap-gfx11.mir | 18 ++++----- 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 17e235291e37b..b583e3c078715 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2033,10 +2033,6 @@ MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI, MachineInstr &MI, const DebugLoc &DL) const { MachineFunction *MF = MBB.getParent(); - MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false); - MachineBasicBlock *HaltLoop = MF->CreateMachineBasicBlock(); - MF->push_back(HaltLoop); - constexpr unsigned DoorbellIDMask = 0x3ff; constexpr unsigned ECQueueWaveAbort = 0x400; @@ -2066,27 +2062,32 @@ MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI, BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0) .addUse(AMDGPU::TTMP2); - if (MBB.succ_empty()) { - BuildMI(MBB, MI, DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoop); - } else { - // HACK: There are some instructions following the trap. Since uses of - // virtual registers in SplitBB (or beyond) that were defined before the - // trap must be dominated by their definitions, we need SplitBB to be a - // successor (even though it's unreachable in practice). This needs to be - // represented by a dummy cmp_eq and cbranch to convince analyzeBranch that - // SplitBB should indeed be considered a successor. 
- BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_EQ_U32)) - .addUse(SetWaveAbortBit) - .addUse(SetWaveAbortBit); - BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_SCC1)).addMBB(HaltLoop); - } + MachineBasicBlock *HaltLoop = MF->CreateMachineBasicBlock(); + MF->push_back(HaltLoop); + HaltLoop->addSuccessor(HaltLoop); BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5); BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_BRANCH)) .addMBB(HaltLoop); + if (MBB.succ_empty() && std::next(MI.getIterator()) == MBB.end()) { + BuildMI(MBB, MI, DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoop); + MBB.addSuccessor(HaltLoop); + return &MBB; + } + + // HACK: There are some instructions/successors following the trap. Since uses + // of virtual registers after the trap that were defined before the trap must + // be dominated by their definitions, we need the uses to be successors (even + // though they're unreachable in practice). This needs to be represented by a + // dummy cmp_eq and cbranch to convince analyzeBranch that SplitBB should + // indeed be considered a successor. 
+ MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false); + BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_EQ_U32)) + .addUse(SetWaveAbortBit) + .addUse(SetWaveAbortBit); + BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_SCC1)).addMBB(HaltLoop); MBB.addSuccessor(HaltLoop); - HaltLoop->addSuccessor(HaltLoop); return SplitBB; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir index e217a8be9597d..532866ca49996 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir @@ -1,14 +1,14 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 -# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1100 %s -# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx11-generic --amdhsa-code-object-version=6 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1100 %s -# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1150 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1150 %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -o - -run-pass=legalizer %s -verify-machineinstrs | FileCheck -check-prefix=GFX1100 %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx11-generic --amdhsa-code-object-version=6 -o - -run-pass=legalizer %s -verify-machineinstrs | FileCheck -check-prefix=GFX1100 %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1150 -o - -run-pass=legalizer %s -verify-machineinstrs | FileCheck -check-prefix=GFX1150 %s --- name: test_trap body: | bb.0: ; GFX1100-LABEL: name: test_trap - ; GFX1100: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX1100: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; GFX1100-NEXT: {{ $}} ; GFX1100-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX1100-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0 @@ -22,18 +22,18 @@ body: | ; GFX1100-NEXT: S_SENDMSG 1, implicit $exec, implicit $m0 
; GFX1100-NEXT: $m0 = S_MOV_B32 $ttmp2 ; GFX1100-NEXT: S_CMP_EQ_U32 [[S_OR_B32_]], [[S_OR_B32_]], implicit-def $scc - ; GFX1100-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GFX1100-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc ; GFX1100-NEXT: {{ $}} - ; GFX1100-NEXT: .1: + ; GFX1100-NEXT: .2: ; GFX1100-NEXT: successors: ; GFX1100-NEXT: {{ $}} ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1) ; GFX1100-NEXT: {{ $}} - ; GFX1100-NEXT: .2: - ; GFX1100-NEXT: successors: %bb.2(0x80000000) + ; GFX1100-NEXT: .1: + ; GFX1100-NEXT: successors: %bb.1(0x80000000) ; GFX1100-NEXT: {{ $}} ; GFX1100-NEXT: S_SETHALT 5 - ; GFX1100-NEXT: S_BRANCH %bb.2 + ; GFX1100-NEXT: S_BRANCH %bb.1 ; ; GFX1150-LABEL: name: test_trap ; GFX1150: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 From 8b1ce80134dd2c60464d01e5d168f791af1a524b Mon Sep 17 00:00:00 2001 From: Emma Pilkington Date: Mon, 13 May 2024 11:52:49 -0400 Subject: [PATCH 3/3] Address review comments --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 64 +++++++++---------- .../AMDGPU/GlobalISel/legalize-trap-gfx11.mir | 50 +++++++++------ llvm/test/CodeGen/AMDGPU/trap-abis.ll | 48 +++++++------- 3 files changed, 82 insertions(+), 80 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index b583e3c078715..7dd8a2a47e0e1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2036,60 +2036,54 @@ MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI, constexpr unsigned DoorbellIDMask = 0x3ff; constexpr unsigned ECQueueWaveAbort = 0x400; + MachineBasicBlock *TrapBB = &MBB; + MachineBasicBlock *ContBB = &MBB; + MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock(); + + if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) { + ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false); + TrapBB = MF->CreateMachineBasicBlock(); + BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB); + 
MF->push_back(TrapBB); + MBB.addSuccessor(TrapBB); + } + // Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround this // will be a nop. - BuildMI(MBB, MI, DL, get(AMDGPU::S_TRAP)) + BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP)) .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(MBB, MI, DL, get(AMDGPU::S_SENDMSG_RTN_B32), DoorbellReg) + BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32), + DoorbellReg) .addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL); - BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2) + BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2) .addUse(AMDGPU::M0); Register DoorbellRegMasked = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(MBB, MI, DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked) + BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked) .addUse(DoorbellReg) .addImm(DoorbellIDMask); Register SetWaveAbortBit = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(MBB, MI, DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit) + BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit) .addUse(DoorbellRegMasked) .addImm(ECQueueWaveAbort); - BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0) + BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0) .addUse(SetWaveAbortBit); - BuildMI(MBB, MI, DL, get(AMDGPU::S_SENDMSG)) + BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG)) .addImm(AMDGPU::SendMsg::ID_INTERRUPT); - BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0) + BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0) .addUse(AMDGPU::TTMP2); + BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB); + TrapBB->addSuccessor(HaltLoopBB); - MachineBasicBlock *HaltLoop = MF->CreateMachineBasicBlock(); - MF->push_back(HaltLoop); - 
HaltLoop->addSuccessor(HaltLoop); - - BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5); - BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_BRANCH)) - .addMBB(HaltLoop); - - if (MBB.succ_empty() && std::next(MI.getIterator()) == MBB.end()) { - BuildMI(MBB, MI, DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoop); - MBB.addSuccessor(HaltLoop); - return &MBB; - } - - // HACK: There are some instructions/successors following the trap. Since uses - // of virtual registers after the trap that were defined before the trap must - // be dominated by their definitions, we need the uses to be successors (even - // though they're unreachable in practice). This needs to be represented by a - // dummy cmp_eq and cbranch to convince analyzeBranch that SplitBB should - // indeed be considered a successor. - MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false); - BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_EQ_U32)) - .addUse(SetWaveAbortBit) - .addUse(SetWaveAbortBit); - BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_SCC1)).addMBB(HaltLoop); - MBB.addSuccessor(HaltLoop); + BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5); + BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH)) + .addMBB(HaltLoopBB); + MF->push_back(HaltLoopBB); + HaltLoopBB->addSuccessor(HaltLoopBB); - return SplitBB; + return ContBB; } unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir index 532866ca49996..e3d31c702482f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir @@ -8,11 +8,21 @@ name: test_trap body: | bb.0: ; GFX1100-LABEL: name: test_trap - ; GFX1100: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GFX1100: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; GFX1100-NEXT: {{ $}} ; GFX1100-NEXT: 
[[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX1100-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0 ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1) + ; GFX1100-NEXT: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GFX1100-NEXT: {{ $}} + ; GFX1100-NEXT: .1: + ; GFX1100-NEXT: successors: + ; GFX1100-NEXT: {{ $}} + ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1) + ; GFX1100-NEXT: {{ $}} + ; GFX1100-NEXT: .2: + ; GFX1100-NEXT: successors: %bb.3(0x80000000) + ; GFX1100-NEXT: {{ $}} ; GFX1100-NEXT: S_TRAP 2 ; GFX1100-NEXT: [[S_SENDMSG_RTN_B32_:%[0-9]+]]:sreg_32 = S_SENDMSG_RTN_B32 128 ; GFX1100-NEXT: $ttmp2 = S_MOV_B32 $m0 @@ -21,19 +31,13 @@ body: | ; GFX1100-NEXT: $m0 = S_MOV_B32 [[S_OR_B32_]] ; GFX1100-NEXT: S_SENDMSG 1, implicit $exec, implicit $m0 ; GFX1100-NEXT: $m0 = S_MOV_B32 $ttmp2 - ; GFX1100-NEXT: S_CMP_EQ_U32 [[S_OR_B32_]], [[S_OR_B32_]], implicit-def $scc - ; GFX1100-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc - ; GFX1100-NEXT: {{ $}} - ; GFX1100-NEXT: .2: - ; GFX1100-NEXT: successors: + ; GFX1100-NEXT: S_BRANCH %bb.3 ; GFX1100-NEXT: {{ $}} - ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1) - ; GFX1100-NEXT: {{ $}} - ; GFX1100-NEXT: .1: - ; GFX1100-NEXT: successors: %bb.1(0x80000000) + ; GFX1100-NEXT: .3: + ; GFX1100-NEXT: successors: %bb.3(0x80000000) ; GFX1100-NEXT: {{ $}} ; GFX1100-NEXT: S_SETHALT 5 - ; GFX1100-NEXT: S_BRANCH %bb.1 + ; GFX1100-NEXT: S_BRANCH %bb.3 ; ; GFX1150-LABEL: name: test_trap ; GFX1150: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 @@ -58,6 +62,16 @@ body: | ; GFX1100-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX1100-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0 ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1) + ; GFX1100-NEXT: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GFX1100-NEXT: {{ $}} + ; GFX1100-NEXT: bb.1: + ; GFX1100-NEXT: successors: + ; GFX1100-NEXT: {{ $}} + ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), 
addrspace 1) + ; GFX1100-NEXT: {{ $}} + ; GFX1100-NEXT: bb.2: + ; GFX1100-NEXT: successors: %bb.3(0x80000000) + ; GFX1100-NEXT: {{ $}} ; GFX1100-NEXT: S_TRAP 2 ; GFX1100-NEXT: [[S_SENDMSG_RTN_B32_:%[0-9]+]]:sreg_32 = S_SENDMSG_RTN_B32 128 ; GFX1100-NEXT: $ttmp2 = S_MOV_B32 $m0 @@ -66,19 +80,13 @@ body: | ; GFX1100-NEXT: $m0 = S_MOV_B32 [[S_OR_B32_]] ; GFX1100-NEXT: S_SENDMSG 1, implicit $exec, implicit $m0 ; GFX1100-NEXT: $m0 = S_MOV_B32 $ttmp2 - ; GFX1100-NEXT: S_CMP_EQ_U32 [[S_OR_B32_]], [[S_OR_B32_]], implicit-def $scc - ; GFX1100-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc - ; GFX1100-NEXT: {{ $}} - ; GFX1100-NEXT: bb.1: - ; GFX1100-NEXT: successors: + ; GFX1100-NEXT: S_BRANCH %bb.3 ; GFX1100-NEXT: {{ $}} - ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1) - ; GFX1100-NEXT: {{ $}} - ; GFX1100-NEXT: bb.2: - ; GFX1100-NEXT: successors: %bb.2(0x80000000) + ; GFX1100-NEXT: bb.3: + ; GFX1100-NEXT: successors: %bb.3(0x80000000) ; GFX1100-NEXT: {{ $}} ; GFX1100-NEXT: S_SETHALT 5 - ; GFX1100-NEXT: S_BRANCH %bb.2 + ; GFX1100-NEXT: S_BRANCH %bb.3 ; ; GFX1150-LABEL: name: test_fallthrough_trap ; GFX1150: bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll index cfc1b47671b07..7dce633e9186a 100644 --- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ -325,30 +325,30 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs ; HSA-TRAP-GFX1100: ; %bb.0: ; HSA-TRAP-GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; HSA-TRAP-GFX1100-NEXT: v_mov_b32_e32 v0, 0 -; HSA-TRAP-GFX1100-NEXT: s_mov_b32 ttmp2, m0 ; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX1100-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc ; HSA-TRAP-GFX1100-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX1100-NEXT: s_cbranch_execnz .LBB2_2 +; HSA-TRAP-GFX1100-NEXT: ; %bb.1: +; HSA-TRAP-GFX1100-NEXT: global_store_b32 v0, v1, s[2:3] dlc +; HSA-TRAP-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 
+; HSA-TRAP-GFX1100-NEXT: s_nop 0 +; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; HSA-TRAP-GFX1100-NEXT: s_endpgm +; HSA-TRAP-GFX1100-NEXT: .LBB2_2: ; HSA-TRAP-GFX1100-NEXT: s_trap 2 ; HSA-TRAP-GFX1100-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL) +; HSA-TRAP-GFX1100-NEXT: s_mov_b32 ttmp2, m0 ; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX1100-NEXT: s_and_b32 s0, s0, 0x3ff ; HSA-TRAP-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; HSA-TRAP-GFX1100-NEXT: s_bitset1_b32 s0, 10 -; HSA-TRAP-GFX1100-NEXT: s_cmp_eq_u32 s0, s0 ; HSA-TRAP-GFX1100-NEXT: s_mov_b32 m0, s0 ; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_INTERRUPT) ; HSA-TRAP-GFX1100-NEXT: s_mov_b32 m0, ttmp2 -; HSA-TRAP-GFX1100-NEXT: s_cbranch_scc1 .LBB2_2 -; HSA-TRAP-GFX1100-NEXT: ; %bb.1: -; HSA-TRAP-GFX1100-NEXT: global_store_b32 v0, v1, s[2:3] dlc -; HSA-TRAP-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; HSA-TRAP-GFX1100-NEXT: s_nop 0 -; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; HSA-TRAP-GFX1100-NEXT: s_endpgm -; HSA-TRAP-GFX1100-NEXT: .LBB2_2: ; =>This Inner Loop Header: Depth=1 +; HSA-TRAP-GFX1100-NEXT: .LBB2_3: ; =>This Inner Loop Header: Depth=1 ; HSA-TRAP-GFX1100-NEXT: s_sethalt 5 -; HSA-TRAP-GFX1100-NEXT: s_branch .LBB2_2 +; HSA-TRAP-GFX1100-NEXT: s_branch .LBB2_3 ; ; HSA-TRAP-GFX1100-O0-LABEL: trap_with_use_after: ; HSA-TRAP-GFX1100-O0: ; %bb.0: @@ -366,17 +366,7 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs ; HSA-TRAP-GFX1100-O0-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc ; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0) ; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v0, off ; 4-byte Folded Spill -; HSA-TRAP-GFX1100-O0-NEXT: s_trap 2 -; HSA-TRAP-GFX1100-O0-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL) -; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 ttmp2, m0 -; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0) -; HSA-TRAP-GFX1100-O0-NEXT: s_and_b32 s0, s0, 
0x3ff -; HSA-TRAP-GFX1100-O0-NEXT: s_or_b32 s0, s0, 0x400 -; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 m0, s0 -; HSA-TRAP-GFX1100-O0-NEXT: s_sendmsg sendmsg(MSG_INTERRUPT) -; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 m0, ttmp2 -; HSA-TRAP-GFX1100-O0-NEXT: s_cmp_eq_u32 s0, s0 -; HSA-TRAP-GFX1100-O0-NEXT: s_cbranch_scc1 .LBB2_2 +; HSA-TRAP-GFX1100-O0-NEXT: s_cbranch_execnz .LBB2_2 ; HSA-TRAP-GFX1100-O0-NEXT: ; %bb.1: ; HSA-TRAP-GFX1100-O0-NEXT: s_or_saveexec_b32 s6, -1 ; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v0, off, off offset:4 ; 4-byte Folded Reload @@ -391,9 +381,19 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs ; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0 ; HSA-TRAP-GFX1100-O0-NEXT: ; kill: killed $vgpr0 ; HSA-TRAP-GFX1100-O0-NEXT: s_endpgm -; HSA-TRAP-GFX1100-O0-NEXT: .LBB2_2: ; =>This Inner Loop Header: Depth=1 +; HSA-TRAP-GFX1100-O0-NEXT: .LBB2_2: +; HSA-TRAP-GFX1100-O0-NEXT: s_trap 2 +; HSA-TRAP-GFX1100-O0-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL) +; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 ttmp2, m0 +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX1100-O0-NEXT: s_and_b32 s0, s0, 0x3ff +; HSA-TRAP-GFX1100-O0-NEXT: s_or_b32 s0, s0, 0x400 +; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 m0, s0 +; HSA-TRAP-GFX1100-O0-NEXT: s_sendmsg sendmsg(MSG_INTERRUPT) +; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 m0, ttmp2 +; HSA-TRAP-GFX1100-O0-NEXT: .LBB2_3: ; =>This Inner Loop Header: Depth=1 ; HSA-TRAP-GFX1100-O0-NEXT: s_sethalt 5 -; HSA-TRAP-GFX1100-O0-NEXT: s_branch .LBB2_2 +; HSA-TRAP-GFX1100-O0-NEXT: s_branch .LBB2_3 %tmp = load volatile i32, ptr addrspace(1) %arg0 call void @llvm.trap() store volatile i32 %tmp, ptr addrspace(1) %arg1