diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 6599d0abd135c..7dd8a2a47e0e1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2033,50 +2033,57 @@ MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI, MachineInstr &MI, const DebugLoc &DL) const { MachineFunction *MF = MBB.getParent(); - MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false); - MachineBasicBlock *HaltLoop = MF->CreateMachineBasicBlock(); - MF->push_back(HaltLoop); - constexpr unsigned DoorbellIDMask = 0x3ff; constexpr unsigned ECQueueWaveAbort = 0x400; + MachineBasicBlock *TrapBB = &MBB; + MachineBasicBlock *ContBB = &MBB; + MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock(); + + if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) { + ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false); + TrapBB = MF->CreateMachineBasicBlock(); + BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB); + MF->push_back(TrapBB); + MBB.addSuccessor(TrapBB); + } + // Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround this // will be a nop. 
- BuildMI(MBB, MI, DL, get(AMDGPU::S_TRAP)) + BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP)) .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(MBB, MI, DL, get(AMDGPU::S_SENDMSG_RTN_B32), DoorbellReg) + BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32), + DoorbellReg) .addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL); - BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2) + BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2) .addUse(AMDGPU::M0); Register DoorbellRegMasked = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(MBB, MI, DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked) + BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked) .addUse(DoorbellReg) .addImm(DoorbellIDMask); Register SetWaveAbortBit = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(MBB, MI, DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit) + BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit) .addUse(DoorbellRegMasked) .addImm(ECQueueWaveAbort); - BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0) + BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0) .addUse(SetWaveAbortBit); - BuildMI(MBB, MI, DL, get(AMDGPU::S_SENDMSG)) + BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG)) .addImm(AMDGPU::SendMsg::ID_INTERRUPT); - BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0) + BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0) .addUse(AMDGPU::TTMP2); - BuildMI(MBB, MI, DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoop); - - BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5); - BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_BRANCH)) - .addMBB(HaltLoop); + BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB); + TrapBB->addSuccessor(HaltLoopBB); - if (SplitBB != &MBB) - 
MBB.removeSuccessor(SplitBB); - MBB.addSuccessor(HaltLoop); - HaltLoop->addSuccessor(HaltLoop); + BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5); + BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH)) + .addMBB(HaltLoopBB); + MF->push_back(HaltLoopBB); + HaltLoopBB->addSuccessor(HaltLoopBB); - return SplitBB; + return ContBB; } unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir index ac98dca00be3d..e3d31c702482f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir @@ -1,18 +1,28 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 -# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1100 %s -# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx11-generic --amdhsa-code-object-version=6 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1100 %s -# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1150 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1150 %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -o - -run-pass=legalizer %s -verify-machineinstrs | FileCheck -check-prefix=GFX1100 %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx11-generic --amdhsa-code-object-version=6 -o - -run-pass=legalizer %s -verify-machineinstrs | FileCheck -check-prefix=GFX1100 %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1150 -o - -run-pass=legalizer %s -verify-machineinstrs | FileCheck -check-prefix=GFX1150 %s --- name: test_trap body: | bb.0: ; GFX1100-LABEL: name: test_trap - ; GFX1100: successors: %bb.2(0x80000000) + ; GFX1100: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; GFX1100-NEXT: {{ $}} ; GFX1100-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX1100-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT 
i64 0 ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1) + ; GFX1100-NEXT: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GFX1100-NEXT: {{ $}} + ; GFX1100-NEXT: .1: + ; GFX1100-NEXT: successors: + ; GFX1100-NEXT: {{ $}} + ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1) + ; GFX1100-NEXT: {{ $}} + ; GFX1100-NEXT: .2: + ; GFX1100-NEXT: successors: %bb.3(0x80000000) + ; GFX1100-NEXT: {{ $}} ; GFX1100-NEXT: S_TRAP 2 ; GFX1100-NEXT: [[S_SENDMSG_RTN_B32_:%[0-9]+]]:sreg_32 = S_SENDMSG_RTN_B32 128 ; GFX1100-NEXT: $ttmp2 = S_MOV_B32 $m0 @@ -21,18 +31,13 @@ body: | ; GFX1100-NEXT: $m0 = S_MOV_B32 [[S_OR_B32_]] ; GFX1100-NEXT: S_SENDMSG 1, implicit $exec, implicit $m0 ; GFX1100-NEXT: $m0 = S_MOV_B32 $ttmp2 - ; GFX1100-NEXT: S_BRANCH %bb.2 - ; GFX1100-NEXT: {{ $}} - ; GFX1100-NEXT: .1: - ; GFX1100-NEXT: successors: + ; GFX1100-NEXT: S_BRANCH %bb.3 ; GFX1100-NEXT: {{ $}} - ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1) - ; GFX1100-NEXT: {{ $}} - ; GFX1100-NEXT: .2: - ; GFX1100-NEXT: successors: %bb.2(0x80000000) + ; GFX1100-NEXT: .3: + ; GFX1100-NEXT: successors: %bb.3(0x80000000) ; GFX1100-NEXT: {{ $}} ; GFX1100-NEXT: S_SETHALT 5 - ; GFX1100-NEXT: S_BRANCH %bb.2 + ; GFX1100-NEXT: S_BRANCH %bb.3 ; ; GFX1150-LABEL: name: test_trap ; GFX1150: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 @@ -45,5 +50,63 @@ body: | G_STORE %0, %1 :: (store 1, addrspace 1) G_TRAP G_STORE %0, %1 :: (store 1, addrspace 1) +... 
+ +--- +name: test_fallthrough_trap +body: | + ; GFX1100-LABEL: name: test_fallthrough_trap + ; GFX1100: bb.0: + ; GFX1100-NEXT: successors: %bb.1(0x80000000), %bb.2(0x00000000) + ; GFX1100-NEXT: {{ $}} + ; GFX1100-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX1100-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0 + ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1) + ; GFX1100-NEXT: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GFX1100-NEXT: {{ $}} + ; GFX1100-NEXT: bb.1: + ; GFX1100-NEXT: successors: + ; GFX1100-NEXT: {{ $}} + ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1) + ; GFX1100-NEXT: {{ $}} + ; GFX1100-NEXT: bb.2: + ; GFX1100-NEXT: successors: %bb.3(0x80000000) + ; GFX1100-NEXT: {{ $}} + ; GFX1100-NEXT: S_TRAP 2 + ; GFX1100-NEXT: [[S_SENDMSG_RTN_B32_:%[0-9]+]]:sreg_32 = S_SENDMSG_RTN_B32 128 + ; GFX1100-NEXT: $ttmp2 = S_MOV_B32 $m0 + ; GFX1100-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_SENDMSG_RTN_B32_]], 1023, implicit-def $scc + ; GFX1100-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], 1024, implicit-def $scc + ; GFX1100-NEXT: $m0 = S_MOV_B32 [[S_OR_B32_]] + ; GFX1100-NEXT: S_SENDMSG 1, implicit $exec, implicit $m0 + ; GFX1100-NEXT: $m0 = S_MOV_B32 $ttmp2 + ; GFX1100-NEXT: S_BRANCH %bb.3 + ; GFX1100-NEXT: {{ $}} + ; GFX1100-NEXT: bb.3: + ; GFX1100-NEXT: successors: %bb.3(0x80000000) + ; GFX1100-NEXT: {{ $}} + ; GFX1100-NEXT: S_SETHALT 5 + ; GFX1100-NEXT: S_BRANCH %bb.3 + ; + ; GFX1150-LABEL: name: test_fallthrough_trap + ; GFX1150: bb.0: + ; GFX1150-NEXT: successors: %bb.1(0x80000000) + ; GFX1150-NEXT: {{ $}} + ; GFX1150-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX1150-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0 + ; GFX1150-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1) + ; GFX1150-NEXT: S_TRAP 2 + ; GFX1150-NEXT: {{ $}} + ; GFX1150-NEXT: bb.1: + ; GFX1150-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1) + bb.0: + successors: 
%bb.1 + + %0:_(s8) = G_CONSTANT i8 0 + %1:_(p1) = G_CONSTANT i64 0 + G_STORE %0, %1 :: (store 1, addrspace 1) + G_TRAP + bb.1: + G_STORE %0, %1 :: (store 1, addrspace 1) ... diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll index dcc5fbd142c42..7dce633e9186a 100644 --- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ -264,6 +264,142 @@ ret: ret void } +define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; NOHSA-TRAP-GFX900-LABEL: trap_with_use_after: +; NOHSA-TRAP-GFX900: ; %bb.0: +; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 +; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) +; NOHSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc +; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0) +; NOHSA-TRAP-GFX900-NEXT: s_cbranch_execnz .LBB2_2 +; NOHSA-TRAP-GFX900-NEXT: ; %bb.1: +; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3] +; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0) +; NOHSA-TRAP-GFX900-NEXT: .LBB2_2: +; NOHSA-TRAP-GFX900-NEXT: s_endpgm +; +; HSA-TRAP-GFX803-LABEL: trap_with_use_after: +; HSA-TRAP-GFX803: ; %bb.0: +; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[4:5] +; HSA-TRAP-GFX803-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s4 +; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s5 +; HSA-TRAP-GFX803-NEXT: flat_load_dword v2, v[0:1] glc +; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s6 +; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s7 +; HSA-TRAP-GFX803-NEXT: s_trap 2 +; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2 +; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX803-NEXT: s_endpgm +; +; HSA-TRAP-GFX900-LABEL: trap_with_use_after: +; HSA-TRAP-GFX900: ; %bb.0: +; HSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; HSA-TRAP-GFX900-NEXT: 
v_mov_b32_e32 v0, 0 +; HSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc +; HSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX900-NEXT: s_trap 2 +; HSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3] +; HSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX900-NEXT: s_endpgm +; +; HSA-NOTRAP-GFX900-LABEL: trap_with_use_after: +; HSA-NOTRAP-GFX900: ; %bb.0: +; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 +; HSA-NOTRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) +; HSA-NOTRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc +; HSA-NOTRAP-GFX900-NEXT: s_waitcnt vmcnt(0) +; HSA-NOTRAP-GFX900-NEXT: s_cbranch_execnz .LBB2_2 +; HSA-NOTRAP-GFX900-NEXT: ; %bb.1: +; HSA-NOTRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3] +; HSA-NOTRAP-GFX900-NEXT: s_waitcnt vmcnt(0) +; HSA-NOTRAP-GFX900-NEXT: .LBB2_2: +; HSA-NOTRAP-GFX900-NEXT: s_endpgm +; +; HSA-TRAP-GFX1100-LABEL: trap_with_use_after: +; HSA-TRAP-GFX1100: ; %bb.0: +; HSA-TRAP-GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; HSA-TRAP-GFX1100-NEXT: v_mov_b32_e32 v0, 0 +; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX1100-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; HSA-TRAP-GFX1100-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX1100-NEXT: s_cbranch_execnz .LBB2_2 +; HSA-TRAP-GFX1100-NEXT: ; %bb.1: +; HSA-TRAP-GFX1100-NEXT: global_store_b32 v0, v1, s[2:3] dlc +; HSA-TRAP-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; HSA-TRAP-GFX1100-NEXT: s_nop 0 +; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; HSA-TRAP-GFX1100-NEXT: s_endpgm +; HSA-TRAP-GFX1100-NEXT: .LBB2_2: +; HSA-TRAP-GFX1100-NEXT: s_trap 2 +; HSA-TRAP-GFX1100-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL) +; HSA-TRAP-GFX1100-NEXT: s_mov_b32 ttmp2, m0 +; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX1100-NEXT: s_and_b32 s0, s0, 0x3ff +; HSA-TRAP-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(SALU_CYCLE_1) +; HSA-TRAP-GFX1100-NEXT: s_bitset1_b32 s0, 10 +; HSA-TRAP-GFX1100-NEXT: s_mov_b32 m0, s0 +; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_INTERRUPT) +; HSA-TRAP-GFX1100-NEXT: s_mov_b32 m0, ttmp2 +; HSA-TRAP-GFX1100-NEXT: .LBB2_3: ; =>This Inner Loop Header: Depth=1 +; HSA-TRAP-GFX1100-NEXT: s_sethalt 5 +; HSA-TRAP-GFX1100-NEXT: s_branch .LBB2_3 +; +; HSA-TRAP-GFX1100-O0-LABEL: trap_with_use_after: +; HSA-TRAP-GFX1100-O0: ; %bb.0: +; HSA-TRAP-GFX1100-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane +; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v0, 0 +; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v0, off offset:8 ; 4-byte Folded Spill +; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX1100-O0-NEXT: v_writelane_b32 v1, s2, 0 +; HSA-TRAP-GFX1100-O0-NEXT: v_writelane_b32 v1, s3, 1 +; HSA-TRAP-GFX1100-O0-NEXT: s_or_saveexec_b32 s6, -1 +; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v1, off offset:4 ; 4-byte Folded Spill +; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 exec_lo, s6 +; HSA-TRAP-GFX1100-O0-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v0, off ; 4-byte Folded Spill +; HSA-TRAP-GFX1100-O0-NEXT: s_cbranch_execnz .LBB2_2 +; HSA-TRAP-GFX1100-O0-NEXT: ; %bb.1: +; HSA-TRAP-GFX1100-O0-NEXT: s_or_saveexec_b32 s6, -1 +; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v0, off, off offset:4 ; 4-byte Folded Reload +; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 exec_lo, s6 +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX1100-O0-NEXT: v_readlane_b32 s0, v0, 0 +; HSA-TRAP-GFX1100-O0-NEXT: v_readlane_b32 s1, v0, 1 +; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v1, off, off offset:8 ; 4-byte Folded Reload +; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v2, off, off ; 4-byte Folded Reload +; 
HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v1, v2, s[0:1] dlc +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0 +; HSA-TRAP-GFX1100-O0-NEXT: ; kill: killed $vgpr0 +; HSA-TRAP-GFX1100-O0-NEXT: s_endpgm +; HSA-TRAP-GFX1100-O0-NEXT: .LBB2_2: +; HSA-TRAP-GFX1100-O0-NEXT: s_trap 2 +; HSA-TRAP-GFX1100-O0-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL) +; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 ttmp2, m0 +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX1100-O0-NEXT: s_and_b32 s0, s0, 0x3ff +; HSA-TRAP-GFX1100-O0-NEXT: s_or_b32 s0, s0, 0x400 +; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 m0, s0 +; HSA-TRAP-GFX1100-O0-NEXT: s_sendmsg sendmsg(MSG_INTERRUPT) +; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 m0, ttmp2 +; HSA-TRAP-GFX1100-O0-NEXT: .LBB2_3: ; =>This Inner Loop Header: Depth=1 +; HSA-TRAP-GFX1100-O0-NEXT: s_sethalt 5 +; HSA-TRAP-GFX1100-O0-NEXT: s_branch .LBB2_3 + %tmp = load volatile i32, ptr addrspace(1) %arg0 + call void @llvm.trap() + store volatile i32 %tmp, ptr addrspace(1) %arg1 + ret void +} + define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) { ; NOHSA-TRAP-GFX900-LABEL: debugtrap: ; NOHSA-TRAP-GFX900: ; %bb.0: @@ -334,6 +470,20 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) ; HSA-TRAP-GFX1100-NEXT: s_nop 0 ; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; HSA-TRAP-GFX1100-NEXT: s_endpgm +; +; HSA-TRAP-GFX1100-O0-LABEL: debugtrap: +; HSA-TRAP-GFX1100-O0: ; %bb.0: +; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v0, 0 +; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v1, 1 +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0 +; HSA-TRAP-GFX1100-O0-NEXT: s_trap 3 +; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v1, 2 +; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v0, 
v1, s[0:1] dlc +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0 +; HSA-TRAP-GFX1100-O0-NEXT: s_endpgm store volatile i32 1, ptr addrspace(1) %arg0 call void @llvm.debugtrap() store volatile i32 2, ptr addrspace(1) %arg0