diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
index d0043bcc920b6..4802ed4bb53df 100644
--- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
+++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
@@ -328,7 +328,9 @@ bool AMDGPUCustomBehaviour::isGWS(uint16_t Opcode) const {
 
 // taken from SIInstrInfo::isAlwaysGDS()
 bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
-  return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
+  return Opcode == AMDGPU::DS_ORDERED_COUNT ||
+         Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
+         Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
 }
 
 } // namespace llvm::mca
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 239f2664f59f3..213cc36b4381c 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -349,6 +349,16 @@ class WaitcntBrackets {
     LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
   }
 
+  bool hasPendingGDS() const {
+    return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
+  }
+
+  unsigned getPendingGDSWait() const {
+    return std::min(getScoreUB(DS_CNT) - LastGDS, getWaitCountMax(DS_CNT) - 1);
+  }
+
+  void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
+
   // Return true if there might be pending writes to the vgpr-interval by VMEM
   // instructions with types different from V.
   bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const {
@@ -427,6 +437,8 @@ class WaitcntBrackets {
   unsigned PendingEvents = 0;
   // Remember the last flat memory operation.
   unsigned LastFlat[NUM_INST_CNTS] = {0};
+  // Remember the last GDS operation.
+  unsigned LastGDS = 0;
   // wait_cnt scores for every vgpr.
   // Keep track of the VgprUB and SgprUB to make merge at join efficient.
   int VgprUB = -1;
@@ -729,6 +741,10 @@ class SIInsertWaitcnts : public MachineFunctionPass {
                                MachineInstr *OldWaitcntInstr);
   void updateEventWaitcntAfter(MachineInstr &Inst,
                                WaitcntBrackets *ScoreBrackets);
+  bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
+                    MachineBasicBlock *Block) const;
+  bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
+                             WaitcntBrackets &ScoreBrackets);
   bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
                             WaitcntBrackets &ScoreBrackets);
 };
@@ -1678,6 +1694,11 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
       }
     }
 
+    // Wait for any pending GDS instruction to complete before any
+    // "Always GDS" instruction.
+    if (TII->isAlwaysGDS(MI.getOpcode()) && ScoreBrackets.hasPendingGDS())
+      addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());
+
     if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
       // The function is going to insert a wait on everything in its prolog.
       // This still needs to be careful if the call target is a load (e.g. a GOT
@@ -1982,6 +2003,64 @@ static bool isCacheInvOrWBInst(MachineInstr &Inst) {
          Opc == AMDGPU::GLOBAL_WBINV;
 }
 
+// Return true if the next instruction is S_ENDPGM, following fallthrough
+// blocks if necessary.
+bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
+                                    MachineBasicBlock *Block) const {
+  auto BlockEnd = Block->getParent()->end();
+  auto BlockIter = Block->getIterator();
+
+  while (true) {
+    if (It.isEnd()) {
+      if (++BlockIter != BlockEnd) {
+        It = BlockIter->instr_begin();
+        continue;
+      }
+
+      return false;
+    }
+
+    if (!It->isMetaInstruction())
+      break;
+
+    It++;
+  }
+
+  assert(!It.isEnd());
+
+  return It->getOpcode() == AMDGPU::S_ENDPGM;
+}
+
+// Add a wait after an instruction if architecture requirements mandate one.
+bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
+                                             MachineBasicBlock &Block,
+                                             WaitcntBrackets &ScoreBrackets) {
+  AMDGPU::Waitcnt Wait;
+  bool NeedsEndPGMCheck = false;
+
+  if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
+    Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&
+                                  !SIInstrInfo::isAtomicRet(Inst));
+
+  if (TII->isAlwaysGDS(Inst.getOpcode())) {
+    Wait.DsCnt = 0;
+    NeedsEndPGMCheck = true;
+  }
+
+  ScoreBrackets.simplifyWaitcnt(Wait);
+
+  auto SuccessorIt = std::next(Inst.getIterator());
+  bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
+                                /*OldWaitcntInstr=*/nullptr);
+
+  if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
+    BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII->get(AMDGPU::S_NOP))
+        .addImm(0);
+  }
+
+  return Result;
+}
+
 void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                                WaitcntBrackets *ScoreBrackets) {
   // Now look at the instruction opcode. If it is a memory access
@@ -1994,6 +2073,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
         TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
+      ScoreBrackets->setPendingGDS();
     } else {
       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
     }
@@ -2124,6 +2204,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
 
     StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
 
+    if (T == DS_CNT)
+      StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
+
     for (int J = 0; J <= VgprUB; J++)
       StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
 
@@ -2249,13 +2332,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 
     updateEventWaitcntAfter(Inst, &ScoreBrackets);
 
-    if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
-      AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt(
-          Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst));
-      ScoreBrackets.simplifyWaitcnt(Wait);
-      Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
-                                  ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
-    }
+    Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);
 
     LLVM_DEBUG({
       Inst.print(dbgs());
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 1e025f481ffa9..e383f6f2aef48 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4239,7 +4239,9 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
 }
 
 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
-  return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
+  return Opcode == AMDGPU::DS_ORDERED_COUNT ||
+         Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
+         Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
 }
 
 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
diff --git a/llvm/test/CodeGen/AMDGPU/force-wait-after-always-gds.mir b/llvm/test/CodeGen/AMDGPU/force-wait-after-always-gds.mir
new file mode 100644
index 0000000000000..c5b8491e52a41
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/force-wait-after-always-gds.mir
@@ -0,0 +1,26 @@
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+# GCN-LABEL: name: test_ordered_count
+# GCN: bb.0
+# GCN: DS_ADD_U32
+# GCN: DS_SUB_U32
+# GCN-NEXT: S_WAITCNT 64535
+# GCN-NEXT: $vgpr3 = DS_ORDERED_COUNT
+# GCN-NEXT: S_WAITCNT 64519
+# GCN-NEXT: $vgpr4_vgpr5 = DS_ADD_GS_REG_RTN
+# GCN-NEXT: S_WAITCNT 64519
+# GCN-NEXT: S_NOP 0
+
+name: test_ordered_count
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    DS_ADD_U32 $vgpr1, $vgpr2, 12, -1, implicit $m0, implicit $exec :: (load store (s32), addrspace 3)
+    DS_SUB_U32 $vgpr1, $vgpr2, 12, 0, implicit $m0, implicit $exec :: (load store (s32), addrspace 2)
+    $vgpr3 = DS_ORDERED_COUNT $vgpr0, 772, implicit $m0, implicit $exec :: (load store (s32), addrspace 3)
+    $vgpr4_vgpr5 = DS_ADD_GS_REG_RTN $vgpr0, 32, implicit $m0, implicit $exec :: (load store (s32), addrspace 3)
+    S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll
index 081727c3b5e14..9aedaaec94e7e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll
@@ -9,6 +9,8 @@ define amdgpu_gs void @test_add_32(i32 %arg) {
 ; CHECK-LABEL: test_add_32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    ds_add_gs_reg_rtn v[0:1], v0 offset:16 gds
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    s_endpgm
   %unused = call i32 @llvm.amdgcn.ds.add.gs.reg.rtn.i32(i32 %arg, i32 16)
   ret void
@@ -30,6 +32,8 @@ define amdgpu_gs void @test_add_64(i32 %arg) {
 ; CHECK-LABEL: test_add_64:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    ds_add_gs_reg_rtn v[0:1], v0 offset:32 gds
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    s_endpgm
   %unused = call i64 @llvm.amdgcn.ds.add.gs.reg.rtn.i64(i32 %arg, i32 32)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll
index c9f089e248e76..234014fac9f5e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll
@@ -26,11 +26,11 @@ define amdgpu_cs float @ds_ordered_swap(ptr addrspace(2) inreg %gds, i32 %value)
 ; GCN: s_mov_b32 m0, s0
 ; VIGFX9-NEXT: s_nop 0
 ; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v[[VALUE]] offset:4868 gds
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: [[BB]]:
 ; // Wait for expcnt(0) before modifying EXEC
 ; GCN-NEXT: s_waitcnt expcnt(0)
 ; GCN-NEXT: s_or_b64 exec, exec, s[[SAVED]]
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
 define amdgpu_cs float @ds_ordered_swap_conditional(ptr addrspace(2) inreg %gds, i32 %value) {
 entry:
   %c = icmp ne i32 %value, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll
index 63d4bac20b354..bb1c4604dd9d2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll
@@ -9,6 +9,8 @@ define amdgpu_gs void @test_sub_32(i32 %arg) {
 ; CHECK-LABEL: test_sub_32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    ds_sub_gs_reg_rtn v[0:1], v0 offset:16 gds
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    s_endpgm
   %unused = call i32 @llvm.amdgcn.ds.sub.gs.reg.rtn.i32(i32 %arg, i32 16)
   ret void
@@ -30,6 +32,8 @@ define amdgpu_gs void @test_sub_64(i32 %arg) {
 ; CHECK-LABEL: test_sub_64:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    ds_sub_gs_reg_rtn v[0:1], v0 offset:32 gds
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    s_endpgm
   %unused = call i64 @llvm.amdgcn.ds.sub.gs.reg.rtn.i64(i32 %arg, i32 32)
   ret void
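Note: to make the new WaitcntBrackets bookkeeping concrete, here is a minimal standalone C++ sketch (not part of the patch). A GDS operation is stamped with the current DS_CNT score upper bound, hasPendingGDS() asks whether that stamp still lies inside the interval of outstanding events, and getPendingGDSWait() converts the stamp into the counter value to wait for. The GDSBracketModel type, its method names, and the WaitCountMax value of 64 are illustrative assumptions, not the pass's real interfaces; like the patch, the model assumes DS events retire in issue order.

// Standalone sketch of the pending-GDS score-bracket logic (assumptions
// noted above; compiles as C++17).
#include <algorithm>
#include <cassert>
#include <cstdio>

struct GDSBracketModel {
  unsigned ScoreLB = 0; // all DS_CNT events scored <= this have completed
  unsigned ScoreUB = 0; // score given to the newest DS_CNT event issued
  unsigned LastGDS = 0; // score stamped on the most recent GDS operation
  static constexpr unsigned WaitCountMax = 64; // assumed DS_CNT capacity

  void recordDSEvent() { ++ScoreUB; }     // like updateByEvent() scoring
  void recordGDS() { LastGDS = ScoreUB; } // like setPendingGDS()

  // Like hasPendingGDS(): the stamp lies in (ScoreLB, ScoreUB], meaning
  // the GDS op has been issued but not yet proven complete.
  bool hasPendingGDS() const {
    return LastGDS > ScoreLB && LastGDS <= ScoreUB;
  }

  // Like getPendingGDSWait(): the number of DS events issued after the
  // GDS op; once DS_CNT drains to this value, in-order completion means
  // the GDS op is done. Clamped since a wait for >= WaitCountMax is moot.
  unsigned pendingGDSWait() const {
    return std::min(ScoreUB - LastGDS, WaitCountMax - 1);
  }
};

int main() {
  GDSBracketModel B;
  B.recordDSEvent(); // an unrelated LDS operation
  B.recordDSEvent(); // the GDS operation itself bumps the score...
  B.recordGDS();     // ...and is stamped with the new upper bound
  B.recordDSEvent(); // one more LDS operation issued afterwards
  assert(B.hasPendingGDS());
  std::printf("wait for DS_CNT <= %u\n", B.pendingGDSWait()); // prints 1
}

Waiting only until DS_CNT reaches that count, rather than zero, is what lets generateWaitcntInstBefore avoid over-waiting when independent LDS traffic was issued after the GDS operation.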