Skip to content

Commit 8662397

Browse files
authored
merge main into amd-staging (llvm#4592)
2 parents abac7bf + d4cedc5 commit 8662397

File tree

9 files changed

+216
-45
lines changed

9 files changed

+216
-45
lines changed
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
//===--- AMDGPUBarrierLatency.cpp - AMDGPU Barrier Latency ----------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
/// \file This file contains a DAG scheduling mutation to add latency to
10+
/// barrier edges between ATOMIC_FENCE instructions and preceding
11+
/// memory loads potentially affected by the fence.
12+
/// This encourages the scheduling of more instructions before
13+
/// ATOMIC_FENCE instructions. ATOMIC_FENCE instructions may
14+
/// introduce wait counting or indicate an impending S_BARRIER
15+
/// wait. Having more instructions in-flight across these
16+
/// constructs improves latency hiding.
17+
//
18+
//===----------------------------------------------------------------------===//
19+
20+
#include "AMDGPUBarrierLatency.h"
21+
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22+
#include "SIInstrInfo.h"
23+
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
24+
25+
using namespace llvm;
26+
27+
namespace {
28+
29+
class BarrierLatency : public ScheduleDAGMutation {
30+
public:
31+
BarrierLatency() = default;
32+
void apply(ScheduleDAGInstrs *DAG) override;
33+
};
34+
35+
/// Adds a fixed synthetic latency to every barrier edge that runs from a
/// load-only instruction into an ATOMIC_FENCE.
///
/// Neither the scope of the fence nor any other property of the predecessor
/// (beyond "may load and may not store") is taken into account.
///
/// \param DAG the scheduling DAG whose SUnits are inspected and whose edge
///            latencies are updated in place.
void BarrierLatency::apply(ScheduleDAGInstrs *DAG) {
  constexpr unsigned SyntheticLatency = 2000;

  for (SUnit &FenceSU : DAG->SUnits) {
    if (FenceSU.getInstr()->getOpcode() != AMDGPU::ATOMIC_FENCE)
      continue;

    for (SDep &IncomingDep : FenceSU.Preds) {
      if (!IncomingDep.isBarrier())
        continue;

      SUnit *LoadSU = IncomingDep.getSUnit();
      const MachineInstr *PredMI = LoadSU->getInstr();

      // Only predecessors that read memory without writing it qualify.
      const bool IsLoadOnly = PredMI->mayLoad() && !PredMI->mayStore();
      if (!IsLoadOnly)
        continue;

      // The same dependence is also recorded in the predecessor's successor
      // list. Build the edge as it looks from that side (before touching any
      // latency) so the matching entry can be located and kept in sync.
      SDep OutgoingKey = IncomingDep;
      OutgoingKey.setSUnit(&FenceSU);
      for (SDep &OutgoingDep : LoadSU->Succs) {
        if (OutgoingDep == OutgoingKey) {
          OutgoingDep.setLatency(OutgoingDep.getLatency() + SyntheticLatency);
          break;
        }
      }

      IncomingDep.setLatency(IncomingDep.getLatency() + SyntheticLatency);

      // Invalidate cached depths on both endpoints of the modified edge.
      // NOTE(review): the modified forward edge lives in LoadSU->Succs, so
      // LoadSU's height may be the cached value that actually changes --
      // confirm whether setHeightDirty() was intended for LoadSU.
      LoadSU->setDepthDirty();
      FenceSU.setDepthDirty();
    }
  }
}
67+
68+
} // end namespace
69+
70+
std::unique_ptr<ScheduleDAGMutation>
71+
llvm::createAMDGPUBarrierLatencyDAGMutation() {
72+
return std::make_unique<BarrierLatency>();
73+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
//===- AMDGPUBarrierLatency.h - AMDGPU Barrier Latency ----------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H
10+
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H
11+
12+
#include "llvm/CodeGen/ScheduleDAGMutation.h"
13+
#include <memory>
14+
15+
namespace llvm {
16+
17+
/// Creates the scheduling DAG mutation that adds synthetic latency to barrier
/// edges between ATOMIC_FENCE instructions and the preceding load-only
/// instructions. The caller owns the returned mutation.
std::unique_ptr<ScheduleDAGMutation> createAMDGPUBarrierLatencyDAGMutation();
18+
19+
} // namespace llvm
20+
21+
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "AMDGPUTargetMachine.h"
1818
#include "AMDGPU.h"
1919
#include "AMDGPUAliasAnalysis.h"
20+
#include "AMDGPUBarrierLatency.h"
2021
#include "AMDGPUCtorDtorLowering.h"
2122
#include "AMDGPUExportClustering.h"
2223
#include "AMDGPUExportKernelRuntimeHandles.h"
@@ -640,6 +641,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
640641
DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
641642
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
642643
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
644+
DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
643645
return DAG;
644646
}
645647

@@ -660,6 +662,7 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
660662
if (ST.shouldClusterStores())
661663
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
662664
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
665+
DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
663666
return DAG;
664667
}
665668

@@ -1229,6 +1232,7 @@ GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
12291232
EnableVOPD)
12301233
DAG->addMutation(createVOPDPairingMutation());
12311234
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
1235+
DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
12321236
return DAG;
12331237
}
12341238
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ add_llvm_target(AMDGPUCodeGen
5252
AMDGPUAsmPrinter.cpp
5353
AMDGPUAtomicOptimizer.cpp
5454
AMDGPUAttributor.cpp
55+
AMDGPUBarrierLatency.cpp
5556
AMDGPUCallLowering.cpp
5657
AMDGPUCodeGenPrepare.cpp
5758
AMDGPUCombinerHelper.cpp

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24044,18 +24044,7 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
2404424044
}
2404524045
}
2404624046

24047-
std::pair<Register, const TargetRegisterClass *> Res =
24048-
TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
24049-
24050-
// If we picked one of the Zfinx register classes, remap it to the GPR class.
24051-
// FIXME: When Zfinx is supported in CodeGen this will need to take the
24052-
// Subtarget into account.
24053-
if (Res.second == &RISCV::GPRF16RegClass ||
24054-
Res.second == &RISCV::GPRF32RegClass ||
24055-
Res.second == &RISCV::GPRPairRegClass)
24056-
return std::make_pair(Res.first, &RISCV::GPRRegClass);
24057-
24058-
return Res;
24047+
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
2405924048
}
2406024049

2406124050
InlineAsm::ConstraintCode

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -147,14 +147,13 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
147147
; GFX8-OPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
148148
; GFX8-OPT-NEXT: v_mov_b32_e32 v2, 0
149149
; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0)
150-
; GFX8-OPT-NEXT: s_barrier
151-
; GFX8-OPT-NEXT: v_add_u32_e32 v1, vcc, v1, v1
152-
; GFX8-OPT-NEXT: s_nop 1
153-
; GFX8-OPT-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
154-
; GFX8-OPT-NEXT: v_add_u32_e32 v2, vcc, v2, v1
155-
; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s1
150+
; GFX8-OPT-NEXT: v_add_u32_e32 v4, vcc, v1, v1
151+
; GFX8-OPT-NEXT: v_mov_b32_e32 v3, s1
156152
; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
157-
; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
153+
; GFX8-OPT-NEXT: v_mov_b32_dpp v2, v4 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
154+
; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
155+
; GFX8-OPT-NEXT: v_add_u32_e32 v2, vcc, v2, v4
156+
; GFX8-OPT-NEXT: s_barrier
158157
; GFX8-OPT-NEXT: flat_store_dword v[0:1], v2
159158
; GFX8-OPT-NEXT: s_endpgm
160159
;
@@ -194,14 +193,14 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
194193
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
195194
; GFX10-NEXT: v_mov_b32_e32 v2, 0
196195
; GFX10-NEXT: ds_read_b32 v1, v0
197-
; GFX10-NEXT: s_barrier
198-
; GFX10-NEXT: buffer_gl0_inv
199196
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
200197
; GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0
201-
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v1
202-
; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
203-
; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v1
198+
; GFX10-NEXT: v_add_nc_u32_e32 v3, v1, v1
204199
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
200+
; GFX10-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
201+
; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v3
202+
; GFX10-NEXT: s_barrier
203+
; GFX10-NEXT: buffer_gl0_inv
205204
; GFX10-NEXT: flat_store_dword v[0:1], v2
206205
; GFX10-NEXT: s_endpgm
207206
;
@@ -213,15 +212,15 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
213212
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
214213
; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0
215214
; GFX11-NEXT: ds_load_b32 v1, v0
216-
; GFX11-NEXT: s_barrier
217-
; GFX11-NEXT: buffer_gl0_inv
218215
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
219216
; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0
220-
; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v1
221-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
222-
; GFX11-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
223-
; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v1
217+
; GFX11-NEXT: v_add_nc_u32_e32 v3, v1, v1
224218
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
219+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
220+
; GFX11-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
221+
; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v3
222+
; GFX11-NEXT: s_barrier
223+
; GFX11-NEXT: buffer_gl0_inv
225224
; GFX11-NEXT: flat_store_b32 v[0:1], v2
226225
; GFX11-NEXT: s_endpgm
227226
bb:

0 commit comments

Comments
 (0)