Skip to content

Commit 441fb41

Browse files
jayfoad authored and tru committed
[AMDGPU] GFX12 VMEM loads can write VGPR results out of order (#105549)
Fix SIInsertWaitcnts to account for this by adding extra waits to avoid WAW dependencies. (cherry picked from commit 5506831)
1 parent daea6b9 commit 441fb41

12 files changed

+64
-14
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

+17-6
Original file line numberDiff line numberDiff line change
@@ -953,6 +953,12 @@ def FeatureRequiredExportPriority : SubtargetFeature<"required-export-priority",
953953
"Export priority must be explicitly manipulated on GFX11.5"
954954
>;
955955

956+
def FeatureVmemWriteVgprInOrder : SubtargetFeature<"vmem-write-vgpr-in-order",
957+
"HasVmemWriteVgprInOrder",
958+
"true",
959+
"VMEM instructions of the same type write VGPR results in order"
960+
>;
961+
956962
//===------------------------------------------------------------===//
957963
// Subtarget Features (options and debugging)
958964
//===------------------------------------------------------------===//
@@ -1123,7 +1129,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
11231129
FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel,
11241130
FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts,
11251131
FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
1126-
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts
1132+
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
1133+
FeatureVmemWriteVgprInOrder
11271134
]
11281135
>;
11291136

@@ -1136,7 +1143,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
11361143
FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess,
11371144
FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
11381145
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
1139-
FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts
1146+
FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts,
1147+
FeatureVmemWriteVgprInOrder
11401148
]
11411149
>;
11421150

@@ -1152,7 +1160,7 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
11521160
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
11531161
FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32,
11541162
FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS,
1155-
FeatureDefaultComponentZero
1163+
FeatureDefaultComponentZero, FeatureVmemWriteVgprInOrder
11561164
]
11571165
>;
11581166

@@ -1170,7 +1178,8 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
11701178
FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16,
11711179
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
11721180
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
1173-
FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero
1181+
FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero,
1182+
FeatureVmemWriteVgprInOrder
11741183
]
11751184
>;
11761185

@@ -1193,7 +1202,8 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
11931202
FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
11941203
FeatureMaxHardClauseLength63,
11951204
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
1196-
FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts
1205+
FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts,
1206+
FeatureVmemWriteVgprInOrder
11971207
]
11981208
>;
11991209

@@ -1215,7 +1225,8 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
12151225
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS,
12161226
FeatureGWS, FeatureDefaultComponentZero,
12171227
FeatureMaxHardClauseLength32,
1218-
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts
1228+
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts,
1229+
FeatureVmemWriteVgprInOrder
12191230
]
12201231
>;
12211232

llvm/lib/Target/AMDGPU/GCNSubtarget.h

+3
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
239239
bool HasVALUTransUseHazard = false;
240240
bool HasForceStoreSC0SC1 = false;
241241
bool HasRequiredExportPriority = false;
242+
bool HasVmemWriteVgprInOrder = false;
242243

243244
bool RequiresCOV6 = false;
244245

@@ -1285,6 +1286,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
12851286

12861287
bool hasRequiredExportPriority() const { return HasRequiredExportPriority; }
12871288

1289+
bool hasVmemWriteVgprInOrder() const { return HasVmemWriteVgprInOrder; }
1290+
12881291
/// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
12891292
/// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
12901293
bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

+4-3
Original file line numberDiff line numberDiff line change
@@ -1778,11 +1778,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
17781778
if (IsVGPR) {
17791779
// RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
17801780
// previous write and this write are the same type of VMEM
1781-
// instruction, in which case they're guaranteed to write their
1782-
// results in order anyway.
1781+
// instruction, in which case they are (in some architectures)
1782+
// guaranteed to write their results in order anyway.
17831783
if (Op.isUse() || !updateVMCntOnly(MI) ||
17841784
ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
1785-
getVmemType(MI))) {
1785+
getVmemType(MI)) ||
1786+
!ST->hasVmemWriteVgprInOrder()) {
17861787
ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
17871788
ScoreBrackets.determineWait(SAMPLE_CNT, RegNo, Wait);
17881789
ScoreBrackets.determineWait(BVH_CNT, RegNo, Wait);

llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll

+3
Original file line numberDiff line numberDiff line change
@@ -1398,6 +1398,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall(ptr a
13981398
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
13991399
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
14001400
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
1401+
; GFX12-NEXT: s_wait_loadcnt 0x0
14011402
; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
14021403
; GFX12-NEXT: ; implicit-def: $vgpr4
14031404
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -2662,6 +2663,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
26622663
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
26632664
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
26642665
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
2666+
; GFX12-NEXT: s_wait_loadcnt 0x0
26652667
; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen
26662668
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
26672669
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
@@ -4141,6 +4143,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
41414143
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
41424144
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
41434145
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
4146+
; GFX12-NEXT: s_wait_loadcnt 0x0
41444147
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
41454148
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
41464149
; GFX12-NEXT: s_cbranch_execnz .LBB11_1

llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll

+5
Original file line numberDiff line numberDiff line change
@@ -1255,6 +1255,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall(ptr a
12551255
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
12561256
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
12571257
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
1258+
; GFX12-NEXT: s_wait_loadcnt 0x0
12581259
; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
12591260
; GFX12-NEXT: ; implicit-def: $vgpr4
12601261
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -2449,6 +2450,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
24492450
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
24502451
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
24512452
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
2453+
; GFX12-NEXT: s_wait_loadcnt 0x0
24522454
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
24532455
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
24542456
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
@@ -3949,6 +3951,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
39493951
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
39503952
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
39513953
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
3954+
; GFX12-NEXT: s_wait_loadcnt 0x0
39523955
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
39533956
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
39543957
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
@@ -5319,6 +5322,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
53195322
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
53205323
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
53215324
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
5325+
; GFX12-NEXT: s_wait_loadcnt 0x0
53225326
; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
53235327
; GFX12-NEXT: ; implicit-def: $vgpr4
53245328
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -6812,6 +6816,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
68126816
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
68136817
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
68146818
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
6819+
; GFX12-NEXT: s_wait_loadcnt 0x0
68156820
; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
68166821
; GFX12-NEXT: ; implicit-def: $vgpr4
68176822
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0

llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll

+5
Original file line numberDiff line numberDiff line change
@@ -1255,6 +1255,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall(ptr a
12551255
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
12561256
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
12571257
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
1258+
; GFX12-NEXT: s_wait_loadcnt 0x0
12581259
; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
12591260
; GFX12-NEXT: ; implicit-def: $vgpr4
12601261
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -2449,6 +2450,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
24492450
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
24502451
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
24512452
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
2453+
; GFX12-NEXT: s_wait_loadcnt 0x0
24522454
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
24532455
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
24542456
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
@@ -3949,6 +3951,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
39493951
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
39503952
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
39513953
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
3954+
; GFX12-NEXT: s_wait_loadcnt 0x0
39523955
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
39533956
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
39543957
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
@@ -5319,6 +5322,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
53195322
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
53205323
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
53215324
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
5325+
; GFX12-NEXT: s_wait_loadcnt 0x0
53225326
; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
53235327
; GFX12-NEXT: ; implicit-def: $vgpr4
53245328
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -6812,6 +6816,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
68126816
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
68136817
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
68146818
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
6819+
; GFX12-NEXT: s_wait_loadcnt 0x0
68156820
; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
68166821
; GFX12-NEXT: ; implicit-def: $vgpr4
68176822
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll

+1
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) {
128128
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
129129
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
130130
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
131+
; GFX12-NEXT: s_wait_loadcnt 0x0
131132
; GFX12-NEXT: buffer_load_d16_format_xyz v[5:6], v4, s[4:7], null idxen
132133
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
133134
; GFX12-NEXT: ; implicit-def: $vgpr4

llvm/test/CodeGen/AMDGPU/load-constant-i16.ll

+9-1
Original file line numberDiff line numberDiff line change
@@ -745,7 +745,7 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
745745
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
746746
; GFX12-NEXT: v_mov_b32_e32 v8, 0
747747
; GFX12-NEXT: s_wait_kmcnt 0x0
748-
; GFX12-NEXT: s_clause 0xf
748+
; GFX12-NEXT: s_clause 0x7
749749
; GFX12-NEXT: global_load_u16 v3, v8, s[0:1] offset:28
750750
; GFX12-NEXT: global_load_u16 v2, v8, s[0:1] offset:24
751751
; GFX12-NEXT: global_load_u16 v1, v8, s[0:1] offset:20
@@ -754,13 +754,21 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
754754
; GFX12-NEXT: global_load_u16 v6, v8, s[0:1] offset:8
755755
; GFX12-NEXT: global_load_u16 v5, v8, s[0:1] offset:4
756756
; GFX12-NEXT: global_load_u16 v4, v8, s[0:1]
757+
; GFX12-NEXT: s_wait_loadcnt 0x7
757758
; GFX12-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:30
759+
; GFX12-NEXT: s_wait_loadcnt 0x7
758760
; GFX12-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:26
761+
; GFX12-NEXT: s_wait_loadcnt 0x7
759762
; GFX12-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:22
763+
; GFX12-NEXT: s_wait_loadcnt 0x7
760764
; GFX12-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:18
765+
; GFX12-NEXT: s_wait_loadcnt 0x7
761766
; GFX12-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:14
767+
; GFX12-NEXT: s_wait_loadcnt 0x7
762768
; GFX12-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:10
769+
; GFX12-NEXT: s_wait_loadcnt 0x7
763770
; GFX12-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:6
771+
; GFX12-NEXT: s_wait_loadcnt 0x7
764772
; GFX12-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:2
765773
; GFX12-NEXT: s_wait_loadcnt 0x4
766774
; GFX12-NEXT: global_store_b128 v[0:1], v[0:3], off

llvm/test/CodeGen/AMDGPU/load-global-i16.ll

+10
Original file line numberDiff line numberDiff line change
@@ -3563,15 +3563,19 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
35633563
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
35643564
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48
35653565
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
3566+
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
35663567
; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
35673568
; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
3569+
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
35683570
; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
35693571
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
35703572
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
35713573
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
35723574
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
3575+
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
35733576
; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
35743577
; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
3578+
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
35753579
; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
35763580
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
35773581
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
@@ -4371,8 +4375,10 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
43714375
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
43724376
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
43734377
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
4378+
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
43744379
; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
43754380
; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
4381+
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
43764382
; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
43774383
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
43784384
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
@@ -7341,8 +7347,10 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
73417347
; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
73427348
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
73437349
; GCN-NOHSA-SI-NEXT: buffer_load_dword v12, off, s[12:15], 0 ; 4-byte Folded Reload
7350+
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
73447351
; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
73457352
; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
7353+
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
73467354
; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
73477355
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
73487356
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v39
@@ -7364,8 +7372,10 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
73647372
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
73657373
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
73667374
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
7375+
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
73677376
; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
73687377
; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
7378+
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
73697379
; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
73707380
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
73717381
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32

llvm/test/CodeGen/AMDGPU/load-global-i32.ll

+2
Original file line numberDiff line numberDiff line change
@@ -3091,8 +3091,10 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
30913091
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240
30923092
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:192
30933093
; SI-NOHSA-NEXT: buffer_load_dword v8, off, s[12:15], 0 ; 4-byte Folded Reload
3094+
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
30943095
; SI-NOHSA-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
30953096
; SI-NOHSA-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
3097+
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
30963098
; SI-NOHSA-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
30973099
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
30983100
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208

llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 4
1717
; GCN: s_xor_saveexec_b64
18+
; GCN-NEXT: s_waitcnt vmcnt(0)
1819
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
1920
; GCN-NEXT: s_mov_b64 exec, -1
2021
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload

llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir

+4-4
Original file line numberDiff line numberDiff line change
@@ -297,7 +297,7 @@ body: |
297297
# GFX12: BUFFER_LOAD_FORMAT_X_IDXEN
298298
# GFX12: S_WAIT_LOADCNT 0
299299
# GFX12-LABEL: bb.1:
300-
# GFX12-NOT: S_WAIT_LOADCNT 0
300+
# GFX12: S_WAIT_LOADCNT 0
301301
# GFX12-LABEL: bb.2:
302302
name: waitcnt_vm_loop2
303303
body: |
@@ -344,7 +344,7 @@ body: |
344344
# GFX12: BUFFER_LOAD_FORMAT_X_IDXEN
345345
# GFX12: S_WAIT_LOADCNT 0
346346
# GFX12-LABEL: bb.1:
347-
# GFX12-NOT: S_WAIT_LOADCNT 0
347+
# GFX12: S_WAIT_LOADCNT 0
348348
# GFX12-LABEL: bb.2:
349349
name: waitcnt_vm_loop2_store
350350
body: |
@@ -445,7 +445,7 @@ body: |
445445
# GFX12-LABEL: bb.1:
446446
# GFX12-NOT: S_WAIT_LOADCNT 0
447447
# GFX12-LABEL: bb.2:
448-
# GFX12-NOT: S_WAIT_LOADCNT 0
448+
# GFX12: S_WAIT_LOADCNT 0
449449
# GFX12-LABEL: bb.3:
450450
name: waitcnt_vm_loop2_nowait
451451
body: |
@@ -602,7 +602,7 @@ body: |
602602
# GFX12: BUFFER_LOAD_FORMAT_X_IDXEN
603603
# GFX12: S_WAIT_LOADCNT 0
604604
# GFX12-LABEL: bb.1:
605-
# GFX12-NOT: S_WAIT_LOADCNT 0
605+
# GFX12: S_WAIT_LOADCNT 0
606606
# GFX12-LABEL: bb.2:
607607

608608
name: waitcnt_vm_zero

0 commit comments

Comments
 (0)