Skip to content

Commit

Permalink
[AMDGPU] GFX12 VMEM loads can write VGPR results out of order (llvm#1…
Browse files Browse the repository at this point in the history
…05549)

Fix SIInsertWaitcnts to account for this by adding extra waits to avoid
WAW dependencies.

(cherry picked from commit 5506831)
  • Loading branch information
jayfoad authored and llvmbot committed Aug 23, 2024
1 parent 720021a commit ed89fa6
Show file tree
Hide file tree
Showing 12 changed files with 64 additions and 14 deletions.
23 changes: 17 additions & 6 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -953,6 +953,12 @@ def FeatureRequiredExportPriority : SubtargetFeature<"required-export-priority",
"Export priority must be explicitly manipulated on GFX11.5"
>;

def FeatureVmemWriteVgprInOrder : SubtargetFeature<"vmem-write-vgpr-in-order",
"HasVmemWriteVgprInOrder",
"true",
"VMEM instructions of the same type write VGPR results in order"
>;

//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
Expand Down Expand Up @@ -1123,7 +1129,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel,
FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts,
FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureVmemWriteVgprInOrder
]
>;

Expand All @@ -1136,7 +1143,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess,
FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts
FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts,
FeatureVmemWriteVgprInOrder
]
>;

Expand All @@ -1152,7 +1160,7 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32,
FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS,
FeatureDefaultComponentZero
FeatureDefaultComponentZero, FeatureVmemWriteVgprInOrder
]
>;

Expand All @@ -1170,7 +1178,8 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16,
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero
FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero,
FeatureVmemWriteVgprInOrder
]
>;

Expand All @@ -1193,7 +1202,8 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
FeatureMaxHardClauseLength63,
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts
FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts,
FeatureVmemWriteVgprInOrder
]
>;

Expand All @@ -1215,7 +1225,8 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS,
FeatureGWS, FeatureDefaultComponentZero,
FeatureMaxHardClauseLength32,
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts,
FeatureVmemWriteVgprInOrder
]
>;

Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasVALUTransUseHazard = false;
bool HasForceStoreSC0SC1 = false;
bool HasRequiredExportPriority = false;
bool HasVmemWriteVgprInOrder = false;

bool RequiresCOV6 = false;

Expand Down Expand Up @@ -1285,6 +1286,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,

bool hasRequiredExportPriority() const { return HasRequiredExportPriority; }

bool hasVmemWriteVgprInOrder() const { return HasVmemWriteVgprInOrder; }

/// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
/// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
Expand Down
7 changes: 4 additions & 3 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1778,11 +1778,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
if (IsVGPR) {
// RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
// previous write and this write are the same type of VMEM
// instruction, in which case they're guaranteed to write their
// results in order anyway.
// instruction, in which case they are (in some architectures)
// guaranteed to write their results in order anyway.
if (Op.isUse() || !updateVMCntOnly(MI) ||
ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
getVmemType(MI))) {
getVmemType(MI)) ||
!ST->hasVmemWriteVgprInOrder()) {
ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
ScoreBrackets.determineWait(SAMPLE_CNT, RegNo, Wait);
ScoreBrackets.determineWait(BVH_CNT, RegNo, Wait);
Expand Down
3 changes: 3 additions & 0 deletions llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1398,6 +1398,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall(ptr a
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
Expand Down Expand Up @@ -2662,6 +2663,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
Expand Down Expand Up @@ -4141,6 +4143,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
Expand Down
5 changes: 5 additions & 0 deletions llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1255,6 +1255,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall(ptr a
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
Expand Down Expand Up @@ -2449,6 +2450,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
Expand Down Expand Up @@ -3949,6 +3951,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
Expand Down Expand Up @@ -5319,6 +5322,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
Expand Down Expand Up @@ -6812,6 +6816,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
Expand Down
5 changes: 5 additions & 0 deletions llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1255,6 +1255,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall(ptr a
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
Expand Down Expand Up @@ -2449,6 +2450,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
Expand Down Expand Up @@ -3949,6 +3951,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
Expand Down Expand Up @@ -5319,6 +5322,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
Expand Down Expand Up @@ -6812,6 +6816,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_d16_format_xyz v[5:6], v4, s[4:7], null idxen
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX12-NEXT: ; implicit-def: $vgpr4
Expand Down
10 changes: 9 additions & 1 deletion llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -745,7 +745,7 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0xf
; GFX12-NEXT: s_clause 0x7
; GFX12-NEXT: global_load_u16 v3, v8, s[0:1] offset:28
; GFX12-NEXT: global_load_u16 v2, v8, s[0:1] offset:24
; GFX12-NEXT: global_load_u16 v1, v8, s[0:1] offset:20
Expand All @@ -754,13 +754,21 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; GFX12-NEXT: global_load_u16 v6, v8, s[0:1] offset:8
; GFX12-NEXT: global_load_u16 v5, v8, s[0:1] offset:4
; GFX12-NEXT: global_load_u16 v4, v8, s[0:1]
; GFX12-NEXT: s_wait_loadcnt 0x7
; GFX12-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:30
; GFX12-NEXT: s_wait_loadcnt 0x7
; GFX12-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:26
; GFX12-NEXT: s_wait_loadcnt 0x7
; GFX12-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:22
; GFX12-NEXT: s_wait_loadcnt 0x7
; GFX12-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:18
; GFX12-NEXT: s_wait_loadcnt 0x7
; GFX12-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:14
; GFX12-NEXT: s_wait_loadcnt 0x7
; GFX12-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:10
; GFX12-NEXT: s_wait_loadcnt 0x7
; GFX12-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:6
; GFX12-NEXT: s_wait_loadcnt 0x7
; GFX12-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:2
; GFX12-NEXT: s_wait_loadcnt 0x4
; GFX12-NEXT: global_store_b128 v[0:1], v[0:3], off
Expand Down
10 changes: 10 additions & 0 deletions llvm/test/CodeGen/AMDGPU/load-global-i16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3563,15 +3563,19 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
Expand Down Expand Up @@ -4371,8 +4375,10 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
Expand Down Expand Up @@ -7341,8 +7347,10 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v12, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v39
Expand All @@ -7364,8 +7372,10 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/CodeGen/AMDGPU/load-global-i32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3091,8 +3091,10 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:192
; SI-NOHSA-NEXT: buffer_load_dword v8, off, s[12:15], 0 ; 4-byte Folded Reload
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
; SI-NOHSA-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; SI-NOHSA-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
; SI-NOHSA-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
Expand Down
1 change: 1 addition & 0 deletions llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 4
; GCN: s_xor_saveexec_b64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ body: |
# GFX12: BUFFER_LOAD_FORMAT_X_IDXEN
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
name: waitcnt_vm_loop2
body: |
Expand Down Expand Up @@ -344,7 +344,7 @@ body: |
# GFX12: BUFFER_LOAD_FORMAT_X_IDXEN
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
name: waitcnt_vm_loop2_store
body: |
Expand Down Expand Up @@ -445,7 +445,7 @@ body: |
# GFX12-LABEL: bb.1:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.3:
name: waitcnt_vm_loop2_nowait
body: |
Expand Down Expand Up @@ -602,7 +602,7 @@ body: |
# GFX12: BUFFER_LOAD_FORMAT_X_IDXEN
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:

name: waitcnt_vm_zero
Expand Down

0 comments on commit ed89fa6

Please sign in to comment.