diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index e753b75dbbf492..291ec79e2d23dd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1159,7 +1159,7 @@ bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const { // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative // values. - if (AMDGPU::isGFX12Plus(*Subtarget)) + if (Subtarget->hasSignedScratchOffsets()) return true; auto LHS = Addr.getOperand(0); @@ -1184,6 +1184,11 @@ bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const { if (isNoUnsignedWrap(Addr)) return true; + // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative + // values. + if (Subtarget->hasSignedScratchOffsets()) + return true; + auto LHS = Addr.getOperand(0); auto RHS = Addr.getOperand(1); return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS); @@ -1192,6 +1197,11 @@ bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const { // Check address value in SGPR/VGPR are legal for flat scratch in the form // of: SGPR + VGPR + Imm. bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const { + // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative + // values. 
+ if (Subtarget->hasSignedScratchOffsets()) + return true; + auto Base = Addr.getOperand(0); auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1)); // If the immediate offset is negative and within certain range, the base diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 1d31c6b8fde93a..c94367cbfe8bb7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4557,7 +4557,7 @@ bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const { // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative // values. - if (AMDGPU::isGFX12Plus(STI)) + if (STI.hasSignedScratchOffsets()) return true; Register LHS = AddrMI->getOperand(1).getReg(); @@ -4586,6 +4586,11 @@ bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const { if (isNoUnsignedWrap(AddrMI)) return true; + // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative + // values. + if (STI.hasSignedScratchOffsets()) + return true; + Register LHS = AddrMI->getOperand(1).getReg(); Register RHS = AddrMI->getOperand(2).getReg(); return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS); @@ -4595,6 +4600,11 @@ bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const { // of: SGPR + VGPR + Imm. bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm( Register Addr) const { + // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative + // values. 
+ if (STI.hasSignedScratchOffsets()) + return true; + MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI); Register Base = AddrMI->getOperand(1).getReg(); std::optional BaseDef = diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 070d165cdaadb8..73ce5972c68397 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1259,6 +1259,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, // \returns true if the target has WG_RR_MODE kernel descriptor mode bit bool hasRrWGMode() const { return getGeneration() >= GFX12; } + /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative + /// values. + bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; } + /// \returns SGPR allocation granularity supported by the subtarget. unsigned getSGPRAllocGranule() const { return AMDGPU::IsaInfo::getSGPRAllocGranule(this); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index 4603fbcd525c78..0bc836fd1e830c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -79,16 +79,17 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX12-LABEL: store_load_sindex_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 15 ; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_and_b32 s1, s0, 15 +; GFX12-NEXT: s_lshl_b32 s1, s0, 2 +; GFX12-NEXT: s_and_b32 s0, s0, 15 +; GFX12-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-NEXT: s_lshl_b32 s1, s1, 2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: s_add_co_i32 s0, s0, 4 -; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: scratch_store_b32 v0, v1, off 
offset:4 th:TH_STORE_NT_RT ; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:4 th:TH_LOAD_RT_NT +; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:4 th:TH_LOAD_RT_NT ; GFX12-NEXT: s_waitcnt vmcnt(0) ; GFX12-NEXT: s_endpgm bb: @@ -170,8 +171,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 ; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:4 th:TH_STORE_NT_RT ; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v1, 4, v1 -; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT +; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:128 th:TH_LOAD_RT_NT ; GFX12-NEXT: s_waitcnt vmcnt(0) ; GFX12-NEXT: s_endpgm bb: @@ -248,14 +248,13 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX12-LABEL: store_load_vindex_foo: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0 -; GFX12-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_nc_u32_e32 v1, s32, v1 +; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX12-NEXT: scratch_store_b32 v0, v2, s32 th:TH_STORE_NT_RT ; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v0, s32 th:TH_LOAD_RT_NT +; GFX12-NEXT: scratch_load_b32 v0, v1, s32 th:TH_LOAD_RT_NT ; GFX12-NEXT: s_waitcnt vmcnt(0) ; GFX12-NEXT: s_setpc_b64 s[30:31] bb: @@ -391,17 +390,19 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX12-LABEL: store_load_sindex_small_offset_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-NEXT: 
scratch_load_b32 v2, off, off offset:4 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX12-NEXT: s_and_b32 s1, s0, 15 +; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v1, 15 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b32 s1, s0, 2 +; GFX12-NEXT: s_and_b32 s0, s0, 15 +; GFX12-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-NEXT: s_lshl_b32 s1, s1, 2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: s_addk_co_i32 s0, 0x104 -; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:260 th:TH_STORE_NT_RT ; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:260 th:TH_LOAD_RT_NT +; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:260 th:TH_LOAD_RT_NT ; GFX12-NEXT: s_waitcnt vmcnt(0) ; GFX12-NEXT: s_endpgm bb: @@ -490,13 +491,13 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: v_mov_b32_e32 v2, 15 ; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT ; GFX12-NEXT: s_waitcnt vmcnt(0) -; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:260 th:TH_STORE_NT_RT ; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v1, 0x104, v1 -; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT +; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:384 th:TH_LOAD_RT_NT ; GFX12-NEXT: s_waitcnt vmcnt(0) ; GFX12-NEXT: s_endpgm bb: @@ -589,16 +590,14 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX12-LABEL: 
store_load_vindex_small_offset_foo: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0 -; GFX12-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX12-NEXT: s_add_co_i32 s0, s32, 0x100 +; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT ; GFX12-NEXT: s_waitcnt vmcnt(0) -; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX12-NEXT: scratch_store_b32 v0, v2, s32 offset:256 th:TH_STORE_NT_RT ; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v0, s32 offset:256 th:TH_LOAD_RT_NT +; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:256 th:TH_LOAD_RT_NT ; GFX12-NEXT: s_waitcnt vmcnt(0) ; GFX12-NEXT: s_setpc_b64 s[30:31] bb: @@ -697,17 +696,19 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX12-LABEL: store_load_sindex_large_offset_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-NEXT: scratch_load_b32 v2, off, off offset:4 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX12-NEXT: s_and_b32 s1, s0, 15 +; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v1, 15 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b32 s1, s0, 2 +; GFX12-NEXT: s_and_b32 s0, s0, 15 +; GFX12-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-NEXT: s_lshl_b32 s1, s1, 2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: s_addk_co_i32 s0, 0x4004 -; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: scratch_store_b32 
v0, v1, off offset:16388 th:TH_STORE_NT_RT ; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:16388 th:TH_LOAD_RT_NT +; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:16388 th:TH_LOAD_RT_NT ; GFX12-NEXT: s_waitcnt vmcnt(0) ; GFX12-NEXT: s_endpgm bb: @@ -798,13 +799,13 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: v_mov_b32_e32 v2, 15 ; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT ; GFX12-NEXT: s_waitcnt vmcnt(0) -; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16388 th:TH_STORE_NT_RT ; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v1, 0x4004, v1 -; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT +; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:16512 th:TH_LOAD_RT_NT ; GFX12-NEXT: s_waitcnt vmcnt(0) ; GFX12-NEXT: s_endpgm bb: @@ -899,16 +900,14 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX12-LABEL: store_load_vindex_large_offset_foo: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0 -; GFX12-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000 +; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT ; GFX12-NEXT: s_waitcnt vmcnt(0) -; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX12-NEXT: scratch_store_b32 v0, v2, s32 offset:16384 th:TH_STORE_NT_RT ; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: 
scratch_load_b32 v0, v0, s32 offset:16384 th:TH_LOAD_RT_NT +; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 th:TH_LOAD_RT_NT ; GFX12-NEXT: s_waitcnt vmcnt(0) ; GFX12-NEXT: s_setpc_b64 s[30:31] bb: @@ -1154,11 +1153,9 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX12-NEXT: v_mov_b32_e32 v1, 15 ; GFX12-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-NEXT: v_add_lshl_u32 v0, s0, v0, 2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_nc_u32_e32 v0, 4, v0 -; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1024 th:TH_STORE_NT_RT +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1028 th:TH_STORE_NT_RT ; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1024 th:TH_LOAD_RT_NT +; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1028 th:TH_LOAD_RT_NT ; GFX12-NEXT: s_waitcnt vmcnt(0) ; GFX12-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll index 93e8630dc7f560..e9d3b9aae653ae 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll @@ -894,8 +894,8 @@ define amdgpu_ps void @test_scratch_load_i8_zext_svs(ptr addrspace(5) inreg %in, ; ; GFX12-LABEL: test_scratch_load_i8_zext_svs: ; GFX12: ; %bb.0: -; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 -; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:1 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: scratch_load_u8 v0, v0, s0 offset:1 ; GFX12-NEXT: s_waitcnt vmcnt(0) ; GFX12-NEXT: flat_store_b32 v[1:2], v0 ; GFX12-NEXT: s_endpgm @@ -931,8 +931,8 @@ define amdgpu_ps void @test_scratch_load_i8_sext_svs(ptr addrspace(5) inreg %in, ; ; GFX12-LABEL: test_scratch_load_i8_sext_svs: ; GFX12: ; %bb.0: -; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 -; GFX12-NEXT: scratch_load_i8 v0, v0, off offset:1 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: scratch_load_i8 v0, v0, s0 offset:1 ; GFX12-NEXT: s_waitcnt vmcnt(0) 
; GFX12-NEXT: flat_store_b32 v[1:2], v0 ; GFX12-NEXT: s_endpgm @@ -968,8 +968,8 @@ define amdgpu_ps void @test_scratch_load_i16_zext_svs(ptr addrspace(5) inreg %in ; ; GFX12-LABEL: test_scratch_load_i16_zext_svs: ; GFX12: ; %bb.0: -; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 -; GFX12-NEXT: scratch_load_u16 v0, v0, off offset:2 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: scratch_load_u16 v0, v0, s0 offset:2 ; GFX12-NEXT: s_waitcnt vmcnt(0) ; GFX12-NEXT: flat_store_b32 v[1:2], v0 ; GFX12-NEXT: s_endpgm @@ -1005,8 +1005,8 @@ define amdgpu_ps void @test_scratch_load_i16_sext_svs(ptr addrspace(5) inreg %in ; ; GFX12-LABEL: test_scratch_load_i16_sext_svs: ; GFX12: ; %bb.0: -; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 -; GFX12-NEXT: scratch_load_i16 v0, v0, off offset:2 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: scratch_load_i16 v0, v0, s0 offset:2 ; GFX12-NEXT: s_waitcnt vmcnt(0) ; GFX12-NEXT: flat_store_b32 v[1:2], v0 ; GFX12-NEXT: s_endpgm @@ -1046,9 +1046,8 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_svs(ptr addrspace(5) ; ; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_lo_svs: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 -; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000 -; GFX12-NEXT: scratch_load_d16_u8 v3, v0, off offset:1 +; GFX12-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-NEXT: scratch_load_d16_u8 v3, v0, s0 offset:1 ; GFX12-NEXT: s_waitcnt vmcnt(0) ; GFX12-NEXT: flat_store_b32 v[1:2], v3 ; GFX12-NEXT: s_endpgm @@ -1090,9 +1089,8 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_svs(ptr addrspace(5) ; ; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_lo_svs: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 -; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000 -; GFX12-NEXT: scratch_load_d16_i8 v3, v0, off offset:1 +; GFX12-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-NEXT: scratch_load_d16_i8 v3, v0, s0 offset:1 ; 
GFX12-NEXT: s_waitcnt vmcnt(0) ; GFX12-NEXT: flat_store_b32 v[1:2], v3 ; GFX12-NEXT: s_endpgm @@ -1134,9 +1132,8 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_svs(ptr addrspace(5) inre ; ; GFX12-LABEL: test_scratch_load_i16_to_d16_lo_svs: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 -; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000 -; GFX12-NEXT: scratch_load_d16_b16 v3, v0, off offset:2 +; GFX12-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-NEXT: scratch_load_d16_b16 v3, v0, s0 offset:2 ; GFX12-NEXT: s_waitcnt vmcnt(0) ; GFX12-NEXT: flat_store_b32 v[1:2], v3 ; GFX12-NEXT: s_endpgm @@ -1178,9 +1175,8 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_svs(ptr addrspace(5) ; ; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_hi_svs: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 -; GFX12-NEXT: v_mov_b32_e32 v3, -1 -; GFX12-NEXT: scratch_load_d16_hi_u8 v3, v0, off offset:1 +; GFX12-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-NEXT: scratch_load_d16_hi_u8 v3, v0, s0 offset:1 ; GFX12-NEXT: s_waitcnt vmcnt(0) ; GFX12-NEXT: flat_store_b32 v[1:2], v3 ; GFX12-NEXT: s_endpgm @@ -1222,9 +1218,8 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_svs(ptr addrspace(5) ; ; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_hi_svs: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 -; GFX12-NEXT: v_mov_b32_e32 v3, -1 -; GFX12-NEXT: scratch_load_d16_hi_i8 v3, v0, off offset:1 +; GFX12-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-NEXT: scratch_load_d16_hi_i8 v3, v0, s0 offset:1 ; GFX12-NEXT: s_waitcnt vmcnt(0) ; GFX12-NEXT: flat_store_b32 v[1:2], v3 ; GFX12-NEXT: s_endpgm @@ -1266,9 +1261,8 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_svs(ptr addrspace(5) inre ; ; GFX12-LABEL: test_scratch_load_i16_to_d16_hi_svs: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 -; GFX12-NEXT: v_mov_b32_e32 v3, 
-1 -; GFX12-NEXT: scratch_load_d16_hi_b16 v3, v0, off offset:2 +; GFX12-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-NEXT: scratch_load_d16_hi_b16 v3, v0, s0 offset:2 ; GFX12-NEXT: s_waitcnt vmcnt(0) ; GFX12-NEXT: flat_store_b32 v[1:2], v3 ; GFX12-NEXT: s_endpgm @@ -1309,9 +1303,9 @@ define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_svs(ptr %in, ptr addrsp ; GFX12-LABEL: test_scratch_store_b8_from_d16_hi_svs: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: flat_load_b32 v0, v[0:1] -; GFX12-NEXT: v_lshl_add_u32 v1, v2, 2, s0 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v2 ; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX12-NEXT: scratch_store_d16_hi_b8 v1, v0, off offset:4 +; GFX12-NEXT: scratch_store_d16_hi_b8 v1, v0, s0 offset:4 ; GFX12-NEXT: s_endpgm bb: %load = load <4 x i8>, ptr %in @@ -1350,9 +1344,9 @@ define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_svs(ptr %in, ptr addrs ; GFX12-LABEL: test_scratch_store_b16_from_d16_hi_svs: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: flat_load_b32 v0, v[0:1] -; GFX12-NEXT: v_lshl_add_u32 v1, v2, 2, s0 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v2 ; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX12-NEXT: scratch_store_d16_hi_b16 v1, v0, off offset:2 +; GFX12-NEXT: scratch_store_d16_hi_b16 v1, v0, s0 offset:2 ; GFX12-NEXT: s_endpgm bb: %load = load <2 x i16>, ptr %in diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll index 04eb6dcff4632b..bff88a77009c91 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -95,12 +95,12 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-SDAG-NEXT: 
scratch_store_b8 v0, v1, s0 offset:1 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX12-SDAG-NEXT: s_endpgm ; @@ -221,13 +221,12 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX12-SDAG-NEXT: s_endpgm ; @@ -349,13 +348,12 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; 
GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX12-SDAG-NEXT: s_endpgm ; @@ -480,12 +478,12 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX12-SDAG-NEXT: s_endpgm ; @@ -609,17 +607,17 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX12-SDAG-LABEL: soff2_voff2: ; GFX12-SDAG: ; %bb.0: ; %bb ; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; 
GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX12-SDAG-NEXT: s_endpgm ; @@ -743,17 +741,17 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX12-SDAG-LABEL: soff2_voff4: ; GFX12-SDAG: ; %bb.0: ; %bb ; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX12-SDAG-NEXT: s_endpgm ; @@ -879,12 +877,12 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX12-SDAG-NEXT: s_endpgm ; @@ -1008,17 +1006,17 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX12-SDAG-LABEL: soff4_voff2: ; GFX12-SDAG: ; %bb.0: ; %bb ; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-SDAG-NEXT: 
scratch_store_b8 v0, v1, s0 offset:1 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX12-SDAG-NEXT: s_endpgm ; @@ -1140,17 +1138,17 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX12-SDAG-LABEL: soff4_voff4: ; GFX12-SDAG: ; %bb.0: ; %bb ; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 th:TH_STORE_NT_RT ; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX12-SDAG-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll 
index 98379f5e3c68b4..f92a2d138e12ee 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -878,10 +878,10 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s32 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX12-NEXT: scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT +; GFX12-NEXT: scratch_store_b32 v0, v2, s32 th:TH_STORE_NT_RT ; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX12-NEXT: scratch_load_b32 v0, v1, s32 th:TH_LOAD_RT_NT ; GFX12-NEXT: s_waitcnt vmcnt(0) @@ -945,10 +945,10 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX12-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX12-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s32 +; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, s32 th:TH_STORE_NT_RT ; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 th:TH_LOAD_RT_NT ; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) @@ -2097,12 +2097,11 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX12-NEXT: s_add_co_i32 s0, s32, 0x100 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT ; GFX12-NEXT: s_waitcnt vmcnt(0) -; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX12-NEXT: 
scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT +; GFX12-NEXT: scratch_store_b32 v0, v2, s32 offset:256 th:TH_STORE_NT_RT ; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:256 th:TH_LOAD_RT_NT ; GFX12-NEXT: s_waitcnt vmcnt(0) @@ -2178,12 +2177,11 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX12-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX12-PAL-NEXT: s_add_co_i32 s0, s32, 0x100 +; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-PAL-NEXT: scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT ; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX12-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, s32 offset:256 th:TH_STORE_NT_RT ; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:256 th:TH_LOAD_RT_NT ; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) @@ -3328,12 +3326,11 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT ; GFX12-NEXT: s_waitcnt vmcnt(0) -; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX12-NEXT: scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT +; GFX12-NEXT: scratch_store_b32 v0, v2, s32 offset:16384 th:TH_STORE_NT_RT ; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 th:TH_LOAD_RT_NT ; GFX12-NEXT: s_waitcnt vmcnt(0) @@ -3411,12 +3408,11 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX12-PAL: ; %bb.0: ; %bb ; 
GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX12-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX12-PAL-NEXT: s_add_co_i32 s0, s32, 0x4000 +; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-PAL-NEXT: scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT ; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX12-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, s32 offset:16384 th:TH_STORE_NT_RT ; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 th:TH_LOAD_RT_NT ; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) @@ -3797,13 +3793,12 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX12-LABEL: store_load_vidx_sidx_offset: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 15 ; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, 4 -; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1024 th:TH_STORE_NT_RT +; GFX12-NEXT: v_add_lshl_u32 v0, s0, v0, 2 +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1028 th:TH_STORE_NT_RT ; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1024 th:TH_LOAD_RT_NT +; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1028 th:TH_LOAD_RT_NT ; GFX12-NEXT: s_waitcnt vmcnt(0) ; GFX12-NEXT: s_endpgm ; @@ -3879,13 +3874,12 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX12-PAL-LABEL: store_load_vidx_sidx_offset: ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX12-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0 -; GFX12-PAL-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX12-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 4 -; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1024 th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: v_add_lshl_u32 v0, s0, v0, 2 +; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1028 th:TH_STORE_NT_RT ; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1024 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1028 th:TH_LOAD_RT_NT ; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX12-PAL-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll index 047cb3ab400084..ca029639923bd2 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -394,13 +394,11 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX12-WGP-LABEL: private_nontemporal_load_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_clause 0x1 ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX12-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX12-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 -; GFX12-WGP-NEXT: scratch_load_b32 v0, v0, off th:TH_LOAD_NT_HT +; GFX12-WGP-NEXT: scratch_load_b32 v0, v0, s2 th:TH_LOAD_NT_HT ; GFX12-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX12-WGP-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-WGP-NEXT: s_nop 0 @@ -409,13 +407,11 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX12-CU-LABEL: private_nontemporal_load_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_clause 0x1 ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX12-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX12-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-CU-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 -; GFX12-CU-NEXT: scratch_load_b32 v0, v0, off th:TH_LOAD_NT_HT +; GFX12-CU-NEXT: scratch_load_b32 v0, v0, s2 th:TH_LOAD_NT_HT ; GFX12-CU-NEXT: s_waitcnt vmcnt(0) ; GFX12-CU-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-CU-NEXT: s_nop 0 @@ -794,10 +790,9 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX12-WGP-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 ; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-WGP-NEXT: scratch_store_b32 v0, v1, off th:TH_STORE_NT_WB +; GFX12-WGP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-WGP-NEXT: scratch_store_b32 v0, v1, s2 th:TH_STORE_NT_WB ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_nontemporal_store_1: @@ -805,10 +800,9 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX12-CU-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 ; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 ; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: scratch_store_b32 v0, v1, off th:TH_STORE_NT_WB +; GFX12-CU-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-CU-NEXT: scratch_store_b32 v0, v1, s2 th:TH_STORE_NT_WB ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(5) %out) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll index 4b1fb295adec2a..feeff499458ead 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -284,13 +284,11 @@ define amdgpu_kernel void @private_volatile_load_1( ; ; GFX12-WGP-LABEL: private_volatile_load_1: ; GFX12-WGP: ; %bb.0: 
; %entry -; GFX12-WGP-NEXT: s_clause 0x1 ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX12-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX12-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 -; GFX12-WGP-NEXT: scratch_load_b32 v0, v0, off th:TH_LOAD_RT_NT +; GFX12-WGP-NEXT: scratch_load_b32 v0, v0, s2 th:TH_LOAD_RT_NT ; GFX12-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX12-WGP-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-WGP-NEXT: s_nop 0 @@ -299,13 +297,11 @@ define amdgpu_kernel void @private_volatile_load_1( ; ; GFX12-CU-LABEL: private_volatile_load_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_clause 0x1 ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX12-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX12-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 -; GFX12-CU-NEXT: scratch_load_b32 v0, v0, off th:TH_LOAD_RT_NT +; GFX12-CU-NEXT: scratch_load_b32 v0, v0, s2 th:TH_LOAD_RT_NT ; GFX12-CU-NEXT: s_waitcnt vmcnt(0) ; GFX12-CU-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-CU-NEXT: s_nop 0 @@ -594,10 +590,9 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX12-WGP-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 ; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-WGP-NEXT: scratch_store_b32 v0, v1, off th:TH_STORE_NT_RT +; GFX12-WGP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-WGP-NEXT: scratch_store_b32 v0, v1, s2 th:TH_STORE_NT_RT ; GFX12-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX12-WGP-NEXT: s_endpgm ; @@ -606,10 +601,9 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX12-CU-NEXT: s_load_b96 
s[0:2], s[0:1], 0x0 ; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 ; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: scratch_store_b32 v0, v1, off th:TH_STORE_NT_RT +; GFX12-CU-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-CU-NEXT: scratch_store_b32 v0, v1, s2 th:TH_STORE_NT_RT ; GFX12-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(5) %out) {