[AMDGPU] Allow potentially negative flat scratch offsets on GFX12
#70634 disabled the use of potentially negative scratch offsets, but they can be used on GFX12.
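For illustration, here is a minimal sketch (a hypothetical test, not part of this commit) of the kind of SGPR+VGPR scratch access this affects: the VGPR index is not provably non-negative, so before this change the selector could not fold the immediate offset into the scratch instruction, while on GFX12 it now can. The function and value names are made up for the example.

; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -o - %s
define amdgpu_ps void @scratch_svs_maybe_negative(ptr addrspace(5) inreg %base, i32 %idx) {
bb:
  ; %idx is signed as far as the compiler knows, so the shifted byte offset
  ; may be negative; that previously blocked folding the +4 into the offset field.
  %shl = shl i32 %idx, 2
  %gep = getelementptr i8, ptr addrspace(5) %base, i32 %shl
  %gep4 = getelementptr i8, ptr addrspace(5) %gep, i32 4
  store i32 15, ptr addrspace(5) %gep4, align 4
  ret void
}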
rampitec authored and jayfoad committed Jan 15, 2024
1 parent cfa30fa commit aefd530
Showing 8 changed files with 179 additions and 188 deletions.
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1184,6 +1184,11 @@ bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
if (isNoUnsignedWrap(Addr))
return true;

// Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
// values.
if (AMDGPU::isGFX12Plus(*Subtarget))
return true;

auto LHS = Addr.getOperand(0);
auto RHS = Addr.getOperand(1);
return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
@@ -1192,6 +1197,11 @@ bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
// Check address value in SGPR/VGPR are legal for flat scratch in the form
// of: SGPR + VGPR + Imm.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
// Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
// values.
if (AMDGPU::isGFX12Plus(*Subtarget))
return true;

auto Base = Addr.getOperand(0);
auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
// If the immediate offset is negative and within certain range, the base
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4586,6 +4586,11 @@ bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
if (isNoUnsignedWrap(AddrMI))
return true;

// Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
// values.
if (AMDGPU::isGFX12Plus(STI))
return true;

Register LHS = AddrMI->getOperand(1).getReg();
Register RHS = AddrMI->getOperand(2).getReg();
return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
@@ -4595,6 +4600,11 @@ bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
// of: SGPR + VGPR + Imm.
bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
Register Addr) const {
// Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
// values.
if (AMDGPU::isGFX12Plus(STI))
return true;

MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
Register Base = AddrMI->getOperand(1).getReg();
std::optional<DefinitionAndSourceRegister> BaseDef =
105 changes: 51 additions & 54 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -79,16 +79,17 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX12-LABEL: store_load_sindex_kernel:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 15
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_and_b32 s1, s0, 15
; GFX12-NEXT: s_lshl_b32 s1, s0, 2
; GFX12-NEXT: s_and_b32 s0, s0, 15
; GFX12-NEXT: v_mov_b32_e32 v0, s1
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-NEXT: s_lshl_b32 s1, s1, 2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: s_add_co_i32 s0, s0, 4
; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:4 th:TH_STORE_NT_RT
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:4 th:TH_LOAD_RT_NT
; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:4 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_endpgm
bb:
@@ -170,8 +171,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:4 th:TH_STORE_NT_RT
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
; GFX12-NEXT: v_add_nc_u32_e32 v1, 4, v1
; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT
; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:128 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_endpgm
bb:
@@ -248,14 +248,13 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX12-LABEL: store_load_vindex_foo:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
; GFX12-NEXT: v_and_b32_e32 v0, 15, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_add_nc_u32_e32 v1, s32, v1
; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX12-NEXT: scratch_store_b32 v0, v2, s32 th:TH_STORE_NT_RT
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
; GFX12-NEXT: scratch_load_b32 v0, v0, s32 th:TH_LOAD_RT_NT
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
@@ -391,17 +390,19 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX12-LABEL: store_load_sindex_small_offset_kernel:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24
; GFX12-NEXT: scratch_load_b32 v2, off, off offset:4 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX12-NEXT: s_and_b32 s1, s0, 15
; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: v_mov_b32_e32 v1, 15
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_lshl_b32 s1, s0, 2
; GFX12-NEXT: s_and_b32 s0, s0, 15
; GFX12-NEXT: v_mov_b32_e32 v0, s1
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-NEXT: s_lshl_b32 s1, s1, 2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: s_addk_co_i32 s0, 0x104
; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:260 th:TH_STORE_NT_RT
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:260 th:TH_LOAD_RT_NT
; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:260 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_endpgm
bb:
@@ -490,13 +491,13 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: v_mov_b32_e32 v2, 15
; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:260 th:TH_STORE_NT_RT
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
; GFX12-NEXT: v_add_nc_u32_e32 v1, 0x104, v1
; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT
; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:384 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_endpgm
bb:
@@ -589,16 +590,14 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX12-LABEL: store_load_vindex_small_offset_foo:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
; GFX12-NEXT: v_and_b32_e32 v0, 15, v0
; GFX12-NEXT: s_add_co_i32 s0, s32, 0x100
; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX12-NEXT: scratch_store_b32 v0, v2, s32 offset:256 th:TH_STORE_NT_RT
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
; GFX12-NEXT: scratch_load_b32 v0, v0, s32 offset:256 th:TH_LOAD_RT_NT
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:256 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
@@ -697,17 +696,19 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX12-LABEL: store_load_sindex_large_offset_kernel:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24
; GFX12-NEXT: scratch_load_b32 v2, off, off offset:4 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX12-NEXT: s_and_b32 s1, s0, 15
; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: v_mov_b32_e32 v1, 15
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_lshl_b32 s1, s0, 2
; GFX12-NEXT: s_and_b32 s0, s0, 15
; GFX12-NEXT: v_mov_b32_e32 v0, s1
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-NEXT: s_lshl_b32 s1, s1, 2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: s_addk_co_i32 s0, 0x4004
; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16388 th:TH_STORE_NT_RT
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:16388 th:TH_LOAD_RT_NT
; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:16388 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_endpgm
bb:
@@ -798,13 +799,13 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: v_mov_b32_e32 v2, 15
; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16388 th:TH_STORE_NT_RT
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
; GFX12-NEXT: v_add_nc_u32_e32 v1, 0x4004, v1
; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT
; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:16512 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_endpgm
bb:
@@ -899,16 +900,14 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX12-LABEL: store_load_vindex_large_offset_foo:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
; GFX12-NEXT: v_and_b32_e32 v0, 15, v0
; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000
; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX12-NEXT: scratch_store_b32 v0, v2, s32 offset:16384 th:TH_STORE_NT_RT
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
; GFX12-NEXT: scratch_load_b32 v0, v0, s32 offset:16384 th:TH_LOAD_RT_NT
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
@@ -1154,11 +1153,9 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX12-NEXT: v_mov_b32_e32 v1, 15
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: v_add_lshl_u32 v0, s0, v0, 2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_nc_u32_e32 v0, 4, v0
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1024 th:TH_STORE_NT_RT
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1028 th:TH_STORE_NT_RT
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1024 th:TH_LOAD_RT_NT
; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1028 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_endpgm
bb:
54 changes: 24 additions & 30 deletions llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
@@ -894,8 +894,8 @@ define amdgpu_ps void @test_scratch_load_i8_zext_svs(ptr addrspace(5) inreg %in,
;
; GFX12-LABEL: test_scratch_load_i8_zext_svs:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:1
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: scratch_load_u8 v0, v0, s0 offset:1
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
@@ -931,8 +931,8 @@ define amdgpu_ps void @test_scratch_load_i8_sext_svs(ptr addrspace(5) inreg %in,
;
; GFX12-LABEL: test_scratch_load_i8_sext_svs:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX12-NEXT: scratch_load_i8 v0, v0, off offset:1
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: scratch_load_i8 v0, v0, s0 offset:1
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
@@ -968,8 +968,8 @@ define amdgpu_ps void @test_scratch_load_i16_zext_svs(ptr addrspace(5) inreg %in
;
; GFX12-LABEL: test_scratch_load_i16_zext_svs:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX12-NEXT: scratch_load_u16 v0, v0, off offset:2
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: scratch_load_u16 v0, v0, s0 offset:2
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
@@ -1005,8 +1005,8 @@ define amdgpu_ps void @test_scratch_load_i16_sext_svs(ptr addrspace(5) inreg %in
;
; GFX12-LABEL: test_scratch_load_i16_sext_svs:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX12-NEXT: scratch_load_i16 v0, v0, off offset:2
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: scratch_load_i16 v0, v0, s0 offset:2
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
@@ -1046,9 +1046,8 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_svs(ptr addrspace(5)
;
; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_lo_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX12-NEXT: scratch_load_d16_u8 v3, v0, off offset:1
; GFX12-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_d16_u8 v3, v0, s0 offset:1
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
@@ -1090,9 +1089,8 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_svs(ptr addrspace(5)
;
; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_lo_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX12-NEXT: scratch_load_d16_i8 v3, v0, off offset:1
; GFX12-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_d16_i8 v3, v0, s0 offset:1
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
@@ -1134,9 +1132,8 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_svs(ptr addrspace(5) inre
;
; GFX12-LABEL: test_scratch_load_i16_to_d16_lo_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX12-NEXT: scratch_load_d16_b16 v3, v0, off offset:2
; GFX12-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_d16_b16 v3, v0, s0 offset:2
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
@@ -1178,9 +1175,8 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_svs(ptr addrspace(5)
;
; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_hi_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX12-NEXT: v_mov_b32_e32 v3, -1
; GFX12-NEXT: scratch_load_d16_hi_u8 v3, v0, off offset:1
; GFX12-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_d16_hi_u8 v3, v0, s0 offset:1
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
@@ -1222,9 +1218,8 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_svs(ptr addrspace(5)
;
; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_hi_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX12-NEXT: v_mov_b32_e32 v3, -1
; GFX12-NEXT: scratch_load_d16_hi_i8 v3, v0, off offset:1
; GFX12-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_d16_hi_i8 v3, v0, s0 offset:1
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
@@ -1266,9 +1261,8 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_svs(ptr addrspace(5) inre
;
; GFX12-LABEL: test_scratch_load_i16_to_d16_hi_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX12-NEXT: v_mov_b32_e32 v3, -1
; GFX12-NEXT: scratch_load_d16_hi_b16 v3, v0, off offset:2
; GFX12-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_d16_hi_b16 v3, v0, s0 offset:2
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
@@ -1309,9 +1303,9 @@ define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_svs(ptr %in, ptr addrsp
; GFX12-LABEL: test_scratch_store_b8_from_d16_hi_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b32 v0, v[0:1]
; GFX12-NEXT: v_lshl_add_u32 v1, v2, 2, s0
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v2
; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX12-NEXT: scratch_store_d16_hi_b8 v1, v0, off offset:4
; GFX12-NEXT: scratch_store_d16_hi_b8 v1, v0, s0 offset:4
; GFX12-NEXT: s_endpgm
bb:
%load = load <4 x i8>, ptr %in
@@ -1350,9 +1344,9 @@ define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_svs(ptr %in, ptr addrs
; GFX12-LABEL: test_scratch_store_b16_from_d16_hi_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b32 v0, v[0:1]
; GFX12-NEXT: v_lshl_add_u32 v1, v2, 2, s0
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v2
; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX12-NEXT: scratch_store_d16_hi_b16 v1, v0, off offset:2
; GFX12-NEXT: scratch_store_d16_hi_b16 v1, v0, s0 offset:2
; GFX12-NEXT: s_endpgm
bb:
%load = load <2 x i16>, ptr %in
