diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index ead3f51d6acdc5..fb8d685409e429 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1152,13 +1152,64 @@ bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0, return CurDAG->SignBitIsZero(Base); } -bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Base, - uint64_t FlatVariant) const { - if (FlatVariant != SIInstrFlags::FlatScratch) +// Return whether the operation has NoUnsignedWrap property. +static bool isNoUnsignedWrap(SDValue Addr) { + return (Addr.getOpcode() == ISD::ADD && + Addr->getFlags().hasNoUnsignedWrap()) || + Addr->getOpcode() == ISD::OR; +} + +// Check that the base address of flat scratch load/store in the form of `base + +// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware +// requirement). We always treat the first operand as the base address here. +bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const { + if (isNoUnsignedWrap(Addr)) return true; - // When value in 32-bit Base can be negative calculate scratch offset using - // 32-bit add instruction, otherwise use Base(unsigned) + offset. - return CurDAG->SignBitIsZero(Base); + + auto LHS = Addr.getOperand(0); + auto RHS = Addr.getOperand(1); + + // If the immediate offset is negative and within certain range, the base + // address cannot also be negative. If the base is also negative, the sum + // would be either negative or much larger than the valid range of scratch + // memory a thread can access. + ConstantSDNode *ImmOp = nullptr; + if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) { + if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000) + return true; + } + + return CurDAG->SignBitIsZero(LHS); +} + +// Check address value in SGPR/VGPR are legal for flat scratch in the form +// of: SGPR + VGPR. 
+bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const { + if (isNoUnsignedWrap(Addr)) + return true; + + auto LHS = Addr.getOperand(0); + auto RHS = Addr.getOperand(1); + return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS); +} + +// Check address value in SGPR/VGPR are legal for flat scratch in the form +// of: SGPR + VGPR + Imm. +bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const { + auto Base = Addr.getOperand(0); + auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1)); + // If the immediate offset is negative and within certain range, the base + // address cannot also be negative. If the base is also negative, the sum + // would be either negative or much larger than the valid range of scratch + // memory a thread can access. + if (isNoUnsignedWrap(Base) && + (isNoUnsignedWrap(Addr) || + (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000))) + return true; + + auto LHS = Base.getOperand(0); + auto RHS = Base.getOperand(1); + return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS); } // TODO: If offset is too big, put low 16-bit into offset. 
@@ -1555,7 +1606,8 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr, if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) { SDValue N0, N1; if (isBaseWithConstantOffset64(Addr, N0, N1) && - isFlatScratchBaseLegal(N0, FlatVariant)) { + (FlatVariant != SIInstrFlags::FlatScratch || + isFlatScratchBaseLegal(Addr))) { int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); const SIInstrInfo *TII = Subtarget->getInstrInfo(); @@ -1787,8 +1839,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr, int64_t COffsetVal = 0; - if (CurDAG->isBaseWithConstantOffset(Addr) && - isFlatScratchBaseLegal(Addr.getOperand(0))) { + if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) { COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue(); SAddr = Addr.getOperand(0); } else { @@ -1845,6 +1896,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, int64_t ImmOffset = 0; SDValue LHS, RHS; + SDValue OrigAddr = Addr; if (isBaseWithConstantOffset64(Addr, LHS, RHS)) { int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue(); const SIInstrInfo *TII = Subtarget->getInstrInfo(); @@ -1866,7 +1918,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32)); VAddr = SDValue(VMov, 0); SAddr = LHS; - if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr)) + if (!isFlatScratchBaseLegal(Addr)) return false; if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset)) return false; @@ -1892,8 +1944,13 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, return false; } - if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr)) - return false; + if (OrigAddr != Addr) { + if (!isFlatScratchBaseLegalSVImm(OrigAddr)) + return false; + } else { + if (!isFlatScratchBaseLegalSV(OrigAddr)) + return false; + } if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset)) return false; diff --git 
a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index a8a606f60a3fae..618c5e02c09406 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -154,8 +154,10 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { bool isDSOffsetLegal(SDValue Base, unsigned Offset) const; bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1, unsigned Size) const; - bool isFlatScratchBaseLegal( - SDValue Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const; + + bool isFlatScratchBaseLegal(SDValue Addr) const; + bool isFlatScratchBaseLegalSV(SDValue Addr) const; + bool isFlatScratchBaseLegalSVImm(SDValue Addr) const; bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index b772efe04c7141..fa91dc40931763 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4103,7 +4103,9 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root, int64_t ConstOffset; std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Root.getReg(), *MRI); - if (ConstOffset == 0 || !isFlatScratchBaseLegal(PtrBase, FlatVariant)) + + if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch && + !isFlatScratchBaseLegal(Root.getReg()))) return Default; unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace(); @@ -4266,7 +4268,7 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { // possible. 
std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); - if (ConstOffset != 0 && isFlatScratchBaseLegal(PtrBase) && + if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) && TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch)) { Addr = PtrBase; @@ -4343,6 +4345,7 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const { // possible. std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); + Register OrigAddr = Addr; if (ConstOffset != 0 && TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) { Addr = PtrBase; @@ -4360,8 +4363,13 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const { Register LHS = AddrDef->MI->getOperand(1).getReg(); auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI); - if (!isFlatScratchBaseLegal(LHS) || !isFlatScratchBaseLegal(RHS)) - return std::nullopt; + if (OrigAddr != Addr) { + if (!isFlatScratchBaseLegalSVImm(OrigAddr)) + return std::nullopt; + } else { + if (!isFlatScratchBaseLegalSV(OrigAddr)) + return std::nullopt; + } if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset)) return std::nullopt; @@ -4494,14 +4502,78 @@ bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0, return KB->signBitIsZero(Base); } -bool AMDGPUInstructionSelector::isFlatScratchBaseLegal( - Register Base, uint64_t FlatVariant) const { - if (FlatVariant != SIInstrFlags::FlatScratch) +// Return whether the operation has NoUnsignedWrap property. +bool isNoUnsignedWrap(MachineInstr *Addr) { + return Addr->getOpcode() == TargetOpcode::G_OR || + (Addr->getOpcode() == TargetOpcode::G_PTR_ADD && + Addr->getFlag(MachineInstr::NoUWrap)); +}; + +// Check that the base address of flat scratch load/store in the form of `base + +// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware +// requirement). We always treat the first operand as the base address here. 
+bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const { + MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI); + + if (isNoUnsignedWrap(AddrMI)) return true; - // When value in 32-bit Base can be negative calculate scratch offset using - // 32-bit add instruction, otherwise use Base(unsigned) + offset. - return KB->signBitIsZero(Base); + Register LHS = AddrMI->getOperand(1).getReg(); + Register RHS = AddrMI->getOperand(2).getReg(); + + if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) { + std::optional<ValueAndVReg> RhsValReg = + getIConstantVRegValWithLookThrough(RHS, *MRI); + // If the immediate offset is negative and within certain range, the base + // address cannot also be negative. If the base is also negative, the sum + // would be either negative or much larger than the valid range of scratch + // memory a thread can access. + if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 && + RhsValReg->Value.getSExtValue() > -0x40000000) + return true; + } + + return KB->signBitIsZero(LHS); +} + +// Check address value in SGPR/VGPR are legal for flat scratch in the form +// of: SGPR + VGPR. +bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const { + MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI); + + if (isNoUnsignedWrap(AddrMI)) + return true; + + Register LHS = AddrMI->getOperand(1).getReg(); + Register RHS = AddrMI->getOperand(2).getReg(); + return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS); +} + +// Check address value in SGPR/VGPR are legal for flat scratch in the form +// of: SGPR + VGPR + Imm. 
+bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm( + Register Addr) const { + MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI); + Register Base = AddrMI->getOperand(1).getReg(); + std::optional<DefinitionAndSourceRegister> BaseDef = + getDefSrcRegIgnoringCopies(Base, *MRI); + std::optional<ValueAndVReg> RHSOffset = + getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI); + assert(RHSOffset); + + // If the immediate offset is negative and within certain range, the base + // address cannot also be negative. If the base is also negative, the sum + // would be either negative or much larger than the valid range of scratch + // memory a thread can access. + if (isNoUnsignedWrap(BaseDef->MI) && + (isNoUnsignedWrap(AddrMI) || + (RHSOffset->Value.getSExtValue() < 0 && + RHSOffset->Value.getSExtValue() > -0x40000000))) + return true; + + Register LHS = BaseDef->MI->getOperand(1).getReg(); + Register RHS = BaseDef->MI->getOperand(2).getReg(); + return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS); } bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 93e45fcd8682f0..c93e3de66d4055 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -243,8 +243,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector { bool isDSOffsetLegal(Register Base, int64_t Offset) const; bool isDSOffset2Legal(Register Base, int64_t Offset0, int64_t Offset1, unsigned Size) const; - bool isFlatScratchBaseLegal( - Register Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const; + bool isFlatScratchBaseLegal(Register Addr) const; + bool isFlatScratchBaseLegalSV(Register Addr) const; + bool isFlatScratchBaseLegalSVImm(Register Addr) const; std::pair<Register, unsigned> selectDS1Addr1OffsetImpl(MachineOperand &Root) const; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index 45df3bc094f351..ec2cd43e5fb5df 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -89,17 +89,15 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: v_add_u32_e32 v1, 4, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 -; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 4 -; GFX9-NEXT: scratch_store_dword v1, v3, off -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c -; GFX9-NEXT: v_add3_u32 v0, v2, v0, v1 -; GFX9-NEXT: scratch_load_dword v0, v0, off glc +; GFX9-NEXT: scratch_store_dword v1, v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; @@ -111,42 +109,39 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0x7c -; GFX10-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0 -; GFX10-NEXT: v_add3_u32 v1, 4, v1, v2 -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v1 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc +; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_vindex_kernel: ; GFX940: 
; %bb.0: ; %bb ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX940-NEXT: v_mov_b32_e32 v3, 15 ; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 4 -; GFX940-NEXT: scratch_store_dword v1, v3, off offset:4 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 15 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: v_mov_b32_e32 v1, 0x7c -; GFX940-NEXT: v_add3_u32 v0, v2, v0, v1 -; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1 +; GFX940-NEXT: scratch_store_dword v1, v2, off offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vindex_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 -; GFX11-NEXT: v_dual_mov_b32 v3, 15 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v2, 0x7c :: v_dual_lshlrev_b32 v1, 2, v1 -; GFX11-NEXT: scratch_store_b32 v0, v3, off offset:4 dlc +; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 +; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_add3_u32 v1, 4, v1, v2 -; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc +; GFX11-NEXT: v_add_nc_u32_e32 v1, 4, v1 +; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm bb: @@ -233,34 +228,31 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) { ; GFX9-LABEL: private_ptr_foo: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000 -; GFX9-NEXT: scratch_store_dword v0, v1, off +; GFX9-NEXT: scratch_store_dword v0, v1, off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 
s[30:31] ; ; GFX10-LABEL: private_ptr_foo: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x41200000 -; GFX10-NEXT: scratch_store_dword v0, v1, off +; GFX10-NEXT: scratch_store_dword v0, v1, off offset:4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: private_ptr_foo: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-NEXT: v_mov_b32_e32 v1, 0x41200000 -; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1 +; GFX940-NEXT: scratch_store_dword v0, v1, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: private_ptr_foo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0x41200000 :: v_dual_add_nc_u32 v0, 4, v0 -; GFX11-NEXT: scratch_store_b32 v0, v1, off +; GFX11-NEXT: v_mov_b32_e32 v1, 0x41200000 +; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4 ; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds float, ptr addrspace(5) %arg, i32 1 store float 1.000000e+01, ptr addrspace(5) %gep, align 4 @@ -366,16 +358,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX9-NEXT: scratch_load_dword v1, off, s0 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x104 -; GFX9-NEXT: scratch_store_dword v1, v3, off -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c -; GFX9-NEXT: v_add3_u32 v0, v2, v0, v1 -; GFX9-NEXT: scratch_load_dword v0, v0, off glc +; GFX9-NEXT: scratch_store_dword v1, v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: 
v_add_u32_e32 v0, 0x104, v0 +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; @@ -387,16 +377,15 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0x7c -; GFX10-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x104, v0 -; GFX10-NEXT: v_add3_u32 v1, 0x104, v1, v2 -; GFX10-NEXT: scratch_load_dword v2, off, off offset:4 glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x104, v1 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc +; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; @@ -405,30 +394,27 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX940-NEXT: v_mov_b32_e32 v3, 15 ; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 0x104 -; GFX940-NEXT: scratch_store_dword v1, v3, off offset:260 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 15 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: v_mov_b32_e32 v1, 0x7c -; GFX940-NEXT: v_add3_u32 v0, v2, v0, v1 -; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1 +; GFX940-NEXT: scratch_store_dword v1, v2, off offset:260 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_u32_e32 v0, 0x104, v0 +; GFX940-NEXT: 
scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 -; GFX11-NEXT: v_dual_mov_b32 v3, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0x7c :: v_dual_lshlrev_b32 v1, 2, v1 -; GFX11-NEXT: v_add3_u32 v1, 0x104, v1, v2 -; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 glc dlc +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b32 v0, v3, off offset:260 dlc +; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 +; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:260 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x104, v1 +; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm bb: @@ -631,16 +617,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX9-NEXT: scratch_load_dword v1, off, s0 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x4004, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x4004 -; GFX9-NEXT: scratch_store_dword v1, v3, off -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 0x4004, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c -; GFX9-NEXT: v_add3_u32 v0, v2, v0, v1 -; GFX9-NEXT: scratch_load_dword v0, v0, off glc +; GFX9-NEXT: scratch_store_dword v1, v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, 0x4004, v0 +; GFX9-NEXT: 
scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; @@ -652,16 +636,15 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0x7c -; GFX10-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x4004, v0 -; GFX10-NEXT: v_add3_u32 v1, 0x4004, v1, v2 -; GFX10-NEXT: scratch_load_dword v2, off, off offset:4 glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x4004, v1 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc +; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; @@ -670,32 +653,29 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX940-NEXT: v_mov_b32_e32 v3, 15 -; GFX940-NEXT: s_movk_i32 s0, 0x4004 ; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 0x4004 -; GFX940-NEXT: scratch_store_dword v1, v3, s0 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 15 +; GFX940-NEXT: s_movk_i32 s0, 0x4004 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: v_mov_b32_e32 v1, 0x7c -; GFX940-NEXT: v_add3_u32 v0, v2, v0, v1 -; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1 +; GFX940-NEXT: scratch_store_dword v1, v2, s0 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_u32_e32 v0, 0x4004, v0 +; 
GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vindex_large_offset_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 -; GFX11-NEXT: v_dual_mov_b32 v3, 15 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_movk_i32 s0, 0x4004 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0x7c :: v_dual_lshlrev_b32 v1, 2, v1 -; GFX11-NEXT: v_add3_u32 v1, 0x4004, v1, v2 -; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 glc dlc +; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b32 v0, v3, s0 dlc +; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 +; GFX11-NEXT: scratch_store_b32 v0, v2, s0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x4004, v1 +; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm bb: @@ -945,16 +925,14 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 -; GFX9-NEXT: v_mov_b32_e32 v1, 4 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x400 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_lshl_u32 v0, s0, v0, 2 -; GFX9-NEXT: v_add3_u32 v0, v1, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, 15 -; GFX9-NEXT: scratch_store_dword v0, v1, off +; GFX9-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: scratch_load_dword v0, v0, off glc +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) 
; GFX9-NEXT: s_endpgm ; @@ -965,44 +943,40 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_lshl_u32 v0, s0, v0, 2 -; GFX10-NEXT: v_add3_u32 v0, 4, v0, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-NEXT: scratch_store_dword v0, v1, off +; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: scratch_load_dword v0, v0, off glc dlc +; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_vidx_sidx_offset: ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-NEXT: v_mov_b32_e32 v2, 0x400 +; GFX940-NEXT: v_mov_b32_e32 v1, 15 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_add_lshl_u32 v0, s0, v0, 2 -; GFX940-NEXT: v_add3_u32 v0, v1, v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v1, 15 -; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1 +; GFX940-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX940-NEXT: scratch_store_dword v0, v1, off offset:1024 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1 +; GFX940-NEXT: scratch_load_dword v0, v0, off offset:1024 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vidx_sidx_offset: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0x400 +; GFX11-NEXT: v_mov_b32_e32 v1, 15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_lshl_u32 v0, s0, v0, 2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v0, 4, v0, v1 -; GFX11-NEXT: 
v_mov_b32_e32 v1, 15 -; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc +; GFX11-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:1024 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v0, off glc dlc +; GFX11-NEXT: scratch_load_b32 v0, v0, off offset:1024 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir index ebd5e2e085632c..ea320cec9991d5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir @@ -420,9 +420,7 @@ body: | ; GFX11: liveins: $vgpr0 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2047, implicit $exec - ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -2047 @@ -468,9 +466,7 @@ body: | ; GFX11: liveins: $vgpr0 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2048, implicit $exec - ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX11-NEXT: 
[[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -2048 @@ -610,9 +606,7 @@ body: | ; GFX11: liveins: $vgpr0 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4095, implicit $exec - ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -4095, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -4095 @@ -658,9 +652,7 @@ body: | ; GFX11: liveins: $vgpr0 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4096, implicit $exec - ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -4096, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -4096 diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll index af023835c52977..329f0a2068cb07 100644 --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ 
b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -704,11 +704,11 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) { ; FLATSCR: ; %bb.0: ; %bb ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FLATSCR-NEXT: scratch_load_short_d16_hi v1, v0, off -; FLATSCR-NEXT: v_add_u32_e32 v2, 2, v0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: v_pk_sub_u16 v0, v1, -12 op_sel_hi:[1,0] -; FLATSCR-NEXT: scratch_load_short_d16 v0, v2, off +; FLATSCR-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; FLATSCR-NEXT: scratch_load_short_d16 v1, v0, off offset:2 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v0, v1 ; FLATSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private_other_dep: @@ -726,22 +726,22 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) { ; FLATSCR_GFX10: ; %bb.0: ; %bb ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v1, v0, off -; FLATSCR_GFX10-NEXT: v_add_nc_u32_e32 v2, 2, v0 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) -; FLATSCR_GFX10-NEXT: v_pk_sub_u16 v0, v1, -12 op_sel_hi:[1,0] -; FLATSCR_GFX10-NEXT: scratch_load_short_d16 v0, v2, off +; FLATSCR_GFX10-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; FLATSCR_GFX10-NEXT: scratch_load_short_d16 v1, v0, off offset:2 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) +; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1 ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: chain_hi_to_lo_private_other_dep: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_load_d16_hi_b16 v1, v0, off -; GFX11-NEXT: v_add_nc_u32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_sub_u16 v0, v1, -12 op_sel_hi:[1,0] -; GFX11-NEXT: scratch_load_d16_b16 v0, v2, off +; GFX11-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; GFX11-NEXT: scratch_load_d16_b16 v1, v0, off offset:2 ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: 
v_mov_b32_e32 v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds i16, ptr addrspace(5) %ptr, i64 1 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll index c849cf08094e71..ad4d4a4a30fc6d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll @@ -11,16 +11,14 @@ define amdgpu_ps void @test_scratch_load_i8_zext_v(ptr addrspace(5) %in, ptr %ou ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX10-NEXT: scratch_load_ubyte v0, v0, off +; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_scratch_load_i8_zext_v: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-NEXT: scratch_load_u8 v0, v0, off +; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-NEXT: s_endpgm @@ -38,16 +36,14 @@ define amdgpu_ps void @test_scratch_load_i8_sext_v(ptr addrspace(5) %in, ptr %ou ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX10-NEXT: scratch_load_sbyte v0, v0, off +; GFX10-NEXT: scratch_load_sbyte v0, v0, off offset:1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_scratch_load_i8_sext_v: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-NEXT: scratch_load_i8 v0, v0, off +; GFX11-NEXT: scratch_load_i8 v0, v0, off offset:1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-NEXT: s_endpgm @@ -65,16 +61,14 @@ define 
amdgpu_ps void @test_scratch_load_i16_zext_v(ptr addrspace(5) %in, ptr %o ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 -; GFX10-NEXT: scratch_load_ushort v0, v0, off +; GFX10-NEXT: scratch_load_ushort v0, v0, off offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_scratch_load_i16_zext_v: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 -; GFX11-NEXT: scratch_load_u16 v0, v0, off +; GFX11-NEXT: scratch_load_u16 v0, v0, off offset:2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-NEXT: s_endpgm @@ -92,16 +86,14 @@ define amdgpu_ps void @test_scratch_load_i16_sext_v(ptr addrspace(5) %in, ptr %o ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 -; GFX10-NEXT: scratch_load_sshort v0, v0, off +; GFX10-NEXT: scratch_load_sshort v0, v0, off offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_scratch_load_i16_sext_v: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 -; GFX11-NEXT: scratch_load_i16 v0, v0, off +; GFX11-NEXT: scratch_load_i16 v0, v0, off offset:2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-NEXT: s_endpgm @@ -359,16 +351,14 @@ define amdgpu_ps void @test_scratch_load_i8_zext_s(ptr addrspace(5) inreg %in, p ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: s_add_i32 s2, s2, 1 -; GFX10-NEXT: scratch_load_ubyte v2, off, s2 +; GFX10-NEXT: scratch_load_ubyte v2, off, s2 offset:1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: 
flat_store_dword v[0:1], v2 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_scratch_load_i8_zext_s: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_add_i32 s0, s0, 1 -; GFX11-NEXT: scratch_load_u8 v2, off, s0 +; GFX11-NEXT: scratch_load_u8 v2, off, s0 offset:1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm @@ -386,16 +376,14 @@ define amdgpu_ps void @test_scratch_load_i8_sext_s(ptr addrspace(5) inreg %in, p ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: s_add_i32 s2, s2, 1 -; GFX10-NEXT: scratch_load_sbyte v2, off, s2 +; GFX10-NEXT: scratch_load_sbyte v2, off, s2 offset:1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: flat_store_dword v[0:1], v2 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_scratch_load_i8_sext_s: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_add_i32 s0, s0, 1 -; GFX11-NEXT: scratch_load_i8 v2, off, s0 +; GFX11-NEXT: scratch_load_i8 v2, off, s0 offset:1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm @@ -413,16 +401,14 @@ define amdgpu_ps void @test_scratch_load_i16_zext_s(ptr addrspace(5) inreg %in, ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: s_add_i32 s2, s2, 2 -; GFX10-NEXT: scratch_load_ushort v2, off, s2 +; GFX10-NEXT: scratch_load_ushort v2, off, s2 offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: flat_store_dword v[0:1], v2 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_scratch_load_i16_zext_s: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_add_i32 s0, s0, 2 -; GFX11-NEXT: scratch_load_u16 v2, off, s0 +; GFX11-NEXT: scratch_load_u16 v2, off, s0 offset:2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm @@ -440,16 +426,14 @@ define amdgpu_ps void @test_scratch_load_i16_sext_s(ptr addrspace(5) inreg %in, 
; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: s_add_i32 s2, s2, 2 -; GFX10-NEXT: scratch_load_sshort v2, off, s2 +; GFX10-NEXT: scratch_load_sshort v2, off, s2 offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: flat_store_dword v[0:1], v2 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_scratch_load_i16_sext_s: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_add_i32 s0, s0, 2 -; GFX11-NEXT: scratch_load_i16 v2, off, s0 +; GFX11-NEXT: scratch_load_i16 v2, off, s0 offset:2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm @@ -713,19 +697,16 @@ define amdgpu_ps void @test_scratch_load_i8_zext_svs(ptr addrspace(5) inreg %in, ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_add3_u32 v0, s2, v0, 1 -; GFX10-NEXT: scratch_load_ubyte v0, v0, off +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_scratch_load_i8_zext_svs: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1 -; GFX11-NEXT: scratch_load_u8 v0, v0, off +; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-NEXT: s_endpgm @@ -745,19 +726,16 @@ define amdgpu_ps void @test_scratch_load_i8_sext_svs(ptr addrspace(5) inreg %in, ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: 
v_add3_u32 v0, s2, v0, 1 -; GFX10-NEXT: scratch_load_sbyte v0, v0, off +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX10-NEXT: scratch_load_sbyte v0, v0, off offset:1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_scratch_load_i8_sext_svs: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1 -; GFX11-NEXT: scratch_load_i8 v0, v0, off +; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX11-NEXT: scratch_load_i8 v0, v0, off offset:1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-NEXT: s_endpgm @@ -777,19 +755,16 @@ define amdgpu_ps void @test_scratch_load_i16_zext_svs(ptr addrspace(5) inreg %in ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_add3_u32 v0, s2, v0, 2 -; GFX10-NEXT: scratch_load_ushort v0, v0, off +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX10-NEXT: scratch_load_ushort v0, v0, off offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_scratch_load_i16_zext_svs: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v0, s0, v0, 2 -; GFX11-NEXT: scratch_load_u16 v0, v0, off +; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX11-NEXT: scratch_load_u16 v0, v0, off offset:2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-NEXT: s_endpgm @@ -809,19 +784,16 @@ define amdgpu_ps void @test_scratch_load_i16_sext_svs(ptr addrspace(5) inreg %in ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: v_lshlrev_b32_e32 
v0, 2, v0 -; GFX10-NEXT: v_add3_u32 v0, s2, v0, 2 -; GFX10-NEXT: scratch_load_sshort v0, v0, off +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX10-NEXT: scratch_load_sshort v0, v0, off offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_scratch_load_i16_sext_svs: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v0, s0, v0, 2 -; GFX11-NEXT: scratch_load_i16 v0, v0, off +; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX11-NEXT: scratch_load_i16 v0, v0, off offset:2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll index 2e01c0064ef7c1..8d28c7845167a2 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -401,12 +401,11 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 ; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v1 -; GFX940-SDAG-NEXT: v_or_b32_e32 v1, 1, v0 -; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 -; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, off offset:1 sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -446,14 +445,13 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 
4, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_or_b32_e32 v4, 1, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 -; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v5, v2, off dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v4, v2, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v5, v3, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -503,12 +501,11 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 ; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX940-SDAG-NEXT: v_or_b32_e32 v1, 1, v0 -; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 -; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, off offset:1 sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -548,14 +545,13 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_or_b32_e32 v4, 1, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, 
v0 -; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v5, v2, off dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v4, v2, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v5, v3, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -702,12 +698,11 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 ; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v1 -; GFX940-SDAG-NEXT: v_or_b32_e32 v1, 1, v0 -; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 -; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, off offset:1 sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -747,14 +742,13 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_or_b32_e32 v4, 1, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 -; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, 
off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v5, v2, off dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v4, v2, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v5, v3, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -800,16 +794,14 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v3, 2 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 ; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX940-SDAG-NEXT: v_or_b32_e32 v1, 1, v0 -; GFX940-SDAG-NEXT: v_or_b32_e32 v3, 2, v0 -; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, off offset:1 sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v3, off offset:2 sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -843,20 +835,18 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_or_b32_e32 v4, 1, v0 -; 
GFX11-SDAG-NEXT: v_or_b32_e32 v5, 2, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 -; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v3, 4, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v5, v2, off dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v3, v4, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index 07b3df2a8520aa..f1a2fe88677533 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -576,11 +576,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: v_add_u32_e32 v1, 4, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 15 -; GFX9-NEXT: v_sub_u32_e32 v0, 4, v0 ; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, 0x7c, v0 -; GFX9-NEXT: scratch_load_dword v0, v0, off glc +; GFX9-NEXT: v_sub_u32_e32 v0, 4, v0 +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; @@ -592,24 +591,22 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, 4, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x7c, v1 -; GFX10-NEXT: scratch_store_dword v0, v2, off +; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 4, v0 +; GFX10-NEXT: scratch_store_dword v1, v2, off ; GFX10-NEXT: s_waitcnt_vscnt 
null, 0x0 -; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc +; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vindex_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_nc_u32_e32 v1, 4, v0 -; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1 -; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:4 dlc +; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 4, v0 +; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc +; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; @@ -628,8 +625,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_add_u32_e32 v0, 0x7c, v0 -; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc +; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; @@ -640,8 +636,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX940-NEXT: scratch_store_dword v0, v1, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_sub_u32_e32 v0, 4, v0 -; GFX940-NEXT: v_add_u32_e32 v0, 0x7c, v0 -; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1 +; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm ; @@ -658,24 +653,22 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; 
GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 -; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v1, 4, v0 -; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, 4, v0 -; GFX10-PAL-NEXT: v_add_nc_u32_e32 v1, 0x7c, v1 -; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off +; GFX10-PAL-NEXT: v_add_nc_u32_e32 v1, 4, v0 +; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, 4, v0 +; GFX10-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc +; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_vindex_kernel: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v1, 4, v0 -; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1 -; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, off offset:4 dlc +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 4, v0 +; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, off glc dlc +; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_endpgm bb: @@ -798,58 +791,53 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) { ; GFX9-LABEL: private_ptr_foo: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000 -; GFX9-NEXT: scratch_store_dword v0, v1, off +; GFX9-NEXT: scratch_store_dword v0, v1, off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: private_ptr_foo: ; GFX10: ; 
%bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x41200000 -; GFX10-NEXT: scratch_store_dword v0, v1, off +; GFX10-NEXT: scratch_store_dword v0, v1, off offset:4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: private_ptr_foo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0x41200000 :: v_dual_add_nc_u32 v0, 4, v0 -; GFX11-NEXT: scratch_store_b32 v0, v1, off +; GFX11-NEXT: v_mov_b32_e32 v1, 0x41200000 +; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: private_ptr_foo: ; GFX9-PAL: ; %bb.0: ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-PAL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000 -; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off +; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:4 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: private_ptr_foo: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-NEXT: v_mov_b32_e32 v1, 0x41200000 -; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1 +; GFX940-NEXT: scratch_store_dword v0, v1, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-PAL-LABEL: private_ptr_foo: ; GFX10-PAL: ; %bb.0: ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000 -; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off +; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:4 ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-PAL-LABEL: private_ptr_foo: ; GFX11-PAL: ; %bb.0: ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 0x41200000 :: v_dual_add_nc_u32 v0, 4, v0 -; GFX11-PAL-NEXT: 
scratch_store_b32 v0, v1, off +; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000 +; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds float, ptr addrspace(5) %arg, i32 1 store float 1.000000e+01, ptr addrspace(5) %gep, align 4 @@ -1554,11 +1542,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 15 -; GFX9-NEXT: v_sub_u32_e32 v0, 0x104, v0 ; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, 0x7c, v0 -; GFX9-NEXT: scratch_load_dword v0, v0, off glc +; GFX9-NEXT: v_sub_u32_e32 v0, 0x104, v0 +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; @@ -1572,26 +1559,23 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX10-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0x104, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x104, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x7c, v1 -; GFX10-NEXT: scratch_store_dword v0, v2, off +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x104, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0 +; GFX10-NEXT: scratch_store_dword v1, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc +; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0x104, v0 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1 -; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:260 dlc +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0 +; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:260 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc +; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; @@ -1610,11 +1594,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX9-PAL-NEXT: scratch_load_dword v1, off, s0 offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x104, v0 -; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x104, v0 ; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_add_u32_e32 v0, 0x7c, v0 -; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc +; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x104, v0 +; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; @@ -1627,8 +1610,7 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX940-NEXT: scratch_store_dword v0, v1, off offset:260 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_sub_u32_e32 v0, 0x104, v0 -; GFX940-NEXT: v_add_u32_e32 v0, 0x7c, v0 -; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1 +; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm ; @@ -1648,12 +1630,11 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1010-PAL-NEXT: scratch_load_dword v3, off, s0 offset:4 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v1, 0x104, v0 -; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v0, 0x104, v0 -; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x7c, v1 -; GFX1010-PAL-NEXT: scratch_store_dword 
v0, v2, off +; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x104, v0 +; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0 +; GFX1010-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1010-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc +; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1010-PAL-NEXT: s_endpgm ; @@ -1672,26 +1653,23 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v1, 0x104, v0 -; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v0, 0x104, v0 -; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x7c, v1 -; GFX1030-PAL-NEXT: scratch_store_dword v0, v2, off +; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x104, v0 +; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0 +; GFX1030-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1030-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc +; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v1, 0x104, v0 -; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1 -; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, off offset:260 dlc +; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0 +; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:260 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: 
scratch_load_b32 v0, v1, off glc dlc +; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_endpgm bb: @@ -2595,11 +2573,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x4004, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 15 -; GFX9-NEXT: v_sub_u32_e32 v0, 0x4004, v0 ; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, 0x7c, v0 -; GFX9-NEXT: scratch_load_dword v0, v0, off glc +; GFX9-NEXT: v_sub_u32_e32 v0, 0x4004, v0 +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; @@ -2613,27 +2590,24 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX10-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0x4004, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x4004, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x7c, v1 -; GFX10-NEXT: scratch_store_dword v0, v2, off +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0 +; GFX10-NEXT: scratch_store_dword v1, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc +; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vindex_large_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_movk_i32 s0, 0x4004 ; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0x4004, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 
0x7c, v1 -; GFX11-NEXT: scratch_store_b32 v0, v2, s0 dlc +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0 +; GFX11-NEXT: scratch_store_b32 v0, v1, s0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc +; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; @@ -2652,11 +2626,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX9-PAL-NEXT: scratch_load_dword v1, off, s0 offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x4004, v0 -; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x4004, v0 ; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_add_u32_e32 v0, 0x7c, v0 -; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc +; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x4004, v0 +; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; @@ -2670,8 +2643,7 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX940-NEXT: scratch_store_dword v0, v1, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_sub_u32_e32 v0, 0x4004, v0 -; GFX940-NEXT: v_add_u32_e32 v0, 0x7c, v0 -; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1 +; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm ; @@ -2691,12 +2663,11 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1010-PAL-NEXT: scratch_load_dword v3, off, s0 offset:4 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v1, 0x4004, v0 -; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v0, 0x4004, v0 -; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x7c, v1 -; GFX1010-PAL-NEXT: scratch_store_dword v0, v2, off +; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 +; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, 
0x4004, v0 +; GFX1010-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1010-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc +; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1010-PAL-NEXT: s_endpgm ; @@ -2715,27 +2686,24 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v1, 0x4004, v0 -; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v0, 0x4004, v0 -; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x7c, v1 -; GFX1030-PAL-NEXT: scratch_store_dword v0, v2, off +; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 +; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0 +; GFX1030-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1030-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc +; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_vindex_large_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-PAL-NEXT: s_movk_i32 s0, 0x4004 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v1, 0x4004, v0 -; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1 -; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, s0 dlc +; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0 +; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, s0 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, off glc dlc +; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off 
offset:124 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_endpgm bb: @@ -2922,15 +2890,13 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; ; GFX11-LABEL: store_load_large_imm_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 -; GFX11-NEXT: s_movk_i32 s0, 0x3000 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_add_i32 s0, s0, 4 +; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 +; GFX11-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b32 off, v1, s0 offset:3712 dlc +; GFX11-NEXT: scratch_store_b32 v1, v2, off offset:3716 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, off, s0 offset:3712 glc dlc +; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:3716 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; @@ -2959,14 +2925,13 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX940-LABEL: store_load_large_imm_offset_kernel: ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: v_mov_b32_e32 v0, 13 -; GFX940-NEXT: s_movk_i32 s0, 0x3000 ; GFX940-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_add_i32 s0, s0, 4 -; GFX940-NEXT: v_mov_b32_e32 v0, 15 -; GFX940-NEXT: scratch_store_dword off, v0, s0 offset:3712 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, 0x3000 +; GFX940-NEXT: v_mov_b32_e32 v1, 15 +; GFX940-NEXT: scratch_store_dword v0, v1, off offset:3716 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_dword v0, off, s0 offset:3712 sc0 sc1 +; GFX940-NEXT: scratch_load_dword v0, v0, off offset:3716 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm ; @@ -3019,15 +2984,13 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; ; GFX11-PAL-LABEL: store_load_large_imm_offset_kernel: ; GFX11-PAL: 
; %bb.0: ; %bb -; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 -; GFX11-PAL-NEXT: s_movk_i32 s0, 0x3000 -; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-PAL-NEXT: s_add_i32 s0, s0, 4 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 +; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, off offset:4 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: scratch_store_b32 off, v1, s0 offset:3712 dlc +; GFX11-PAL-NEXT: scratch_store_b32 v1, v2, off offset:3716 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s0 offset:3712 glc dlc +; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, off offset:3716 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_endpgm bb: @@ -3077,16 +3040,13 @@ define void @store_load_large_imm_offset_foo() { ; GFX11-LABEL: store_load_large_imm_offset_foo: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 -; GFX11-NEXT: s_movk_i32 s0, 0x3000 -; GFX11-NEXT: s_add_i32 s1, s32, 4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 +; GFX11-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b32 off, v1, s0 offset:3712 dlc +; GFX11-NEXT: scratch_store_b32 v1, v2, s32 offset:3716 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, off, s0 offset:3712 glc dlc +; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:3716 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3110,15 +3070,13 @@ define void @store_load_large_imm_offset_foo() { ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, 13 -; GFX940-NEXT: 
s_movk_i32 s0, 0x3000 -; GFX940-NEXT: s_add_i32 s1, s32, 4 ; GFX940-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_add_i32 s0, s0, s1 -; GFX940-NEXT: v_mov_b32_e32 v0, 15 -; GFX940-NEXT: scratch_store_dword off, v0, s0 offset:3712 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, 0x3000 +; GFX940-NEXT: v_mov_b32_e32 v1, 15 +; GFX940-NEXT: scratch_store_dword v0, v1, s32 offset:3716 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_dword v0, off, s0 offset:3712 sc0 sc1 +; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:3716 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -3141,16 +3099,13 @@ define void @store_load_large_imm_offset_foo() { ; GFX11-PAL-LABEL: store_load_large_imm_offset_foo: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 -; GFX11-PAL-NEXT: s_movk_i32 s0, 0x3000 -; GFX11-PAL-NEXT: s_add_i32 s1, s32, 4 -; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-PAL-NEXT: s_add_i32 s0, s0, s1 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 +; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: scratch_store_b32 off, v1, s0 offset:3712 dlc +; GFX11-PAL-NEXT: scratch_store_b32 v1, v2, s32 offset:3716 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s0 offset:3712 glc dlc +; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:3716 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] bb: @@ -3168,17 +3123,16 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX9-LABEL: store_load_vidx_sidx_offset: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 4 ; GFX9-NEXT: 
s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, 4 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX9-NEXT: v_add_u32_e32 v0, 0x400, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 15 -; GFX9-NEXT: scratch_store_dword v0, v1, off +; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: scratch_load_dword v0, v0, off glc +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; @@ -3189,29 +3143,26 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 4 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_lshl_u32 v0, s0, v0, 2 -; GFX10-NEXT: v_add3_u32 v0, v1, v0, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-NEXT: scratch_store_dword v0, v1, off +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, 4 +; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: scratch_load_dword v0, v0, off glc dlc +; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vidx_sidx_offset: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_lshl_u32 v0, s0, v0, 2 +; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v0, v1, v0, 0x400 -; GFX11-NEXT: v_mov_b32_e32 v1, 15 -; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc +; GFX11-NEXT: v_lshl_add_u32 v0, 
v0, 2, 4 +; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:1024 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v0, off glc dlc +; GFX11-NEXT: scratch_load_b32 v0, v0, off offset:1024 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; @@ -3226,13 +3177,12 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 ; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; GFX9-PAL-NEXT: v_add_u32_e32 v0, 0x400, v0 +; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 -; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off +; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc +; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; @@ -3243,11 +3193,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX940-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX940-NEXT: v_add_u32_e32 v0, 0x400, v0 ; GFX940-NEXT: v_mov_b32_e32 v1, 15 -; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1 +; GFX940-NEXT: scratch_store_dword v0, v1, off offset:1024 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1 +; GFX940-NEXT: scratch_load_dword v0, v0, off offset:1024 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm ; @@ -3263,29 +3212,26 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 ; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 
4 -; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PAL-NEXT: v_add_lshl_u32 v0, s0, v0, 2 -; GFX10-PAL-NEXT: v_add3_u32 v0, v1, v0, 0x400 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off +; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 4 +; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off glc dlc +; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_vidx_sidx_offset: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 4 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PAL-NEXT: v_add_lshl_u32 v0, s0, v0, 2 +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0 ; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-PAL-NEXT: v_add3_u32 v0, v1, v0, 0x400 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15 -; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off dlc +; GFX11-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 4 +; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1024 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: scratch_load_b32 v0, v0, off glc dlc +; GFX11-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1024 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_endpgm bb: @@ -3657,21 +3603,20 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg) ; GFX10-LABEL: store_load_i32_negative_unaligned: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v0, -1, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 -; GFX10-NEXT: scratch_store_byte v0, v1, off +; GFX10-NEXT: scratch_store_byte v0, v1, off offset:-1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: 
scratch_load_ubyte v0, v0, off glc dlc +; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:-1 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: store_load_i32_negative_unaligned: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, -1, v0 -; GFX11-NEXT: scratch_store_b8 v0, v1, off dlc +; GFX11-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-NEXT: scratch_store_b8 v0, v1, off offset:-1 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_u8 v0, v0, off glc dlc +; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:-1 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3697,24 +3642,34 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg) ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-PAL-LABEL: store_load_i32_negative_unaligned: -; GFX10-PAL: ; %bb.0: ; %bb -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, -1, v0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 1 -; GFX10-PAL-NEXT: scratch_store_byte v0, v1, off -; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-PAL-NEXT: scratch_load_ubyte v0, v0, off glc dlc -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] +; GFX1010-PAL-LABEL: store_load_i32_negative_unaligned: +; GFX1010-PAL: ; %bb.0: ; %bb +; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v0, -1, v0 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 1 +; GFX1010-PAL-NEXT: scratch_store_byte v0, v1, off +; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1010-PAL-NEXT: scratch_load_ubyte v0, v0, off glc dlc +; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1030-PAL-LABEL: store_load_i32_negative_unaligned: +; GFX1030-PAL: ; %bb.0: ; %bb +; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 1 +; GFX1030-PAL-NEXT: scratch_store_byte v0, v1, off offset:-1 +; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1030-PAL-NEXT: scratch_load_ubyte v0, v0, off offset:-1 glc dlc +; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-PAL-LABEL: store_load_i32_negative_unaligned: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, -1, v0 -; GFX11-PAL-NEXT: scratch_store_b8 v0, v1, off dlc +; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-1 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: scratch_load_u8 v0, v0, off glc dlc +; GFX11-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] bb: @@ -3739,21 +3694,21 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture ; GFX10-LABEL: store_load_i32_large_negative_unaligned: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v0, 0xffffef7f, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 -; GFX10-NEXT: scratch_store_byte v0, v1, off +; GFX10-NEXT: scratch_store_byte v0, v1, off offset:-129 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: scratch_load_ubyte v0, v0, off glc dlc +; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:-129 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: store_load_i32_large_negative_unaligned: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xffffef7f, v0 -; GFX11-NEXT: scratch_store_b8 v0, v1, off dlc +; GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xfffff000, v0 +; GFX11-NEXT: scratch_store_b8 v0, v1, off 
offset:-129 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_u8 v0, v0, off glc dlc +; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:-129 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3779,24 +3734,35 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-PAL-LABEL: store_load_i32_large_negative_unaligned: -; GFX10-PAL: ; %bb.0: ; %bb -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, 0xffffef7f, v0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 1 -; GFX10-PAL-NEXT: scratch_store_byte v0, v1, off -; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-PAL-NEXT: scratch_load_ubyte v0, v0, off glc dlc -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] +; GFX1010-PAL-LABEL: store_load_i32_large_negative_unaligned: +; GFX1010-PAL: ; %bb.0: ; %bb +; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v0, 0xffffefff, v0 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 1 +; GFX1010-PAL-NEXT: scratch_store_byte v0, v1, off offset:-128 +; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1010-PAL-NEXT: scratch_load_ubyte v0, v0, off offset:-128 glc dlc +; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1030-PAL-LABEL: store_load_i32_large_negative_unaligned: +; GFX1030-PAL: ; %bb.0: ; %bb +; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 1 +; GFX1030-PAL-NEXT: scratch_store_byte v0, v1, off offset:-129 +; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1030-PAL-NEXT: scratch_load_ubyte v0, v0, off offset:-129 glc dlc +; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-PAL-LABEL: store_load_i32_large_negative_unaligned: ; 
GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xffffef7f, v0 -; GFX11-PAL-NEXT: scratch_store_b8 v0, v1, off dlc +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xfffff000, v0 +; GFX11-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-129 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: scratch_load_u8 v0, v0, off glc dlc +; GFX11-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-129 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll index 9df4b903de4bda..e7d86c0c178e97 100644 --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -1561,20 +1561,16 @@ define <33 x i32> @v33i32_func_void() #0 { ; GFX11-NEXT: buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80 ; GFX11-NEXT: buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64 ; GFX11-NEXT: buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48 -; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32 -; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16 -; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:32 ; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0 offset:128 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: s_add_i32 s1, s0, 0x70 ; GFX11-NEXT: s_add_i32 s2, s0, 0x60 ; GFX11-NEXT: s_add_i32 s3, s0, 0x50 -; GFX11-NEXT: s_add_i32 s4, s0, 64 -; GFX11-NEXT: s_add_i32 s5, s0, 48 -; GFX11-NEXT: s_add_i32 s6, s0, 32 -; GFX11-NEXT: s_add_i32 s7, s0, 16 -; GFX11-NEXT: s_add_i32 s8, s0, 0x80 +; GFX11-NEXT: s_add_i32 s4, s0, 48 ; GFX11-NEXT: 
s_waitcnt vmcnt(8) ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: s_waitcnt vmcnt(7) @@ -1582,17 +1578,17 @@ define <33 x i32> @v33i32_func_void() #0 { ; GFX11-NEXT: s_waitcnt vmcnt(6) ; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3 ; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: scratch_store_b128 off, v[13:16], s4 +; GFX11-NEXT: scratch_store_b128 off, v[13:16], s0 offset:64 ; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: scratch_store_b128 off, v[17:20], s5 +; GFX11-NEXT: scratch_store_b128 off, v[17:20], s4 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: scratch_store_b128 off, v[21:24], s6 +; GFX11-NEXT: scratch_store_b128 off, v[21:24], s0 offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: scratch_store_b128 off, v[25:28], s7 +; GFX11-NEXT: scratch_store_b128 off, v[25:28], s0 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 +; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:32 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b32 off, v33, s8 +; GFX11-NEXT: scratch_store_b32 off, v33, s0 offset:128 ; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <33 x i32>, ptr addrspace(1) %ptr @@ -1854,20 +1850,16 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { ; GFX11-NEXT: buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80 ; GFX11-NEXT: buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64 ; GFX11-NEXT: buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48 -; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32 -; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16 -; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:32 ; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0 offset:128 ; GFX11-NEXT: 
v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: s_add_i32 s1, s0, 0x70 ; GFX11-NEXT: s_add_i32 s2, s0, 0x60 ; GFX11-NEXT: s_add_i32 s3, s0, 0x50 -; GFX11-NEXT: s_add_i32 s4, s0, 64 -; GFX11-NEXT: s_add_i32 s5, s0, 48 -; GFX11-NEXT: s_add_i32 s6, s0, 32 -; GFX11-NEXT: s_add_i32 s7, s0, 16 -; GFX11-NEXT: s_add_i32 s8, s0, 0x80 +; GFX11-NEXT: s_add_i32 s4, s0, 48 ; GFX11-NEXT: s_waitcnt vmcnt(8) ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: s_waitcnt vmcnt(7) @@ -1875,17 +1867,17 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { ; GFX11-NEXT: s_waitcnt vmcnt(6) ; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3 ; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: scratch_store_b128 off, v[13:16], s4 +; GFX11-NEXT: scratch_store_b128 off, v[13:16], s0 offset:64 ; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: scratch_store_b128 off, v[17:20], s5 +; GFX11-NEXT: scratch_store_b128 off, v[17:20], s4 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: scratch_store_b128 off, v[21:24], s6 +; GFX11-NEXT: scratch_store_b128 off, v[21:24], s0 offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: scratch_store_b128 off, v[25:28], s7 +; GFX11-NEXT: scratch_store_b128 off, v[25:28], s0 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 +; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:32 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b32 off, v33, s8 +; GFX11-NEXT: scratch_store_b32 off, v33, s0 offset:128 ; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load { <32 x i32>, i32 }, ptr addrspace(1) %ptr @@ -2160,7 +2152,6 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { ; GFX11-NEXT: s_add_i32 s5, s0, 0xb0 ; GFX11-NEXT: s_add_i32 s6, s0, 0xa0 ; GFX11-NEXT: s_add_i32 s7, s0, 0x90 -; GFX11-NEXT: s_add_i32 s8, s0, 0x80 ; GFX11-NEXT: s_waitcnt vmcnt(8) ; GFX11-NEXT: scratch_store_b128 off, 
v[1:4], s1 ; GFX11-NEXT: s_waitcnt vmcnt(7) @@ -2176,7 +2167,7 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: scratch_store_b128 off, v[25:28], s7 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: scratch_store_b128 off, v[29:32], s8 +; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:128 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: scratch_store_b32 off, v33, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index 0d54da3128a617..23502d1b36d182 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -1990,135 +1990,140 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 { ; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_clause 0x7 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:1024 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:512 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:256 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:128 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:64 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:32 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:16 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 ; GFX11-NEXT: s_add_i32 s1, s0, 0x7f0 ; GFX11-NEXT: s_add_i32 s2, s0, 0x7e0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 -; GFX11-NEXT: s_add_i32 s3, s0, 0x7d0 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s3 -; GFX11-NEXT: s_add_i32 s1, s0, 0x7c0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x7b0 +; GFX11-NEXT: s_add_i32 s1, s0, 0x7d0 +; GFX11-NEXT: 
s_add_i32 s2, s0, 0x7c0 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x7a0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x790 +; GFX11-NEXT: s_add_i32 s1, s0, 0x7b0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x7a0 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x780 -; GFX11-NEXT: s_add_i32 s2, s0, 0x770 +; GFX11-NEXT: s_add_i32 s1, s0, 0x790 +; GFX11-NEXT: s_add_i32 s2, s0, 0x780 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x760 -; GFX11-NEXT: s_add_i32 s2, s0, 0x750 +; GFX11-NEXT: s_add_i32 s1, s0, 0x770 +; GFX11-NEXT: s_add_i32 s2, s0, 0x760 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x740 -; GFX11-NEXT: s_add_i32 s2, s0, 0x730 +; GFX11-NEXT: s_add_i32 s1, s0, 0x750 +; GFX11-NEXT: s_add_i32 s2, s0, 0x740 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x720 -; GFX11-NEXT: s_add_i32 s2, s0, 0x710 +; GFX11-NEXT: s_add_i32 s1, s0, 0x730 +; GFX11-NEXT: s_add_i32 s2, s0, 0x720 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x700 -; GFX11-NEXT: s_add_i32 s2, s0, 0x6f0 +; GFX11-NEXT: s_add_i32 s1, s0, 0x710 +; GFX11-NEXT: s_add_i32 s2, s0, 0x700 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x6e0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x6d0 +; GFX11-NEXT: s_add_i32 s1, s0, 0x6f0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x6e0 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x6c0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x6b0 +; GFX11-NEXT: 
s_add_i32 s1, s0, 0x6d0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x6c0 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x6a0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x690 +; GFX11-NEXT: s_add_i32 s1, s0, 0x6b0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x6a0 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x680 -; GFX11-NEXT: s_add_i32 s2, s0, 0x670 +; GFX11-NEXT: s_add_i32 s1, s0, 0x690 +; GFX11-NEXT: s_add_i32 s2, s0, 0x680 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x660 -; GFX11-NEXT: s_add_i32 s2, s0, 0x650 +; GFX11-NEXT: s_add_i32 s1, s0, 0x670 +; GFX11-NEXT: s_add_i32 s2, s0, 0x660 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x640 -; GFX11-NEXT: s_add_i32 s2, s0, 0x630 +; GFX11-NEXT: s_add_i32 s1, s0, 0x650 +; GFX11-NEXT: s_add_i32 s2, s0, 0x640 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x620 -; GFX11-NEXT: s_add_i32 s2, s0, 0x610 +; GFX11-NEXT: s_add_i32 s1, s0, 0x630 +; GFX11-NEXT: s_add_i32 s2, s0, 0x620 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x600 -; GFX11-NEXT: s_add_i32 s2, s0, 0x5f0 +; GFX11-NEXT: s_add_i32 s1, s0, 0x610 +; GFX11-NEXT: s_add_i32 s2, s0, 0x600 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x5e0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x5d0 +; GFX11-NEXT: s_add_i32 s1, s0, 0x5f0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x5e0 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x5c0 -; GFX11-NEXT: 
s_add_i32 s2, s0, 0x5b0 +; GFX11-NEXT: s_add_i32 s1, s0, 0x5d0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x5c0 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x5a0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x590 +; GFX11-NEXT: s_add_i32 s1, s0, 0x5b0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x5a0 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x580 -; GFX11-NEXT: s_add_i32 s2, s0, 0x570 +; GFX11-NEXT: s_add_i32 s1, s0, 0x590 +; GFX11-NEXT: s_add_i32 s2, s0, 0x580 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x560 -; GFX11-NEXT: s_add_i32 s2, s0, 0x550 +; GFX11-NEXT: s_add_i32 s1, s0, 0x570 +; GFX11-NEXT: s_add_i32 s2, s0, 0x560 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x540 -; GFX11-NEXT: s_add_i32 s2, s0, 0x530 +; GFX11-NEXT: s_add_i32 s1, s0, 0x550 +; GFX11-NEXT: s_add_i32 s2, s0, 0x540 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x520 -; GFX11-NEXT: s_add_i32 s2, s0, 0x510 +; GFX11-NEXT: s_add_i32 s1, s0, 0x530 +; GFX11-NEXT: s_add_i32 s2, s0, 0x520 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x500 -; GFX11-NEXT: s_add_i32 s2, s0, 0x4f0 +; GFX11-NEXT: s_add_i32 s1, s0, 0x510 +; GFX11-NEXT: s_add_i32 s2, s0, 0x500 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x4e0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x4d0 +; GFX11-NEXT: s_add_i32 s1, s0, 0x4f0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x4e0 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: 
s_add_i32 s1, s0, 0x4c0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x4b0 +; GFX11-NEXT: s_add_i32 s1, s0, 0x4d0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x4c0 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x4a0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x490 +; GFX11-NEXT: s_add_i32 s1, s0, 0x4b0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x4a0 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x480 -; GFX11-NEXT: s_add_i32 s2, s0, 0x470 +; GFX11-NEXT: s_add_i32 s1, s0, 0x490 +; GFX11-NEXT: s_add_i32 s2, s0, 0x480 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x460 -; GFX11-NEXT: s_add_i32 s2, s0, 0x450 +; GFX11-NEXT: s_add_i32 s1, s0, 0x470 +; GFX11-NEXT: s_add_i32 s2, s0, 0x460 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x440 -; GFX11-NEXT: s_add_i32 s2, s0, 0x430 +; GFX11-NEXT: s_add_i32 s1, s0, 0x450 +; GFX11-NEXT: s_add_i32 s2, s0, 0x440 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x420 -; GFX11-NEXT: s_add_i32 s2, s0, 0x410 +; GFX11-NEXT: s_add_i32 s1, s0, 0x430 +; GFX11-NEXT: s_add_i32 s2, s0, 0x420 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x400 +; GFX11-NEXT: s_add_i32 s1, s0, 0x410 ; GFX11-NEXT: s_add_i32 s2, s0, 0x3f0 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 @@ -2182,39 +2187,35 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 { ; GFX11-NEXT: s_add_i32 s2, s0, 0x210 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x200 -; GFX11-NEXT: s_add_i32 
s2, s0, 0x1f0 +; GFX11-NEXT: s_add_i32 s1, s0, 0x1f0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x1e0 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x1e0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x1d0 +; GFX11-NEXT: s_add_i32 s1, s0, 0x1d0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x1c0 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x1c0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x1b0 +; GFX11-NEXT: s_add_i32 s1, s0, 0x1b0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x1a0 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x1a0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x190 +; GFX11-NEXT: s_add_i32 s1, s0, 0x190 +; GFX11-NEXT: s_add_i32 s2, s0, 0x180 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x180 -; GFX11-NEXT: s_add_i32 s2, s0, 0x170 +; GFX11-NEXT: s_add_i32 s1, s0, 0x170 +; GFX11-NEXT: s_add_i32 s2, s0, 0x160 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x160 -; GFX11-NEXT: s_add_i32 s2, s0, 0x150 +; GFX11-NEXT: s_add_i32 s1, s0, 0x150 +; GFX11-NEXT: s_add_i32 s2, s0, 0x140 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x140 -; GFX11-NEXT: s_add_i32 s2, s0, 0x130 +; GFX11-NEXT: s_add_i32 s1, s0, 0x130 +; GFX11-NEXT: s_add_i32 s2, s0, 0x120 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x120 -; GFX11-NEXT: s_add_i32 s2, s0, 0x110 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x100 +; GFX11-NEXT: s_add_i32 s1, s0, 0x110 ; GFX11-NEXT: s_add_i32 s2, 
s0, 0xf0 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 @@ -2230,20 +2231,12 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 { ; GFX11-NEXT: s_add_i32 s2, s0, 0x90 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x80 -; GFX11-NEXT: s_add_i32 s2, s0, 0x70 +; GFX11-NEXT: s_add_i32 s1, s0, 0x70 +; GFX11-NEXT: s_add_i32 s2, s0, 0x60 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x60 -; GFX11-NEXT: s_add_i32 s2, s0, 0x50 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 64 -; GFX11-NEXT: s_add_i32 s2, s0, 48 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 32 -; GFX11-NEXT: s_add_i32 s0, s0, 16 +; GFX11-NEXT: s_add_i32 s1, s0, 0x50 +; GFX11-NEXT: s_add_i32 s0, s0, 48 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2643,141 +2636,122 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-LABEL: return_72xi32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x10 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:220 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:216 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:212 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:208 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:204 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:200 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:196 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:192 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:188 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:184 
-; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:180 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:176 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:172 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:168 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:164 -; GFX11-NEXT: scratch_store_b128 off, v[29:32], s32 offset:224 -; GFX11-NEXT: scratch_store_b128 off, v[25:28], s32 offset:240 -; GFX11-NEXT: s_clause 0x12 -; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:60 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:72 -; GFX11-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v26, v23 -; GFX11-NEXT: v_dual_mov_b32 v25, v22 :: v_dual_mov_b32 v24, v21 +; GFX11-NEXT: s_clause 0xc +; GFX11-NEXT: 
scratch_store_b32 off, v40, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:164 +; GFX11-NEXT: s_clause 0x14 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:120 +; GFX11-NEXT: scratch_store_b128 off, v[17:20], 
s0 offset:64 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_load_b32 v20, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:136 +; GFX11-NEXT: scratch_store_b128 off, v[9:12], s0 offset:32 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_load_b32 v12, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v11, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v10, off, s32 offset:152 +; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 offset:16 +; GFX11-NEXT: s_clause 0xd +; GFX11-NEXT: scratch_load_b32 v8, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v7, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v6, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v5, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v9, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 ; GFX11-NEXT: s_add_i32 s1, s0, 0x110 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 -; GFX11-NEXT: v_dual_mov_b32 v23, v20 :: v_dual_mov_b32 v22, v19 -; GFX11-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v20, v17 -; GFX11-NEXT: v_dual_mov_b32 v19, v16 :: v_dual_mov_b32 v18, v15 -; GFX11-NEXT: v_dual_mov_b32 v17, v14 :: v_dual_mov_b32 v16, v13 -; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v14, v11 -; GFX11-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v12, v9 -; GFX11-NEXT: v_dual_mov_b32 v11, v8 :: v_dual_mov_b32 v10, v7 -; GFX11-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v8, v5 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b32 
v7, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v6, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v5, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:104 -; GFX11-NEXT: s_add_i32 s2, s0, 0xe0 -; GFX11-NEXT: s_add_i32 s3, s0, 0xd0 -; GFX11-NEXT: s_add_i32 s34, s0, 0xc0 -; GFX11-NEXT: s_add_i32 s35, s0, 0xb0 -; GFX11-NEXT: s_add_i32 s36, s0, 0xa0 -; GFX11-NEXT: s_add_i32 s37, s0, 0x90 -; GFX11-NEXT: s_add_i32 s38, s0, 0x80 -; GFX11-NEXT: s_add_i32 s39, s0, 0x70 -; GFX11-NEXT: s_add_i32 s40, s0, 0x60 -; GFX11-NEXT: s_add_i32 s41, s0, 0x50 -; GFX11-NEXT: s_add_i32 s42, s0, 64 +; GFX11-NEXT: s_add_i32 s2, s0, 0x100 +; GFX11-NEXT: s_add_i32 s3, s0, 0xf0 +; GFX11-NEXT: s_add_i32 s34, s0, 0xe0 +; GFX11-NEXT: s_add_i32 s35, s0, 0xd0 +; GFX11-NEXT: s_add_i32 s36, s0, 0xc0 +; GFX11-NEXT: s_add_i32 s37, s0, 0xb0 +; GFX11-NEXT: s_add_i32 s38, s0, 0xa0 +; GFX11-NEXT: s_add_i32 s39, s0, 0x90 +; GFX11-NEXT: s_add_i32 s40, s0, 0x70 +; GFX11-NEXT: s_add_i32 s41, s0, 0x60 +; GFX11-NEXT: s_add_i32 s42, s0, 0x50 ; GFX11-NEXT: s_add_i32 s43, s0, 48 -; GFX11-NEXT: s_add_i32 s44, s0, 32 -; GFX11-NEXT: s_waitcnt vmcnt(23) -; GFX11-NEXT: scratch_store_b128 off, v[45:48], s1 -; GFX11-NEXT: s_add_i32 s1, s0, 0x100 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:108 -; GFX11-NEXT: s_waitcnt vmcnt(21) -; GFX11-NEXT: scratch_store_b128 off, v[56:59], s1 -; GFX11-NEXT: s_clause 0xc -; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v2, off, s32 offset:124 -; GFX11-NEXT: scratch_load_b32 v1, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v4, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v52, off, s32 
offset:36 -; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b128 v[28:31], off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: s_add_i32 s1, s0, 0xf0 -; GFX11-NEXT: s_add_i32 s0, s0, 16 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s32 offset:224 ; 16-byte Folded Spill -; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1 -; GFX11-NEXT: scratch_store_b128 off, v[59:62], s2 -; GFX11-NEXT: scratch_store_b128 off, v[4:7], s3 -; GFX11-NEXT: scratch_store_b128 off, v[41:44], s34 -; GFX11-NEXT: scratch_store_b128 off, v[37:40], s35 -; GFX11-NEXT: scratch_store_b128 off, v[52:55], s36 -; GFX11-NEXT: scratch_store_b128 off, v[48:51], s37 -; GFX11-NEXT: scratch_store_b128 off, v[33:36], s38 -; GFX11-NEXT: scratch_load_b128 v[0:3], off, s32 offset:224 ; 16-byte Folded Reload -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[0:3], s39 -; GFX11-NEXT: scratch_load_b128 v[0:3], off, s32 offset:240 ; 16-byte Folded Reload +; GFX11-NEXT: s_waitcnt vmcnt(10) +; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 offset:128 +; GFX11-NEXT: s_waitcnt vmcnt(9) +; GFX11-NEXT: scratch_store_b128 off, v[9:12], s1 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: scratch_store_b128 off, v[17:20], s2 +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: scratch_store_b128 off, v[60:63], s3 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: scratch_store_b128 off, v[56:59], s34 +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: scratch_store_b128 off, v[41:44], s35 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: scratch_store_b128 off, v[37:40], s36 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: scratch_store_b128 off, v[52:55], s37 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: scratch_store_b128 off, v[48:51], s38 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: scratch_store_b128 off, v[33:36], s39 ; GFX11-NEXT: s_waitcnt 
vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[0:3], s40 -; GFX11-NEXT: scratch_store_b128 off, v[24:27], s41 -; GFX11-NEXT: scratch_store_b128 off, v[20:23], s42 -; GFX11-NEXT: scratch_store_b128 off, v[16:19], s43 -; GFX11-NEXT: scratch_store_b128 off, v[12:15], s44 -; GFX11-NEXT: scratch_store_b128 off, v[8:11], s0 -; GFX11-NEXT: s_clause 0xe -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:220 +; GFX11-NEXT: scratch_store_b128 off, v[29:32], s40 +; GFX11-NEXT: scratch_store_b128 off, v[25:28], s41 +; GFX11-NEXT: scratch_store_b128 off, v[21:24], s42 +; GFX11-NEXT: scratch_store_b128 off, v[13:16], s43 +; GFX11-NEXT: s_clause 0xc +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v43, 
off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:212 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ret <72 x i32> %val @@ -3334,12 +3308,12 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-LABEL: call_72xi32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s45, s33 +; GFX11-NEXT: s_mov_b32 s46, s33 ; GFX11-NEXT: s_add_i32 s33, s32, 0x1ff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s33, s33, 0xfffffe00 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v32, s33 offset:1536 ; 4-byte Folded Spill +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:1536 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 @@ -3349,8 +3323,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_addk_i32 s32, 0xa00 -; GFX11-NEXT: s_clause 0xf -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:60 +; GFX11-NEXT: s_clause 0xe ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:56 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:52 ; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:48 @@ -3388,7 +3361,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1 ; GFX11-NEXT: s_add_i32 s0, s33, 0x200 -; GFX11-NEXT: v_writelane_b32 v32, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v4, 0 @@ -3405,110 +3378,114 @@ 
define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: v_dual_mov_b32 v27, 0 :: v_dual_mov_b32 v26, 0 ; GFX11-NEXT: v_dual_mov_b32 v29, 0 :: v_dual_mov_b32 v28, 0 ; GFX11-NEXT: v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v30, 0 -; GFX11-NEXT: s_mov_b32 s47, return_72xi32@abs32@hi -; GFX11-NEXT: s_mov_b32 s46, return_72xi32@abs32@lo -; GFX11-NEXT: v_writelane_b32 v32, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[46:47] -; GFX11-NEXT: s_clause 0xb -; GFX11-NEXT: scratch_load_b128 v[33:36], off, s33 offset:624 -; GFX11-NEXT: scratch_load_b128 v[26:29], off, s33 offset:640 +; GFX11-NEXT: s_mov_b32 s45, return_72xi32@abs32@hi +; GFX11-NEXT: s_mov_b32 s44, return_72xi32@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[44:45] +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b128 v[45:48], off, s33 offset:624 +; GFX11-NEXT: scratch_load_b128 v[33:36], off, s33 offset:640 +; GFX11-NEXT: s_add_i32 s0, s32, 0xa0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mov_b32_e32 v32, v48 +; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_load_b128 v[48:51], off, s33 offset:656 ; GFX11-NEXT: scratch_load_b128 v[52:55], off, s33 offset:672 -; GFX11-NEXT: scratch_load_b128 v[40:43], off, s33 offset:688 -; GFX11-NEXT: scratch_load_b128 v[44:47], off, s33 offset:704 -; GFX11-NEXT: scratch_load_b128 v[56:59], off, s33 offset:720 -; GFX11-NEXT: scratch_load_b128 v[60:63], off, s33 offset:736 +; GFX11-NEXT: scratch_load_b128 v[41:44], off, s33 offset:688 +; GFX11-NEXT: scratch_load_b128 v[56:59], off, s33 offset:704 +; GFX11-NEXT: scratch_load_b128 v[60:63], off, s33 offset:720 +; GFX11-NEXT: scratch_load_b128 v[12:15], off, s33 offset:736 ; GFX11-NEXT: scratch_load_b128 v[0:3], off, s33 offset:752 ; GFX11-NEXT: scratch_load_b128 v[4:7], off, s33 offset:768 ; GFX11-NEXT: scratch_load_b128 v[8:11], off, s33 offset:784 -; GFX11-NEXT: scratch_load_b128 v[12:15], off, s33 offset:512 -; GFX11-NEXT: s_add_i32 s0, s32, 0xa0 -; GFX11-NEXT: 
s_waitcnt vmcnt(9) -; GFX11-NEXT: v_dual_mov_b32 v31, v50 :: v_dual_mov_b32 v30, v49 +; GFX11-NEXT: scratch_load_b128 v[16:19], off, s33 offset:512 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: v_dual_mov_b32 v38, v53 :: v_dual_mov_b32 v37, v52 ; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: v_dual_mov_b32 v49, v40 :: v_dual_mov_b32 v50, v41 -; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: v_dual_mov_b32 v41, v56 :: v_dual_mov_b32 v40, v47 -; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_dual_mov_b32 v47, v2 :: v_dual_mov_b32 v2, v5 -; GFX11-NEXT: v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v27 +; GFX11-NEXT: v_dual_mov_b32 v39, v54 :: v_dual_mov_b32 v52, v44 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: v_dual_mov_b32 v53, v56 :: v_dual_mov_b32 v54, v57 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: v_dual_mov_b32 v44, v62 :: v_dual_mov_b32 v57, v12 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[12:15], s33 offset:1588 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_store_b128 off, v[16:19], s33 offset:1588 ; 16-byte Folded Spill ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b128 v[12:15], off, s33 offset:528 -; GFX11-NEXT: scratch_load_b128 v[16:19], off, s33 offset:544 -; GFX11-NEXT: scratch_load_b128 v[20:23], off, s33 offset:560 -; GFX11-NEXT: scratch_load_b128 v[24:27], off, s33 offset:576 -; GFX11-NEXT: v_dual_mov_b32 v39, v28 :: v_dual_mov_b32 v28, v29 -; GFX11-NEXT: v_dual_mov_b32 v29, v48 :: v_dual_mov_b32 v48, v55 -; GFX11-NEXT: v_dual_mov_b32 v55, v46 :: v_dual_mov_b32 v46, v1 -; GFX11-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, v7 -; GFX11-NEXT: v_mov_b32_e32 v5, v8 -; GFX11-NEXT: v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v56, v59 +; GFX11-NEXT: scratch_load_b128 v[16:19], off, s33 offset:528 +; GFX11-NEXT: scratch_load_b128 v[20:23], off, s33 offset:544 +; GFX11-NEXT: scratch_load_b128 v[24:27], off, s33 offset:560 +; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:576 +; GFX11-NEXT: 
v_mov_b32_e32 v56, v63 +; GFX11-NEXT: v_mov_b32_e32 v12, v15 +; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v15, v2 +; GFX11-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_mov_b32_e32 v8, v15 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_dual_mov_b32 v10, v17 :: v_dual_mov_b32 v15, v22 +; GFX11-NEXT: v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v8, v19 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_mov_b32_e32 v10, v21 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[24:27], s33 offset:1572 ; 16-byte Folded Spill -; GFX11-NEXT: scratch_load_b128 v[24:27], off, s33 offset:592 +; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1572 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:592 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[24:27], s33 offset:1556 ; 16-byte Folded Spill -; GFX11-NEXT: scratch_load_b128 v[24:27], off, s33 offset:608 +; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1556 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:608 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[24:27], s33 offset:1540 ; 16-byte Folded Spill -; GFX11-NEXT: scratch_store_b128 off, v[36:39], s32 -; GFX11-NEXT: v_dual_mov_b32 v37, v52 :: v_dual_mov_b32 v38, v53 -; GFX11-NEXT: v_mov_b32_e32 v39, v54 -; GFX11-NEXT: v_dual_mov_b32 v53, v44 :: v_dual_mov_b32 v54, v45 -; GFX11-NEXT: v_dual_mov_b32 v44, v63 :: v_dual_mov_b32 v45, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v3, v6 -; GFX11-NEXT: v_mov_b32_e32 v6, v9 +; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1540 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_store_b128 off, v[32:35], s32 +; GFX11-NEXT: v_mov_b32_e32 v32, v36 +; GFX11-NEXT: v_dual_mov_b32 v33, v48 :: v_dual_mov_b32 v34, v49 +; GFX11-NEXT: 
v_dual_mov_b32 v35, v50 :: v_dual_mov_b32 v36, v51 +; GFX11-NEXT: v_dual_mov_b32 v48, v55 :: v_dual_mov_b32 v49, v41 +; GFX11-NEXT: v_mov_b32_e32 v50, v42 +; GFX11-NEXT: v_dual_mov_b32 v55, v58 :: v_dual_mov_b32 v58, v13 +; GFX11-NEXT: v_dual_mov_b32 v13, v0 :: v_dual_mov_b32 v0, v3 +; GFX11-NEXT: v_dual_mov_b32 v3, v6 :: v_dual_mov_b32 v6, v9 ; GFX11-NEXT: scratch_store_b32 off, v11, s0 ; GFX11-NEXT: s_add_i32 s0, s32, 0x90 -; GFX11-NEXT: v_dual_mov_b32 v36, v51 :: v_dual_mov_b32 v51, v42 -; GFX11-NEXT: v_mov_b32_e32 v52, v43 +; GFX11-NEXT: v_mov_b32_e32 v51, v43 +; GFX11-NEXT: v_mov_b32_e32 v41, v59 ; GFX11-NEXT: scratch_store_b128 off, v[4:7], s0 +; GFX11-NEXT: v_mov_b32_e32 v7, v18 ; GFX11-NEXT: s_add_i32 s0, s32, 0x80 -; GFX11-NEXT: v_mov_b32_e32 v42, v57 +; GFX11-NEXT: v_dual_mov_b32 v42, v60 :: v_dual_mov_b32 v43, v61 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 -; GFX11-NEXT: v_dual_mov_b32 v0, 24 :: v_dual_mov_b32 v5, v12 +; GFX11-NEXT: v_dual_mov_b32 v0, 24 :: v_dual_mov_b32 v9, v20 ; GFX11-NEXT: s_add_i32 s0, s32, 0x70 -; GFX11-NEXT: v_mov_b32_e32 v43, v58 -; GFX11-NEXT: v_dual_mov_b32 v57, v60 :: v_dual_mov_b32 v58, v61 -; GFX11-NEXT: scratch_store_b128 off, v[44:47], s0 +; GFX11-NEXT: v_mov_b32_e32 v5, v16 +; GFX11-NEXT: scratch_store_b128 off, v[12:15], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 0x6c -; GFX11-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v7, v14 +; GFX11-NEXT: v_dual_mov_b32 v6, v17 :: v_dual_mov_b32 v11, v22 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-NEXT: s_add_i32 s0, s32, 0x60 -; GFX11-NEXT: v_mov_b32_e32 v9, v16 +; GFX11-NEXT: v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v29, v45 ; GFX11-NEXT: scratch_store_b96 off, v[56:58], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 0x50 -; GFX11-NEXT: v_mov_b32_e32 v11, v18 -; GFX11-NEXT: scratch_store_b128 off, v[40:43], s0 +; GFX11-NEXT: v_mov_b32_e32 v13, v24 +; GFX11-NEXT: scratch_store_b128 off, v[41:44], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 64 -; GFX11-NEXT: v_dual_mov_b32 v12, v19 :: 
v_dual_mov_b32 v13, v20 +; GFX11-NEXT: v_dual_mov_b32 v14, v25 :: v_dual_mov_b32 v31, v47 ; GFX11-NEXT: scratch_store_b128 off, v[52:55], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 48 -; GFX11-NEXT: v_mov_b32_e32 v14, v21 +; GFX11-NEXT: v_mov_b32_e32 v15, v26 ; GFX11-NEXT: scratch_store_b128 off, v[48:51], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 32 -; GFX11-NEXT: v_mov_b32_e32 v16, v23 +; GFX11-NEXT: v_mov_b32_e32 v16, v27 ; GFX11-NEXT: scratch_store_b128 off, v[36:39], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 16 -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s0 -; GFX11-NEXT: v_mov_b32_e32 v29, v33 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1588 +; GFX11-NEXT: v_mov_b32_e32 v30, v46 +; GFX11-NEXT: scratch_store_b128 off, v[32:35], s0 +; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1588 ; 16-byte Folded Reload +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, 42 +; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1572 ; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1556 ; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1540 ; GFX11-NEXT: s_add_i32 s0, s33, 0x400 -; GFX11-NEXT: v_dual_mov_b32 v30, v34 :: v_dual_mov_b32 v31, v35 -; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 42 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[46:47] -; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[44:45] +; GFX11-NEXT: s_clause 0xe ; GFX11-NEXT: scratch_load_b32 v63, off, s33 ; GFX11-NEXT: scratch_load_b32 v62, off, s33 offset:4 ; GFX11-NEXT: scratch_load_b32 v61, off, s33 offset:8 @@ -3524,14 +3501,13 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:48 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:52 ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:56 -; GFX11-NEXT: scratch_load_b32 
v40, off, s33 offset:60 -; GFX11-NEXT: v_readlane_b32 s31, v32, 1 -; GFX11-NEXT: v_readlane_b32 s30, v32, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v32, off, s33 offset:1536 ; 4-byte Folded Reload +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:1536 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_addk_i32 s32, 0xf600 -; GFX11-NEXT: s_mov_b32 s33, s45 +; GFX11-NEXT: s_mov_b32 s33, s46 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll index 6f419ab2cc67eb..133b8ec6a34d07 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -208,26 +208,20 @@ define void @mubuf_clause(ptr addrspace(5) noalias nocapture readonly %arg, ptr ; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v2, 4, v31 ; GCN-SCRATCH-NEXT: v_and_b32_e32 v18, 0x3ff0, v2 ; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v0, v0, v18 -; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v6, 16, v0 -; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v10, 32, v0 -; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v14, 48, v0 ; GCN-SCRATCH-NEXT: s_clause 0x3 ; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[2:5], v0, off -; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[6:9], v6, off -; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[10:13], v10, off -; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[14:17], v14, off +; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[6:9], v0, off offset:16 +; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[10:13], v0, off offset:32 +; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[14:17], v0, off offset:48 ; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v0, v1, v18 -; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v1, 16, v0 -; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v18, 32, v0 -; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v19, 48, v0 ; GCN-SCRATCH-NEXT: s_waitcnt 
vmcnt(3) ; GCN-SCRATCH-NEXT: scratch_store_dwordx4 v0, v[2:5], off ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(2) -; GCN-SCRATCH-NEXT: scratch_store_dwordx4 v1, v[6:9], off +; GCN-SCRATCH-NEXT: scratch_store_dwordx4 v0, v[6:9], off offset:16 ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(1) -; GCN-SCRATCH-NEXT: scratch_store_dwordx4 v18, v[10:13], off +; GCN-SCRATCH-NEXT: scratch_store_dwordx4 v0, v[10:13], off offset:32 ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GCN-SCRATCH-NEXT: scratch_store_dwordx4 v19, v[14:17], off +; GCN-SCRATCH-NEXT: scratch_store_dwordx4 v0, v[14:17], off offset:48 ; GCN-SCRATCH-NEXT: s_setpc_b64 s[30:31] bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()