diff --git a/llvm/include/llvm/Transforms/Utils/Cloning.h b/llvm/include/llvm/Transforms/Utils/Cloning.h index 6b56230a6e1d4..05490e6c81bc8 100644 --- a/llvm/include/llvm/Transforms/Utils/Cloning.h +++ b/llvm/include/llvm/Transforms/Utils/Cloning.h @@ -363,6 +363,17 @@ LLVM_ABI void updateProfileCallee( Function *Callee, int64_t EntryDelta, const ValueMap *VMap = nullptr); +/// Adds `!noalias` and `!alias.scope` metadata for `CB`'s called function's +/// `noalias` argument based memory accesses. +void addAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap, + const DataLayout &DL, AAResults *CalleeAAR, + ClonedCodeInfo &InlinedFunctionInfo, + bool UseNoAliasIntrinsic); + +/// Adds `!noalias` and `!alias.scope` metadata for `F`'s `noalias` argument +/// based memory accesses. +void addAliasScopeMetadata(Function &F); + /// Find the 'llvm.experimental.noalias.scope.decl' intrinsics in the specified /// basic blocks and extract their scope. These are candidates for duplication /// when cloning. diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index dec781d71c54e..edd19e1ef1241 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -20,6 +20,7 @@ #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/MDBuilder.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/Cloning.h" #define DEBUG_TYPE "amdgpu-lower-kernel-arguments" @@ -86,6 +87,9 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize)); uint64_t ExplicitArgOffset = 0; + + addAliasScopeMetadata(F); + for (Argument &Arg : F.args()) { const bool IsByRef = Arg.hasByRefAttr(); Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType(); @@ -124,11 +128,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) && !ST.hasUsableDSOffset()) continue; - - // FIXME: We can replace this with equivalent alias.scope/noalias - // metadata, but this appears to be a lot of work. - if (Arg.hasNoAliasAttr()) - continue; } auto *VT = dyn_cast(ArgTy); @@ -215,8 +214,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { } } - // TODO: Convert noalias arg to !noalias - if (DoShiftOpt) { Value *ExtractBits = OffsetDiff == 0 ? Load : Builder.CreateLShr(Load, OffsetDiff * 8); diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 7df5e9958182c..377cdad452a2b 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -51,6 +51,7 @@ #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -1114,17 +1115,30 @@ void ScopedAliasMetadataDeepCloner::remap(Function::iterator FStart, /// then add new alias scopes for each noalias argument, tag the mapped noalias /// parameters with noalias metadata specifying the new scope, and tag all /// non-derived loads, stores and memory intrinsics with the new alias scopes. -static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap, - const DataLayout &DL, AAResults *CalleeAAR, - ClonedCodeInfo &InlinedFunctionInfo) { - if (!EnableNoAliasConversion) - return; - - const Function *CalledFunc = CB.getCalledFunction(); +static void addAliasScopeMetadataImpl(CallBase *CB, Function *F, + ValueToValueMapTy *VMap, + const DataLayout &DL, + AAResults *CalleeAAR, + ClonedCodeInfo *InlinedFunctionInfo, + bool UseNoAliasIntrinsic) { + assert(CB || F); + const Function *CalledFunc = CB ? CB->getCalledFunction() : F; SmallVector NoAliasArgs; + std::function paramHasAttr; + if (CB) { + paramHasAttr = [&](const Argument *Arg, Attribute::AttrKind Attr) -> bool { + return CB->paramHasAttr(Arg->getArgNo(), Attr); + }; + + } else { + paramHasAttr = [&](const Argument *Arg, Attribute::AttrKind Attr) -> bool { + return Arg->hasAttribute(Attr); + }; + } + for (const Argument &Arg : CalledFunc->args()) - if (CB.paramHasAttr(Arg.getArgNo(), Attribute::NoAlias) && !Arg.use_empty()) + if (paramHasAttr(&Arg, Attribute::NoAlias) && !Arg.use_empty()) NoAliasArgs.push_back(&Arg); if (NoAliasArgs.empty()) @@ -1166,29 +1180,20 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap, NewScopes.insert(std::make_pair(A, NewScope)); if (UseNoAliasIntrinsic) { + assert(CB); // Introduce a llvm.experimental.noalias.scope.decl for the noalias // argument. MDNode *AScopeList = MDNode::get(CalledFunc->getContext(), NewScope); auto *NoAliasDecl = - IRBuilder<>(&CB).CreateNoAliasScopeDeclaration(AScopeList); + IRBuilder<>(CB).CreateNoAliasScopeDeclaration(AScopeList); // Ignore the result for now. The result will be used when the // llvm.noalias intrinsic is introduced. (void)NoAliasDecl; } } - // Iterate over all new instructions in the map; for all memory-access - // instructions, add the alias scope metadata. - for (ValueToValueMapTy::iterator VMI = VMap.begin(), VMIE = VMap.end(); - VMI != VMIE; ++VMI) { - if (const Instruction *I = dyn_cast(VMI->first)) { - if (!VMI->second) - continue; - - Instruction *NI = dyn_cast(VMI->second); - if (!NI || InlinedFunctionInfo.isSimplified(I, NI)) - continue; - + { + auto addAliasMD = [&](const Instruction *I, Instruction *NI) -> void { bool IsArgMemOnlyCall = false, IsFuncCall = false; SmallVector PtrArgs; @@ -1207,7 +1212,7 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap, // know that about the inlined clone of this call site, and we don't // need to add metadata. if (Call->doesNotAccessMemory()) - continue; + return; IsFuncCall = true; if (CalleeAAR) { @@ -1215,7 +1220,7 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap, // We'll retain this knowledge without additional metadata. if (ME.onlyAccessesInaccessibleMem()) - continue; + return; if (ME.onlyAccessesArgPointees()) IsArgMemOnlyCall = true; @@ -1237,7 +1242,7 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap, // However, if this is a call, this we might just alias with none of the // noalias arguments. if (PtrArgs.empty() && !IsFuncCall) - continue; + return; // It is possible that there is only one underlying object, but you // need to go through several PHIs to see it, and thus could be @@ -1270,7 +1275,7 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap, // completely describe the aliasing properties using alias.scope // metadata (and, thus, won't add any). if (const Argument *A = dyn_cast(V)) { - if (!CB.paramHasAttr(A->getArgNo(), Attribute::NoAlias)) + if (!paramHasAttr(A, Attribute::NoAlias)) UsesAliasingPtr = true; } else { UsesAliasingPtr = true; @@ -1292,7 +1297,7 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap, // Nothing we can do if the used underlying object cannot be reliably // determined. if (UsesUnknownObject) - continue; + return; // A function call can always get captured noalias pointers (via other // parameters, globals, etc.). @@ -1353,10 +1358,49 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap, LLVMContext::MD_alias_scope, MDNode::concatenate(NI->getMetadata(LLVMContext::MD_alias_scope), MDNode::get(CalledFunc->getContext(), Scopes))); + }; + + if (VMap) { + assert(InlinedFunctionInfo); + + for (ValueToValueMapTy::iterator VMI = VMap->begin(), VMIE = VMap->end(); + VMI != VMIE; ++VMI) { + const Instruction *I = dyn_cast(VMI->first); + if (!I || !VMI->second) + continue; + + Instruction *NI = dyn_cast(VMI->second); + if (!NI || InlinedFunctionInfo->isSimplified(I, NI)) + continue; + + addAliasMD(I, NI); + } + + } else { + for (auto It = inst_begin(F), End = inst_end(F); It != End; ++It) { + Instruction *I = &(*It); + addAliasMD(I, I); + } } } } +void llvm::addAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap, + const DataLayout &DL, AAResults *CalleeAAR, + ClonedCodeInfo &InlinedFunctionInfo, + bool UseNoAliasIntrinsic) { + addAliasScopeMetadataImpl(&CB, /* F */ nullptr, &VMap, DL, CalleeAAR, + &InlinedFunctionInfo, UseNoAliasIntrinsic); +} + +void llvm::addAliasScopeMetadata(Function &F) { + addAliasScopeMetadataImpl(/*CB=*/nullptr, &F, /*VMap=*/nullptr, + F.getParent()->getDataLayout(), + /*CalleeAAR=*/nullptr, + /*InlinedFunctionInfo=*/nullptr, + /*UseNoAliasIntrinsic=*/false); +} + static bool MayContainThrowingOrExitingCallAfterCB(CallBase *Begin, ReturnInst *End) { @@ -2797,7 +2841,9 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, SAMetadataCloner.remap(FirstNewBlock, Caller->end()); // Add noalias metadata if necessary. - AddAliasScopeMetadata(CB, VMap, DL, CalleeAAR, InlinedFunctionInfo); + if (EnableNoAliasConversion) + addAliasScopeMetadata(CB, VMap, DL, CalleeAAR, InlinedFunctionInfo, + UseNoAliasIntrinsic); // Clone return attributes on the callsite into the calls within the inlined // function which feed into its return value. diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 99b7c7737f4ae..a87baca5a5878 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -105,11 +105,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbh_u32_e32 v2, v0 +; VI-NEXT: flat_load_dword v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_ffbh_u32_e32 v2, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -181,8 +181,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v1, v1 ; VI-NEXT: v_ffbh_u32_e32 v0, v0 @@ -261,8 +261,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v3, v3 ; VI-NEXT: v_ffbh_u32_e32 v2, v2 @@ -534,13 +534,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-LABEL: s_ctlz_zero_undef_i64_with_select: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_flbit_i32_b64 s0, s[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_ctlz_zero_undef_i64_with_select: @@ -605,15 +605,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v0 -; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VI-NEXT: v_ffbh_u32_e32 v1, v1 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc +; VI-NEXT: flat_load_ubyte v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; VI-NEXT: v_ffbh_u32_e32 v3, v3 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, 32, v3, vcc ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -706,21 +706,21 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_readfirstlane_b32 s2, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_readfirstlane_b32 s3, v0 -; VI-NEXT: s_lshl_b32 s2, s2, 8 -; VI-NEXT: s_or_b32 s2, s2, s3 -; VI-NEXT: s_lshl_b32 s3, s2, 16 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_flbit_i32_b32 s3, s3 -; VI-NEXT: s_cmp_lg_u32 s2, 0 -; VI-NEXT: s_cselect_b32 s2, s3, 32 +; VI-NEXT: flat_load_ubyte v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_readfirstlane_b32 s0, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_readfirstlane_b32 s1, v3 +; VI-NEXT: s_lshl_b32 s0, s0, 8 +; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_lshl_b32 s1, s0, 16 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_flbit_i32_b32 s1, s1 +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cselect_b32 s0, s1, 32 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -814,37 +814,37 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: s_add_u32 s4, s2, 2 +; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_add_u32 s2, s2, 1 -; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: s_add_u32 s2, s2, 2 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: flat_load_ubyte v3, v[4:5] -; VI-NEXT: flat_load_ubyte v4, v[6:7] -; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: flat_load_ubyte v3, v[0:1] +; VI-NEXT: flat_load_ubyte v4, v[4:5] +; VI-NEXT: flat_load_ubyte v5, v[6:7] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_ffbh_u32_e32 v0, v0 -; VI-NEXT: v_min_u32_e32 v2, 32, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_ffbh_u32_e32 v2, v2 +; VI-NEXT: v_min_u32_e32 v2, 32, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -965,29 +965,30 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v7, s5 ; VI-NEXT: v_mov_b32_e32 v6, s4 -; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v9, s5 ; VI-NEXT: v_mov_b32_e32 v8, s4 -; VI-NEXT: s_add_u32 s4, s2, 2 +; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: flat_load_ubyte v10, v[0:1] ; VI-NEXT: flat_load_ubyte v11, v[2:3] ; VI-NEXT: flat_load_ubyte v12, v[4:5] ; VI-NEXT: flat_load_ubyte v6, v[6:7] ; VI-NEXT: flat_load_ubyte v7, v[8:9] +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_add_u32 s4, s2, 1 -; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_add_u32 s2, s2, 2 ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: flat_load_ubyte v8, v[0:1] ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v10 ; VI-NEXT: s_waitcnt vmcnt(6) @@ -1001,19 +1002,18 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 ; VI-NEXT: v_ffbh_u32_e32 v4, v4 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v5, v8 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NEXT: v_ffbh_u32_e32 v0, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_min_u32_e32 v0, v0, v4 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_min_u32_e32 v0, 64, v0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v5 +; VI-NEXT: v_ffbh_u32_e32 v2, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2 +; VI-NEXT: v_min_u32_e32 v2, v2, v4 +; VI-NEXT: v_min_u32_e32 v2, 64, v2 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_zero_undef_i64_with_select: @@ -1119,12 +1119,12 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; VI-NEXT: v_ffbh_u32_e32 v2, v0 +; VI-NEXT: flat_load_ubyte v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; VI-NEXT: v_ffbh_u32_e32 v2, v2 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1259,10 +1259,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_flbit_i32_b64 s0, s[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1505,11 +1505,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbh_u32_e32 v2, v0 +; VI-NEXT: flat_load_dword v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_ffbh_u32_e32 v2, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1584,11 +1584,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbh_u32_e32 v2, v0 +; VI-NEXT: flat_load_dword v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_ffbh_u32_e32 v2, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1661,11 +1661,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbh_u32_e32 v2, v0 +; VI-NEXT: flat_load_ubyte v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_ffbh_u32_e32 v2, v2 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1858,13 +1858,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbh_u32_e32 v1, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_ffbh_u32_e32 v3, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1942,13 +1942,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbh_u32_e32 v1, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_ffbh_u32_e32 v3, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -2026,13 +2026,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbh_u32_e32 v1, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_ffbh_u32_e32 v3, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -2111,13 +2111,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbh_u32_e32 v1, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_ffbh_u32_e32 v3, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 73fddb53d1dcc..23d5cb73e8dd4 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -92,11 +92,11 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbl_b32_e32 v2, v0 +; VI-NEXT: flat_load_dword v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_ffbl_b32_e32 v2, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -168,8 +168,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v1, v1 ; VI-NEXT: v_ffbl_b32_e32 v0, v0 @@ -248,8 +248,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v3, v3 ; VI-NEXT: v_ffbl_b32_e32 v2, v2 @@ -511,13 +511,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-LABEL: s_cttz_zero_undef_i64_with_select: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b64 s2, s[2:3] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_ff1_i32_b64 s0, s[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_cttz_zero_undef_i64_with_select: @@ -581,14 +581,14 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbl_b32_e32 v1, v0 -; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc +; VI-NEXT: flat_load_ubyte v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_ffbl_b32_e32 v3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, 32, v3, vcc ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -677,17 +677,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_ffbl_b32_e32 v1, v0 -; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc +; VI-NEXT: flat_load_ubyte v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_ffbl_b32_e32 v3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, 32, v3, vcc ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -778,37 +778,37 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: s_add_u32 s4, s2, 2 +; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_add_u32 s2, s2, 1 -; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: s_add_u32 s2, s2, 2 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: flat_load_ubyte v3, v[4:5] -; VI-NEXT: flat_load_ubyte v4, v[6:7] -; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: flat_load_ubyte v3, v[0:1] +; VI-NEXT: flat_load_ubyte v4, v[4:5] +; VI-NEXT: flat_load_ubyte v5, v[6:7] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_ffbl_b32_e32 v0, v0 -; VI-NEXT: v_min_u32_e32 v2, 32, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_ffbl_b32_e32 v2, v2 +; VI-NEXT: v_min_u32_e32 v2, 32, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -929,55 +929,55 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v7, s5 ; VI-NEXT: v_mov_b32_e32 v6, s4 -; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v9, s5 ; VI-NEXT: v_mov_b32_e32 v8, s4 -; VI-NEXT: s_add_u32 s4, s2, 2 -; VI-NEXT: flat_load_ubyte v10, v[0:1] -; VI-NEXT: flat_load_ubyte v11, v[2:3] -; VI-NEXT: flat_load_ubyte v12, v[4:5] -; VI-NEXT: flat_load_ubyte v6, v[6:7] -; VI-NEXT: flat_load_ubyte v7, v[8:9] +; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: v_mov_b32_e32 v11, s3 ; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v10, s2 +; VI-NEXT: flat_load_ubyte v12, v[0:1] +; VI-NEXT: flat_load_ubyte v13, v[2:3] +; VI-NEXT: flat_load_ubyte v4, v[4:5] +; VI-NEXT: flat_load_ubyte v5, v[6:7] +; VI-NEXT: flat_load_ubyte v6, v[8:9] +; VI-NEXT: flat_load_ubyte v7, v[10:11] ; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_add_u32 s2, s2, 2 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_add_u32 s4, s2, 1 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_load_ubyte v8, v[0:1] ; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: flat_load_ubyte v3, v[4:5] -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v10 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v12 ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_e32 v4, v4, v11 +; VI-NEXT: v_or_b32_e32 v3, v3, v13 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v12 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v4, v3 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_ffbl_b32_e32 v4, v4 -; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v4, v7 +; VI-NEXT: v_ffbl_b32_e32 v3, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v8 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NEXT: v_ffbl_b32_e32 v0, v0 -; VI-NEXT: v_min_u32_e32 v0, v4, v0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_min_u32_e32 v0, 64, v0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v4 +; VI-NEXT: v_ffbl_b32_e32 v2, v2 +; VI-NEXT: v_min_u32_e32 v2, v3, v2 +; VI-NEXT: v_min_u32_e32 v2, 64, v2 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_cttz_zero_undef_i64_with_select: @@ -1091,36 +1091,36 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: s_add_u32 s4, s2, 2 +; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_add_u32 s2, s2, 1 -; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: s_add_u32 s2, s2, 2 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: flat_load_ubyte v3, v[4:5] -; VI-NEXT: flat_load_ubyte v4, v[6:7] -; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: flat_load_ubyte v3, v[0:1] +; VI-NEXT: flat_load_ubyte v4, v[4:5] +; VI-NEXT: flat_load_ubyte v5, v[6:7] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_ffbl_b32_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_ffbl_b32_e32 v2, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1213,36 +1213,36 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: s_add_u32 s4, s2, 2 +; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_add_u32 s2, s2, 1 -; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: s_add_u32 s2, s2, 2 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: flat_load_ubyte v3, v[4:5] -; VI-NEXT: flat_load_ubyte v4, v[6:7] -; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: flat_load_ubyte v3, v[0:1] +; VI-NEXT: flat_load_ubyte v4, v[4:5] +; VI-NEXT: flat_load_ubyte v5, v[6:7] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_ffbl_b32_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_ffbl_b32_e32 v2, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1338,39 +1338,39 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: s_add_u32 s4, s2, 2 +; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_add_u32 s2, s2, 1 -; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: s_add_u32 s2, s2, 2 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: flat_load_ubyte v3, v[4:5] -; VI-NEXT: flat_load_ubyte v4, v[6:7] -; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: flat_load_ubyte v3, v[0:1] +; VI-NEXT: flat_load_ubyte v4, v[4:5] +; VI-NEXT: flat_load_ubyte v5, v[6:7] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_ffbl_b32_e32 v0, v0 -; VI-NEXT: v_min_u32_e32 v0, 32, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_ffbl_b32_e32 v2, v2 +; VI-NEXT: v_min_u32_e32 v2, 32, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1455,11 +1455,11 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbl_b32_e32 v2, v0 +; VI-NEXT: flat_load_ubyte v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_ffbl_b32_e32 v2, v2 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1541,19 +1541,19 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_load_ubyte v4, v[0:1] ; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_ffbl_b32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_ffbl_b32_e32 v2, v2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 745e047348626..167fa469945a6 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1466,10 +1466,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; SI-NEXT: s_mov_b64 s[12:13], s[6:7] ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[12:15], 0 addr64 offset:2 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s8, s2 -; SI-NEXT: s_mov_b32 s9, s3 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v2 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -1485,15 +1487,15 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_alignbit_b32 v4, v4, v5, 24 ; SI-NEXT: v_or_b32_e32 v4, v4, v6 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s8, 0x4000405 +; VI-NEXT: s_mov_b32 s12, 0x4000405 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v0 @@ -1515,10 +1517,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; VI-NEXT: flat_load_ubyte v4, v[0:1] ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s4, s2 -; VI-NEXT: s_mov_b32 s5, s3 -; VI-NEXT: s_mov_b32 s2, s6 -; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v6 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v6 @@ -1531,9 +1535,9 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; VI-NEXT: v_or_b32_e32 v4, v5, v4 ; VI-NEXT: v_or_b32_e32 v5, v7, v3 ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_perm_b32 v4, v4, v5, s8 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 +; VI-NEXT: v_perm_b32 v4, v4, v5, s12 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: @@ -1628,21 +1632,23 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out2, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_2_uses: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; SI-NEXT: v_and_b32_e32 v6, 0xff00, v4 @@ -1664,29 +1670,31 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v4i8_to_v4f32_2_uses: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v5, 0xffffff00 +; VI-NEXT: v_mov_b32_e32 v6, 9 +; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v6, 9 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v7, 0x900 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s2 -; VI-NEXT: s_mov_b32 s5, s3 -; VI-NEXT: s_mov_b32 s2, s6 -; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 @@ -1696,14 +1704,14 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; VI-NEXT: v_add_u16_e32 v9, 9, v4 ; VI-NEXT: v_and_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_nop 0 ; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 0x900, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: load_v4i8_to_v4f32_2_uses: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll index 6507976872410..50e4fd5de14c3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll @@ -180,12 +180,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 5, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 5, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_nc_u32_e32 v17, s0, v16 -; GCN-NEXT: v_add_nc_u32_e32 v16, s1, v16 -; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:16 -; GCN-NEXT: ds_load_b128 v[0:3], v17 +; GCN-NEXT: v_add_nc_u32_e32 v32, s0, v48 +; GCN-NEXT: v_dual_mov_b32 v57, s1 :: v_dual_add_nc_u32 v56, s1, v48 +; GCN-NEXT: ds_load_b128 v[4:7], v32 offset:16 +; GCN-NEXT: ds_load_b128 v[0:3], v32 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 @@ -194,66 +194,61 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15] +; GCN-NEXT: ds_load_b128 v[4:7], v32 offset:2064 +; GCN-NEXT: ds_load_b128 v[0:3], v32 offset:2048 +; GCN-NEXT: ds_load_b128 v[20:23], v32 offset:6160 +; GCN-NEXT: ds_load_b128 v[16:19], v32 offset:6144 +; GCN-NEXT: ds_load_b128 v[28:31], v32 offset:12304 +; GCN-NEXT: ds_load_b128 v[24:27], v32 offset:12288 +; GCN-NEXT: ds_load_b128 v[36:39], v32 offset:20496 +; GCN-NEXT: ds_load_b128 v[32:35], v32 offset:20480 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:16 -; GCN-NEXT: ds_store_b128 v16, v[8:11] -; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:2064 -; GCN-NEXT: ds_load_b128 v[0:3], v17 offset:2048 -; GCN-NEXT: v_mov_b32_e32 v16, s1 +; GCN-NEXT: s_waitcnt lgkmcnt(2) +; GCN-NEXT: v_dual_mov_b32 v55, v31 :: v_dual_mov_b32 v54, v30 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_dual_mov_b32 v47, v39 :: v_dual_mov_b32 v46, v38 +; GCN-NEXT: v_dual_mov_b32 v45, v37 :: v_dual_mov_b32 v44, v36 +; GCN-NEXT: v_dual_mov_b32 v43, v35 :: v_dual_mov_b32 v42, v34 +; GCN-NEXT: v_dual_mov_b32 v41, v33 :: v_dual_mov_b32 v40, v32 +; GCN-NEXT: v_dual_mov_b32 v53, v29 :: v_dual_mov_b32 v52, v28 +; GCN-NEXT: v_dual_mov_b32 v51, v27 :: v_dual_mov_b32 v50, v26 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[40:47], v[32:39], v[32:39], v[40:47] +; GCN-NEXT: v_dual_mov_b32 v39, v7 :: v_dual_mov_b32 v38, v6 +; GCN-NEXT: v_dual_mov_b32 v37, v5 :: v_dual_mov_b32 v36, v4 +; GCN-NEXT: v_dual_mov_b32 v35, v3 :: v_dual_mov_b32 v34, v2 +; GCN-NEXT: v_dual_mov_b32 v33, v1 :: v_dual_mov_b32 v32, v0 +; GCN-NEXT: v_dual_mov_b32 v49, v25 :: v_dual_mov_b32 v48, v24 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[32:39], v[0:7], v[0:7], v[32:39] +; GCN-NEXT: v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v17 +; GCN-NEXT: v_dual_mov_b32 v2, v18 :: v_dual_mov_b32 v3, v19 +; GCN-NEXT: v_dual_mov_b32 v4, v20 :: v_dual_mov_b32 v5, v21 +; GCN-NEXT: v_dual_mov_b32 v6, v22 :: v_dual_mov_b32 v7, v23 +; GCN-NEXT: ds_store_b128 v56, v[12:15] offset:16 +; GCN-NEXT: ds_store_b128 v56, v[8:11] +; GCN-NEXT: ds_store_b128 v57, v[36:39] offset:2064 +; GCN-NEXT: ds_store_b128 v57, v[32:35] offset:2048 +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[0:7], v[16:23], v[16:23], v[0:7] ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 -; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 -; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 -; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15] ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:2064 -; GCN-NEXT: ds_store_b128 v16, v[8:11] offset:2048 -; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:6160 -; GCN-NEXT: ds_load_b128 v[0:3], v17 offset:6144 +; GCN-NEXT: ds_store_b128 v57, v[4:7] offset:4112 +; GCN-NEXT: ds_store_b128 v57, v[0:3] offset:4096 +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[48:55], v[24:31], v[24:31], v[48:55] ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 -; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 -; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 -; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15] ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:4112 -; GCN-NEXT: ds_store_b128 v16, v[8:11] offset:4096 -; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:12304 -; GCN-NEXT: ds_load_b128 v[0:3], v17 offset:12288 +; GCN-NEXT: ds_store_b128 v57, v[52:55] offset:6160 +; GCN-NEXT: ds_store_b128 v57, v[48:51] offset:6144 +; GCN-NEXT: ds_store_b128 v57, v[44:47] offset:8208 +; GCN-NEXT: ds_store_b128 v57, v[40:43] offset:8192 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 -; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 -; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 -; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15] ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:6160 -; GCN-NEXT: ds_store_b128 v16, v[8:11] offset:6144 -; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:20496 -; GCN-NEXT: ds_load_b128 v[0:3], v17 offset:20480 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 -; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 -; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 -; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15] ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:8208 -; GCN-NEXT: ds_store_b128 v16, v[8:11] offset:8192 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; GCN-NEXT: s_endpgm ; @@ -262,12 +257,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v16, 5, v0 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v48, 5, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v17, s0, v16 -; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v16, s1, v16 -; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:16 -; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 +; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v32, s0, v48 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v57, s1 :: v_dual_add_nc_u32 v56, s1, v48 +; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v32 offset:16 +; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v32 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 @@ -276,66 +271,61 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) ; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15] +; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v32 offset:2064 +; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v32 offset:2048 +; EXACTCUTOFF-NEXT: ds_load_b128 v[20:23], v32 offset:6160 +; EXACTCUTOFF-NEXT: ds_load_b128 v[16:19], v32 offset:6144 +; EXACTCUTOFF-NEXT: ds_load_b128 v[28:31], v32 offset:12304 +; EXACTCUTOFF-NEXT: ds_load_b128 v[24:27], v32 offset:12288 +; EXACTCUTOFF-NEXT: ds_load_b128 v[36:39], v32 offset:20496 +; EXACTCUTOFF-NEXT: ds_load_b128 v[32:35], v32 offset:20480 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:16 -; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[8:11] -; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:2064 -; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 offset:2048 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v16, s1 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(2) +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v55, v31 :: v_dual_mov_b32 v54, v30 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v47, v39 :: v_dual_mov_b32 v46, v38 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v45, v37 :: v_dual_mov_b32 v44, v36 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v43, v35 :: v_dual_mov_b32 v42, v34 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v41, v33 :: v_dual_mov_b32 v40, v32 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v53, v29 :: v_dual_mov_b32 v52, v28 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v51, v27 :: v_dual_mov_b32 v50, v26 +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_3) +; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[40:47], v[32:39], v[32:39], v[40:47] +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v39, v7 :: v_dual_mov_b32 v38, v6 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v37, v5 :: v_dual_mov_b32 v36, v4 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v35, v3 :: v_dual_mov_b32 v34, v2 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v33, v1 :: v_dual_mov_b32 v32, v0 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v49, v25 :: v_dual_mov_b32 v48, v24 +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) +; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[32:39], v[0:7], v[0:7], v[32:39] +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v17 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v2, v18 :: v_dual_mov_b32 v3, v19 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v4, v20 :: v_dual_mov_b32 v5, v21 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v6, v22 :: v_dual_mov_b32 v7, v23 +; EXACTCUTOFF-NEXT: ds_store_b128 v56, v[12:15] offset:16 +; EXACTCUTOFF-NEXT: ds_store_b128 v56, v[8:11] +; EXACTCUTOFF-NEXT: ds_store_b128 v57, v[36:39] offset:2064 +; EXACTCUTOFF-NEXT: ds_store_b128 v57, v[32:35] offset:2048 +; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[0:7], v[16:23], v[16:23], v[0:7] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:2064 -; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[8:11] offset:2048 -; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:6160 -; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 offset:6144 +; EXACTCUTOFF-NEXT: ds_store_b128 v57, v[4:7] offset:4112 +; EXACTCUTOFF-NEXT: ds_store_b128 v57, v[0:3] offset:4096 +; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[48:55], v[24:31], v[24:31], v[48:55] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:4112 -; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[8:11] offset:4096 -; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:12304 -; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 offset:12288 +; EXACTCUTOFF-NEXT: ds_store_b128 v57, v[52:55] offset:6160 +; EXACTCUTOFF-NEXT: ds_store_b128 v57, v[48:51] offset:6144 +; EXACTCUTOFF-NEXT: ds_store_b128 v57, v[44:47] offset:8208 +; EXACTCUTOFF-NEXT: ds_store_b128 v57, v[40:43] offset:8192 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:6160 -; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[8:11] offset:6144 -; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:20496 -; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 offset:20480 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:8208 -; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[8:11] offset:8192 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll index 371b4f070094d..f4c21b01bf7f6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll @@ -9,265 +9,957 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-MINREG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-MINREG-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-MINREG-NEXT: v_mov_b32_e32 v2, 1.0 -; GCN-MINREG-NEXT: v_mov_b32_e32 v1, 2.0 +; GCN-MINREG-NEXT: v_mov_b32_e32 v40, 1.0 +; GCN-MINREG-NEXT: v_mov_b32_e32 v39, 2.0 ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MINREG-NEXT: v_add_u32_e32 v4, s0, v0 -; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:112 -; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:96 -; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:80 -; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:64 -; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 -; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:16 -; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:32 -; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:48 +; GCN-MINREG-NEXT: v_add_u32_e32 v5, s0, v0 +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v5 offset:112 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v5 offset:96 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v5 offset:80 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v5 offset:64 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v5 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v5 offset:16 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v5 offset:32 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v5 offset:48 ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] -; GCN-MINREG-NEXT: v_add_u32_e32 v5, s1, v0 -; GCN-MINREG-NEXT: v_mov_b32_e32 v0, s1 -; GCN-MINREG-NEXT: v_add_u32_e32 v3, 0x6000, v4 +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v40, v39, a[0:31] +; GCN-MINREG-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-MINREG-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-MINREG-NEXT: ds_read_b128 v[6:9], v5 offset:8192 +; GCN-MINREG-NEXT: s_mov_b32 s14, -1 +; GCN-MINREG-NEXT: s_mov_b32 s15, 0xe00000 +; GCN-MINREG-NEXT: s_add_u32 s12, s12, s11 +; GCN-MINREG-NEXT: s_addc_u32 s13, s13, 0 +; GCN-MINREG-NEXT: ds_read_b128 v[34:37], v5 offset:8304 +; GCN-MINREG-NEXT: ds_read_b128 v[30:33], v5 offset:8288 +; GCN-MINREG-NEXT: ds_read_b128 v[26:29], v5 offset:8272 +; GCN-MINREG-NEXT: ds_read_b128 v[22:25], v5 offset:8256 +; GCN-MINREG-NEXT: ds_read_b128 v[18:21], v5 offset:8240 +; GCN-MINREG-NEXT: ds_read_b128 v[14:17], v5 offset:8224 +; GCN-MINREG-NEXT: ds_read_b128 v[10:13], v5 offset:8208 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(7) +; GCN-MINREG-NEXT: buffer_store_dword v6, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: v_add_u32_e32 v4, 0x6000, v5 +; GCN-MINREG-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(6) +; GCN-MINREG-NEXT: buffer_store_dword v37, off, s[12:15], 0 offset:124 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: ds_read_b128 v[6:9], v5 offset:24576 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(1) +; GCN-MINREG-NEXT: buffer_store_dword v10, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v11, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v12, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v13, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v14, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:52 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:56 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:60 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v22, off, s[12:15], 0 offset:64 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v23, off, s[12:15], 0 offset:68 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v24, off, s[12:15], 0 offset:72 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v25, off, s[12:15], 0 offset:76 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v26, off, s[12:15], 0 offset:80 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v27, off, s[12:15], 0 offset:84 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v28, off, s[12:15], 0 offset:88 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v29, off, s[12:15], 0 offset:92 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v30, off, s[12:15], 0 offset:96 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v31, off, s[12:15], 0 offset:100 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v32, off, s[12:15], 0 offset:104 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v33, off, s[12:15], 0 offset:108 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v34, off, s[12:15], 0 offset:112 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v35, off, s[12:15], 0 offset:116 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v36, off, s[12:15], 0 offset:120 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: ds_read_b128 v[34:37], v5 offset:24688 +; GCN-MINREG-NEXT: ds_read_b128 v[30:33], v5 offset:24672 +; GCN-MINREG-NEXT: ds_read_b128 v[26:29], v5 offset:24656 +; GCN-MINREG-NEXT: ds_read_b128 v[22:25], v5 offset:24640 +; GCN-MINREG-NEXT: ds_read_b128 v[18:21], v5 offset:24624 +; GCN-MINREG-NEXT: ds_read_b128 v[14:17], v5 offset:24608 +; GCN-MINREG-NEXT: ds_read_b128 v[10:13], v5 offset:24592 +; GCN-MINREG-NEXT: ds_read_b128 a[60:63], v5 offset:49264 +; GCN-MINREG-NEXT: ds_read_b128 a[56:59], v5 offset:49248 +; GCN-MINREG-NEXT: ds_read_b128 a[52:55], v5 offset:49232 +; GCN-MINREG-NEXT: ds_read_b128 a[48:51], v5 offset:49216 +; GCN-MINREG-NEXT: ds_read_b128 a[44:47], v5 offset:49200 +; GCN-MINREG-NEXT: ds_read_b128 a[40:43], v5 offset:49184 +; GCN-MINREG-NEXT: ds_read_b128 a[36:39], v5 offset:49168 +; GCN-MINREG-NEXT: ds_read_b128 a[32:35], v5 offset:49152 +; GCN-MINREG-NEXT: buffer_store_dword a0, off, s[12:15], 0 offset:128 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: v_add_u32_e32 v41, s1, v0 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(14) +; GCN-MINREG-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:256 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v63, a10 ; Reload Reuse +; GCN-MINREG-NEXT: buffer_store_dword a1, off, s[12:15], 0 offset:132 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword a2, off, s[12:15], 0 offset:136 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword a3, off, s[12:15], 0 offset:140 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword a4, off, s[12:15], 0 offset:144 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword a5, off, s[12:15], 0 offset:148 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword a6, off, s[12:15], 0 offset:152 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword a7, off, s[12:15], 0 offset:156 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword a8, off, s[12:15], 0 offset:160 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword a9, off, s[12:15], 0 offset:164 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v62, a11 ; Reload Reuse +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v61, a12 ; Reload Reuse +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v60, a13 ; Reload Reuse +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v58, a15 ; Reload Reuse +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v57, a16 ; Reload Reuse +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v56, a17 ; Reload Reuse +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v55, a18 ; Reload Reuse +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v54, a19 ; Reload Reuse +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v53, a20 ; Reload Reuse +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v52, a21 ; Reload Reuse +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v51, a22 ; Reload Reuse +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v50, a23 ; Reload Reuse +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v49, a24 ; Reload Reuse +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v48, a25 ; Reload Reuse +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v47, a26 ; Reload Reuse +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v46, a27 ; Reload Reuse +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v45, a28 ; Reload Reuse +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v44, a29 ; Reload Reuse +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v43, a30 ; Reload Reuse +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v42, a31 ; Reload Reuse +; GCN-MINREG-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:260 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:264 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:268 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(8) +; GCN-MINREG-NEXT: buffer_store_dword v10, off, s[12:15], 0 offset:272 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v11, off, s[12:15], 0 offset:276 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v12, off, s[12:15], 0 offset:280 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v13, off, s[12:15], 0 offset:284 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v14, off, s[12:15], 0 offset:288 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:292 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:296 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:300 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:304 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:308 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:312 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:316 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v22, off, s[12:15], 0 offset:320 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v23, off, s[12:15], 0 offset:324 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v24, off, s[12:15], 0 offset:328 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v25, off, s[12:15], 0 offset:332 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v26, off, s[12:15], 0 offset:336 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v27, off, s[12:15], 0 offset:340 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v28, off, s[12:15], 0 offset:344 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v29, off, s[12:15], 0 offset:348 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v30, off, s[12:15], 0 offset:352 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v31, off, s[12:15], 0 offset:356 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v37, off, s[12:15], 0 offset:380 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v40, v39, a[32:63] +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:57456 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:57440 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:57424 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:57408 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:57344 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:57360 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:57376 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:57392 +; GCN-MINREG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:128 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:132 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:136 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:140 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:144 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:148 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:152 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:156 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:160 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:164 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: v_mov_b32_e32 v38, s1 +; GCN-MINREG-NEXT: v_mov_b32_e32 v10, v63 +; GCN-MINREG-NEXT: v_mov_b32_e32 v11, v62 +; GCN-MINREG-NEXT: v_mov_b32_e32 v12, v61 +; GCN-MINREG-NEXT: v_mov_b32_e32 v13, v60 +; GCN-MINREG-NEXT: v_mov_b32_e32 v14, v59 +; GCN-MINREG-NEXT: v_mov_b32_e32 v15, v58 +; GCN-MINREG-NEXT: v_mov_b32_e32 v16, v57 +; GCN-MINREG-NEXT: v_mov_b32_e32 v17, v56 +; GCN-MINREG-NEXT: v_mov_b32_e32 v18, v55 +; GCN-MINREG-NEXT: v_mov_b32_e32 v19, v54 +; GCN-MINREG-NEXT: v_mov_b32_e32 v20, v53 +; GCN-MINREG-NEXT: v_mov_b32_e32 v21, v52 +; GCN-MINREG-NEXT: v_mov_b32_e32 v22, v51 +; GCN-MINREG-NEXT: v_mov_b32_e32 v23, v50 +; GCN-MINREG-NEXT: v_mov_b32_e32 v24, v49 +; GCN-MINREG-NEXT: v_mov_b32_e32 v25, v48 +; GCN-MINREG-NEXT: v_mov_b32_e32 v26, v47 +; GCN-MINREG-NEXT: v_mov_b32_e32 v27, v46 +; GCN-MINREG-NEXT: v_mov_b32_e32 v28, v45 +; GCN-MINREG-NEXT: v_mov_b32_e32 v29, v44 +; GCN-MINREG-NEXT: v_mov_b32_e32 v30, v43 +; GCN-MINREG-NEXT: s_waitcnt vmcnt(0) +; GCN-MINREG-NEXT: v_mov_b32_e32 v31, v42 +; GCN-MINREG-NEXT: ds_write_b128 v41, v[28:31] offset:112 +; GCN-MINREG-NEXT: ds_write_b128 v41, v[24:27] offset:96 +; GCN-MINREG-NEXT: ds_write_b128 v41, v[20:23] offset:80 +; GCN-MINREG-NEXT: ds_write_b128 v41, v[16:19] offset:64 +; GCN-MINREG-NEXT: ds_write_b128 v41, v[12:15] offset:48 +; GCN-MINREG-NEXT: buffer_store_dword v32, off, s[12:15], 0 offset:360 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v33, off, s[12:15], 0 offset:364 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v34, off, s[12:15], 0 offset:368 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v35, off, s[12:15], 0 offset:372 ; 4-byte Folded Spill +; GCN-MINREG-NEXT: buffer_store_dword v36, off, s[12:15], 0 offset:376 ; 4-byte Folded Spill ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: ds_write_b128 v5, a[28:31] offset:112 -; GCN-MINREG-NEXT: ds_write_b128 v5, a[24:27] offset:96 -; GCN-MINREG-NEXT: ds_write_b128 v5, a[20:23] offset:80 -; GCN-MINREG-NEXT: ds_write_b128 v5, a[16:19] offset:64 -; GCN-MINREG-NEXT: ds_write_b128 v5, a[12:15] offset:48 -; GCN-MINREG-NEXT: ds_write_b128 v5, a[8:11] offset:32 -; GCN-MINREG-NEXT: ds_write_b128 v5, a[4:7] offset:16 -; GCN-MINREG-NEXT: ds_write_b128 v5, a[0:3] -; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:8304 -; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:8288 -; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:8272 -; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:8256 -; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:8240 -; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:8224 -; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:8208 -; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:8192 -; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 2 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:8288 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:8304 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:8256 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:8272 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:8224 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:8240 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:8192 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:8208 -; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:24688 -; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:24672 -; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:24656 -; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:24640 -; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:24624 -; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:24608 -; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:24592 -; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:24576 -; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 2 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:16480 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:16496 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:16448 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:16464 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:16416 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:16432 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:16384 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:16400 -; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:49264 -; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:49248 -; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:49232 -; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:49216 -; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:49200 -; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:49184 -; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:49168 -; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:49152 -; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-MINREG-NEXT: ds_write_b128 v41, v[0:3] +; GCN-MINREG-NEXT: ds_write_b128 v41, v[4:7] offset:16 +; GCN-MINREG-NEXT: ds_write_b128 v41, v[8:11] offset:32 +; GCN-MINREG-NEXT: ds_write_b128 v38, a[56:59] offset:24672 +; GCN-MINREG-NEXT: ds_write_b128 v38, a[60:63] offset:24688 +; GCN-MINREG-NEXT: ds_write_b128 v38, a[48:51] offset:24640 +; GCN-MINREG-NEXT: ds_write_b128 v38, a[52:55] offset:24656 +; GCN-MINREG-NEXT: ds_write_b128 v38, a[40:43] offset:24608 +; GCN-MINREG-NEXT: ds_write_b128 v38, a[44:47] offset:24624 +; GCN-MINREG-NEXT: ds_write_b128 v38, a[32:35] offset:24576 +; GCN-MINREG-NEXT: ds_write_b128 v38, a[36:39] offset:24592 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(14) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v40, v39, a[0:31] +; GCN-MINREG-NEXT: buffer_load_dword a32, off, s[12:15], 0 offset:256 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a33, off, s[12:15], 0 offset:260 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a34, off, s[12:15], 0 offset:264 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a35, off, s[12:15], 0 offset:268 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a36, off, s[12:15], 0 offset:272 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a37, off, s[12:15], 0 offset:276 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a38, off, s[12:15], 0 offset:280 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a39, off, s[12:15], 0 offset:284 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a40, off, s[12:15], 0 offset:288 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a41, off, s[12:15], 0 offset:292 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a42, off, s[12:15], 0 offset:296 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a43, off, s[12:15], 0 offset:300 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a44, off, s[12:15], 0 offset:304 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a45, off, s[12:15], 0 offset:308 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a46, off, s[12:15], 0 offset:312 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a47, off, s[12:15], 0 offset:316 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a48, off, s[12:15], 0 offset:320 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a49, off, s[12:15], 0 offset:324 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a50, off, s[12:15], 0 offset:328 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: ds_write_b128 v38, a[24:27] offset:32864 +; GCN-MINREG-NEXT: ds_write_b128 v38, a[28:31] offset:32880 +; GCN-MINREG-NEXT: ds_write_b128 v38, a[16:19] offset:32832 +; GCN-MINREG-NEXT: ds_write_b128 v38, a[20:23] offset:32848 +; GCN-MINREG-NEXT: ds_write_b128 v38, a[8:11] offset:32800 +; GCN-MINREG-NEXT: ds_write_b128 v38, a[12:15] offset:32816 +; GCN-MINREG-NEXT: ds_write_b128 v38, a[0:3] offset:32768 +; GCN-MINREG-NEXT: ds_write_b128 v38, a[4:7] offset:32784 +; GCN-MINREG-NEXT: buffer_load_dword a0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a4, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a5, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a6, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a7, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a8, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a9, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a10, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a11, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a12, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a13, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a14, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a15, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a16, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a17, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a18, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a19, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a20, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a21, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a22, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a23, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a24, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a25, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a26, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a27, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a28, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a29, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a30, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a31, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a51, off, s[12:15], 0 offset:332 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a52, off, s[12:15], 0 offset:336 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a53, off, s[12:15], 0 offset:340 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a54, off, s[12:15], 0 offset:344 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a55, off, s[12:15], 0 offset:348 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a56, off, s[12:15], 0 offset:352 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a57, off, s[12:15], 0 offset:356 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a58, off, s[12:15], 0 offset:360 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a59, off, s[12:15], 0 offset:364 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a60, off, s[12:15], 0 offset:368 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a61, off, s[12:15], 0 offset:372 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a62, off, s[12:15], 0 offset:376 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: buffer_load_dword a63, off, s[12:15], 0 offset:380 ; 4-byte Folded Reload +; GCN-MINREG-NEXT: s_waitcnt vmcnt(13) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v40, v39, a[0:31] ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 2 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:24672 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:24688 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:24640 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:24656 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:24608 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:24624 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:24576 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:24592 -; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:57456 -; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:57440 -; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:57424 -; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:57408 -; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:57344 -; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:57360 -; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:57376 -; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:57392 -; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: s_nop 7 ; GCN-MINREG-NEXT: s_nop 7 ; GCN-MINREG-NEXT: s_nop 2 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:32864 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:32880 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:32832 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:32848 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:32800 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:32816 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:32768 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:32784 -; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v33, a31 +; GCN-MINREG-NEXT: s_waitcnt vmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v40, v39, a[32:63] +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v32, a30 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v31, a29 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v30, a28 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v29, a27 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v28, a26 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v27, a25 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v26, a24 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v25, a23 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v24, a22 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v23, a21 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v22, a20 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v21, a19 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v20, a18 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v19, a17 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v18, a16 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v17, a15 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v16, a14 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v15, a13 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v14, a12 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v13, a11 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v12, a10 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v11, a9 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v10, a8 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v9, a7 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v8, a6 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v7, a5 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v6, a4 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v5, a3 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v4, a2 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v3, a1 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v2, a0 +; GCN-MINREG-NEXT: ds_write_b128 v38, a[56:59] offset:16480 +; GCN-MINREG-NEXT: ds_write_b128 v38, a[60:63] offset:16496 +; GCN-MINREG-NEXT: ds_write_b128 v38, a[48:51] offset:16448 +; GCN-MINREG-NEXT: ds_write_b128 v38, a[52:55] offset:16464 +; GCN-MINREG-NEXT: ds_write_b128 v38, a[40:43] offset:16416 +; GCN-MINREG-NEXT: ds_write_b128 v38, a[44:47] offset:16432 +; GCN-MINREG-NEXT: ds_write_b128 v38, a[32:35] offset:16384 +; GCN-MINREG-NEXT: ds_write_b128 v38, a[36:39] offset:16400 +; GCN-MINREG-NEXT: ds_write_b128 v38, v[26:29] offset:8288 +; GCN-MINREG-NEXT: ds_write_b128 v38, v[30:33] offset:8304 +; GCN-MINREG-NEXT: ds_write_b128 v38, v[18:21] offset:8256 +; GCN-MINREG-NEXT: ds_write_b128 v38, v[22:25] offset:8272 +; GCN-MINREG-NEXT: ds_write_b128 v38, v[10:13] offset:8224 +; GCN-MINREG-NEXT: ds_write_b128 v38, v[14:17] offset:8240 +; GCN-MINREG-NEXT: ds_write_b128 v38, v[2:5] offset:8192 +; GCN-MINREG-NEXT: ds_write_b128 v38, v[6:9] offset:8208 ; GCN-MINREG-NEXT: s_endpgm ; ; GCN-MAXOCC-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; GCN-MAXOCC: ; %bb.0: ; %entry ; GCN-MAXOCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-MAXOCC-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v1, 7, v0 -; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 1.0 -; GCN-MAXOCC-NEXT: v_mov_b32_e32 v3, 2.0 +; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-MAXOCC-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-MAXOCC-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, s0, v1 -; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:112 -; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:96 -; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:80 -; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:64 -; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 -; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:16 -; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:32 -; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:48 +; GCN-MAXOCC-NEXT: v_add_u32_e32 v1, s0, v0 +; GCN-MAXOCC-NEXT: ds_read_b128 v[2:5], v1 +; GCN-MAXOCC-NEXT: ds_read_b128 v[30:33], v1 offset:112 +; GCN-MAXOCC-NEXT: ds_read_b128 v[26:29], v1 offset:96 +; GCN-MAXOCC-NEXT: ds_read_b128 v[22:25], v1 offset:80 +; GCN-MAXOCC-NEXT: ds_read_b128 v[18:21], v1 offset:64 +; GCN-MAXOCC-NEXT: ds_read_b128 v[6:9], v1 offset:16 +; GCN-MAXOCC-NEXT: ds_read_b128 v[10:13], v1 offset:32 +; GCN-MAXOCC-NEXT: ds_read_b128 v[14:17], v1 offset:48 ; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a0, v2 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a1, v3 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a2, v4 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a3, v5 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a4, v6 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a5, v7 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a6, v8 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a7, v9 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a8, v10 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a9, v11 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a10, v12 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a11, v13 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a12, v14 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a13, v15 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a14, v16 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a15, v17 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a16, v18 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a17, v19 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a18, v20 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a19, v21 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a20, v22 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a21, v23 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a22, v24 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a23, v25 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a24, v26 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a25, v27 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a26, v28 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a27, v29 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a28, v30 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a29, v31 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a30, v32 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a31, v33 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 1.0 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v3, 2.0 +; GCN-MAXOCC-NEXT: s_mov_b32 s14, -1 +; GCN-MAXOCC-NEXT: s_mov_b32 s15, 0xe00000 ; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-MAXOCC-NEXT: v_add_u32_e32 v1, s1, v1 +; GCN-MAXOCC-NEXT: s_add_u32 s12, s12, s11 +; GCN-MAXOCC-NEXT: s_addc_u32 s13, s13, 0 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v4, v3 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v3, v2 +; GCN-MAXOCC-NEXT: v_add_u32_e32 v2, 0x6000, v1 ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 1 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:112 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:96 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:80 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:64 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:48 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:32 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:16 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] -; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:8304 -; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:8288 -; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:8272 -; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:8256 -; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:8240 -; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:8224 -; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:8208 -; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:8192 +; GCN-MAXOCC-NEXT: s_nop 5 +; GCN-MAXOCC-NEXT: buffer_store_dword a0, off, s[12:15], 0 offset:128 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v63, a4 ; Reload Reuse +; GCN-MAXOCC-NEXT: buffer_store_dword a1, off, s[12:15], 0 offset:132 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a2, off, s[12:15], 0 offset:136 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a3, off, s[12:15], 0 offset:140 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v62, a5 ; Reload Reuse +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v61, a6 ; Reload Reuse +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v60, a7 ; Reload Reuse +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v59, a8 ; Reload Reuse +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v58, a9 ; Reload Reuse +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v57, a10 ; Reload Reuse +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v55, a12 ; Reload Reuse +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v54, a13 ; Reload Reuse +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v53, a14 ; Reload Reuse +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v52, a15 ; Reload Reuse +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v51, a16 ; Reload Reuse +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v50, a17 ; Reload Reuse +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v49, a18 ; Reload Reuse +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v48, a19 ; Reload Reuse +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v47, a20 ; Reload Reuse +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v46, a21 ; Reload Reuse +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v45, a22 ; Reload Reuse +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v44, a23 ; Reload Reuse +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v43, a24 ; Reload Reuse +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v42, a25 ; Reload Reuse +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v41, a26 ; Reload Reuse +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v40, a27 ; Reload Reuse +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v39, a28 ; Reload Reuse +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v38, a29 ; Reload Reuse +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v37, a30 ; Reload Reuse +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v36, a31 ; Reload Reuse +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v1 offset:8304 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v1 offset:8288 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v1 offset:8272 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v1 offset:8256 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v1 offset:8240 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v1 offset:8224 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v1 offset:8208 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v1 offset:8192 ; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-MAXOCC-NEXT: v_mov_b32_e32 v1, s1 -; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) -; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) -; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v4, a[0:31] ; GCN-MAXOCC-NEXT: s_nop 7 ; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 1 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:8288 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:8304 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:8256 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:8272 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:8224 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:8240 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:8192 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:8208 -; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:24688 -; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:24672 -; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:24656 -; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:24640 -; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:24624 -; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:24608 -; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:24592 -; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:24576 +; GCN-MAXOCC-NEXT: s_nop 2 +; GCN-MAXOCC-NEXT: buffer_store_dword a0, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: s_nop 0 +; GCN-MAXOCC-NEXT: buffer_store_dword a1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a4, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a5, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a6, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a7, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a8, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a9, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a10, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a11, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a12, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a13, off, s[12:15], 0 offset:52 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a14, off, s[12:15], 0 offset:56 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a15, off, s[12:15], 0 offset:60 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a16, off, s[12:15], 0 offset:64 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a17, off, s[12:15], 0 offset:68 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a18, off, s[12:15], 0 offset:72 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a19, off, s[12:15], 0 offset:76 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a20, off, s[12:15], 0 offset:80 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a21, off, s[12:15], 0 offset:84 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a22, off, s[12:15], 0 offset:88 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a23, off, s[12:15], 0 offset:92 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a24, off, s[12:15], 0 offset:96 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a25, off, s[12:15], 0 offset:100 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a26, off, s[12:15], 0 offset:104 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a27, off, s[12:15], 0 offset:108 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a28, off, s[12:15], 0 offset:112 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a29, off, s[12:15], 0 offset:116 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a30, off, s[12:15], 0 offset:120 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword a31, off, s[12:15], 0 offset:124 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v1 offset:24688 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v1 offset:24672 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v1 offset:24656 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v1 offset:24640 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v1 offset:24624 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v1 offset:24608 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v1 offset:24592 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v1 offset:24576 +; GCN-MAXOCC-NEXT: ds_read_b128 a[60:63], v1 offset:49264 +; GCN-MAXOCC-NEXT: ds_read_b128 a[56:59], v1 offset:49248 +; GCN-MAXOCC-NEXT: ds_read_b128 a[52:55], v1 offset:49232 +; GCN-MAXOCC-NEXT: ds_read_b128 a[48:51], v1 offset:49216 +; GCN-MAXOCC-NEXT: ds_read_b128 a[44:47], v1 offset:49200 +; GCN-MAXOCC-NEXT: ds_read_b128 a[40:43], v1 offset:49184 +; GCN-MAXOCC-NEXT: ds_read_b128 a[36:39], v1 offset:49168 +; GCN-MAXOCC-NEXT: ds_read_b128 a[32:35], v1 offset:49152 +; GCN-MAXOCC-NEXT: ds_read_b128 v[4:7], v2 offset:57344 +; GCN-MAXOCC-NEXT: ds_read_b128 v[32:35], v2 offset:57456 +; GCN-MAXOCC-NEXT: ds_read_b128 v[28:31], v2 offset:57440 +; GCN-MAXOCC-NEXT: ds_read_b128 v[24:27], v2 offset:57424 +; GCN-MAXOCC-NEXT: ds_read_b128 v[20:23], v2 offset:57408 +; GCN-MAXOCC-NEXT: ds_read_b128 v[8:11], v2 offset:57360 +; GCN-MAXOCC-NEXT: ds_read_b128 v[12:15], v2 offset:57376 +; GCN-MAXOCC-NEXT: ds_read_b128 v[16:19], v2 offset:57392 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(7) +; GCN-MAXOCC-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:256 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: s_nop 0 +; GCN-MAXOCC-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:260 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:264 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:268 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(2) +; GCN-MAXOCC-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:272 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:276 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword v10, off, s[12:15], 0 offset:280 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword v11, off, s[12:15], 0 offset:284 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(1) +; GCN-MAXOCC-NEXT: buffer_store_dword v12, off, s[12:15], 0 offset:288 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword v13, off, s[12:15], 0 offset:292 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword v14, off, s[12:15], 0 offset:296 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:300 ; 4-byte Folded Spill ; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-MAXOCC-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:304 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:308 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:312 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:316 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:320 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:324 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword v22, off, s[12:15], 0 offset:328 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword v23, off, s[12:15], 0 offset:332 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword v24, off, s[12:15], 0 offset:336 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword v25, off, s[12:15], 0 offset:340 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword v26, off, s[12:15], 0 offset:344 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword v27, off, s[12:15], 0 offset:348 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword v28, off, s[12:15], 0 offset:352 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword v29, off, s[12:15], 0 offset:356 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword v30, off, s[12:15], 0 offset:360 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword v31, off, s[12:15], 0 offset:364 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword v32, off, s[12:15], 0 offset:368 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: buffer_store_dword v35, off, s[12:15], 0 offset:380 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: v_add_u32_e32 v32, s1, v0 +; GCN-MAXOCC-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:128 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:132 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:136 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:140 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v4, v63 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v5, v62 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v6, v61 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v7, v60 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v8, v59 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v9, v58 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v10, v57 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v11, v56 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v12, v55 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v13, v54 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v14, v53 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v15, v52 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v16, v51 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v17, v50 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v18, v49 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v19, v48 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v20, v47 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v21, v46 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v22, v45 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v23, v44 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v24, v43 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v25, v42 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v26, v41 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v27, v40 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v28, v39 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v29, v38 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v30, v37 +; GCN-MAXOCC-NEXT: s_waitcnt vmcnt(0) +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v31, v36 +; GCN-MAXOCC-NEXT: buffer_store_dword v33, off, s[12:15], 0 offset:372 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: ds_write_b128 v32, v[28:31] offset:112 +; GCN-MAXOCC-NEXT: ds_write_b128 v32, v[24:27] offset:96 +; GCN-MAXOCC-NEXT: ds_write_b128 v32, v[20:23] offset:80 +; GCN-MAXOCC-NEXT: ds_write_b128 v32, v[16:19] offset:64 +; GCN-MAXOCC-NEXT: ds_write_b128 v32, v[12:15] offset:48 +; GCN-MAXOCC-NEXT: ds_write_b128 v32, v[8:11] offset:32 +; GCN-MAXOCC-NEXT: ds_write_b128 v32, v[4:7] offset:16 +; GCN-MAXOCC-NEXT: buffer_store_dword v34, off, s[12:15], 0 offset:376 ; 4-byte Folded Spill +; GCN-MAXOCC-NEXT: ds_write_b128 v32, v[0:3] +; GCN-MAXOCC-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v26, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v27, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v28, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v29, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v0, s1 +; GCN-MAXOCC-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v18, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v19, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v20, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v21, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v22, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v23, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v24, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v25, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v30, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v31, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v32, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v33, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v1, 1.0 +; GCN-MAXOCC-NEXT: s_waitcnt vmcnt(27) +; GCN-MAXOCC-NEXT: ds_write_b128 v0, v[26:29] offset:8288 +; GCN-MAXOCC-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v30, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v31, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v32, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v33, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v18, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v19, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v20, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v21, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v22, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v23, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v24, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v25, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v26, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v27, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v28, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v29, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: s_waitcnt vmcnt(27) +; GCN-MAXOCC-NEXT: ds_write_b128 v0, v[30:33] offset:8304 +; GCN-MAXOCC-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v18, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v19, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v20, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v21, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v22, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v23, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v24, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v25, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v26, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v27, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v28, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v29, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v30, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v31, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v32, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v33, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: s_waitcnt vmcnt(27) +; GCN-MAXOCC-NEXT: ds_write_b128 v0, v[18:21] offset:8256 +; GCN-MAXOCC-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v22, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v23, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v24, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v25, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v18, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v19, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v20, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v21, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v26, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v27, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v28, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v29, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v30, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v31, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v32, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v33, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: s_waitcnt vmcnt(27) +; GCN-MAXOCC-NEXT: ds_write_b128 v0, v[22:25] offset:8272 +; GCN-MAXOCC-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v18, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v19, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v20, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v21, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v22, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v23, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v24, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v25, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v26, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v27, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v28, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v29, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v30, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v31, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v32, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v33, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: s_waitcnt vmcnt(27) +; GCN-MAXOCC-NEXT: ds_write_b128 v0, v[10:13] offset:8224 +; GCN-MAXOCC-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v18, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v19, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v20, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v21, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v22, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v23, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v24, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v25, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v26, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v27, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v28, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v29, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v30, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v31, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v32, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v33, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: s_waitcnt vmcnt(27) +; GCN-MAXOCC-NEXT: ds_write_b128 v0, v[14:17] offset:8240 +; GCN-MAXOCC-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v18, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v19, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v20, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v21, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v22, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v23, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v24, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v25, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v26, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v27, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v28, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v29, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v30, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v31, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v32, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v33, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: s_waitcnt vmcnt(28) +; GCN-MAXOCC-NEXT: ds_write_b128 v0, v[2:5] offset:8192 +; GCN-MAXOCC-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: s_waitcnt vmcnt(0) +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-MAXOCC-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-MAXOCC-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v18, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v19, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v20, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v21, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v22, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v23, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[24:27] offset:16480 +; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[28:31] offset:16496 +; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[16:19] offset:16448 +; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[20:23] offset:16464 +; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[8:11] offset:16416 +; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[12:15] offset:16432 +; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[0:3] offset:16384 +; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[4:7] offset:16400 +; GCN-MAXOCC-NEXT: buffer_load_dword a0, off, s[12:15], 0 offset:256 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a1, off, s[12:15], 0 offset:260 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a2, off, s[12:15], 0 offset:264 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a3, off, s[12:15], 0 offset:268 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a4, off, s[12:15], 0 offset:272 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a5, off, s[12:15], 0 offset:276 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a6, off, s[12:15], 0 offset:280 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a7, off, s[12:15], 0 offset:284 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a8, off, s[12:15], 0 offset:288 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a9, off, s[12:15], 0 offset:292 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a10, off, s[12:15], 0 offset:296 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a11, off, s[12:15], 0 offset:300 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a12, off, s[12:15], 0 offset:304 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a13, off, s[12:15], 0 offset:308 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a14, off, s[12:15], 0 offset:312 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a15, off, s[12:15], 0 offset:316 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a16, off, s[12:15], 0 offset:320 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a17, off, s[12:15], 0 offset:324 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a18, off, s[12:15], 0 offset:328 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a19, off, s[12:15], 0 offset:332 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a20, off, s[12:15], 0 offset:336 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a21, off, s[12:15], 0 offset:340 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a22, off, s[12:15], 0 offset:344 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a23, off, s[12:15], 0 offset:348 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a24, off, s[12:15], 0 offset:352 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a25, off, s[12:15], 0 offset:356 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a26, off, s[12:15], 0 offset:360 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a27, off, s[12:15], 0 offset:364 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a28, off, s[12:15], 0 offset:368 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a29, off, s[12:15], 0 offset:372 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a30, off, s[12:15], 0 offset:376 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword a31, off, s[12:15], 0 offset:380 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v24, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v25, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v26, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v27, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v28, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v29, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v30, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v31, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v32, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: buffer_load_dword v33, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload +; GCN-MAXOCC-NEXT: s_waitcnt vmcnt(10) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 2 +; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[24:27] offset:32864 +; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[28:31] offset:32880 +; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[16:19] offset:32832 +; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[20:23] offset:32848 +; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[8:11] offset:32800 +; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[12:15] offset:32816 +; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[0:3] offset:32768 +; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[4:7] offset:32784 +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MAXOCC-NEXT: s_nop 7 ; GCN-MAXOCC-NEXT: s_nop 7 ; GCN-MAXOCC-NEXT: s_nop 2 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:16480 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:16496 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:16448 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:16464 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:16416 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:16432 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:16384 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:16400 -; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:49264 -; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:49248 -; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:49232 -; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:49216 -; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:49200 -; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:49184 -; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:49168 -; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:49152 -; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, 0x6000, v0 +; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[56:59] offset:24672 +; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[60:63] offset:24688 +; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[48:51] offset:24640 +; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[52:55] offset:24656 +; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[40:43] offset:24608 +; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[44:47] offset:24624 +; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[32:35] offset:24576 +; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[36:39] offset:24592 +; GCN-MAXOCC-NEXT: ds_write_b128 v0, v[6:9] offset:8208 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 1 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:24672 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:24688 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:24640 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:24656 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:24608 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:24624 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:24576 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:24592 -; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:57456 -; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:57440 -; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:57424 -; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:57408 -; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:57344 -; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:57360 -; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:57376 -; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:57392 -; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 2 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:32864 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:32880 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:32832 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:32848 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:32800 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:32816 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:32768 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:32784 ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: s_endpgm ; @@ -275,11 +967,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-ILP: ; %bb.0: ; %entry ; GCN-ILP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-ILP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-ILP-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-ILP-NEXT: v_mov_b32_e32 v1, 1.0 -; GCN-ILP-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-ILP-NEXT: v_lshlrev_b32_e32 v2, 7, v0 +; GCN-ILP-NEXT: v_mov_b32_e32 v0, 1.0 +; GCN-ILP-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) -; GCN-ILP-NEXT: v_add_u32_e32 v3, s0, v0 +; GCN-ILP-NEXT: v_add_u32_e32 v3, s0, v2 ; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:48 ; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:32 ; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:16 @@ -289,119 +981,355 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:96 ; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:112 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) -; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; GCN-ILP-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-ILP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-ILP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-ILP-NEXT: ds_read_b128 v[4:7], v3 offset:8192 +; GCN-ILP-NEXT: s_mov_b32 s14, -1 +; GCN-ILP-NEXT: s_mov_b32 s15, 0xe00000 +; GCN-ILP-NEXT: s_add_u32 s12, s12, s11 +; GCN-ILP-NEXT: s_addc_u32 s13, s13, 0 +; GCN-ILP-NEXT: ds_read_b128 v[32:35], v3 offset:8304 +; GCN-ILP-NEXT: ds_read_b128 v[28:31], v3 offset:8288 +; GCN-ILP-NEXT: ds_read_b128 v[24:27], v3 offset:8272 +; GCN-ILP-NEXT: ds_read_b128 v[20:23], v3 offset:8256 +; GCN-ILP-NEXT: ds_read_b128 v[16:19], v3 offset:8240 +; GCN-ILP-NEXT: ds_read_b128 v[12:15], v3 offset:8224 +; GCN-ILP-NEXT: ds_read_b128 v[8:11], v3 offset:8208 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(7) +; GCN-ILP-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 1 -; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:112 -; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:96 -; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:80 -; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:64 -; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:48 -; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:32 -; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:16 -; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] -; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:8192 -; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:8208 -; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:8224 -; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:8240 -; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:8256 -; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:8272 -; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:8288 -; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:8304 -; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) -; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; GCN-ILP-NEXT: v_mov_b32_e32 v0, s1 -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 1 -; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:8288 -; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:8304 -; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:8256 -; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:8272 -; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:8224 -; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:8240 -; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:8192 -; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:8208 -; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:24576 -; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:24592 -; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:24608 -; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:24624 -; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:24640 -; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:24656 -; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:24672 -; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:24688 -; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) -; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 7 ; GCN-ILP-NEXT: s_nop 2 -; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:16400 -; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:49168 -; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:16384 -; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:49152 -; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:16432 -; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:49200 -; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:16416 -; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:49184 -; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:16464 -; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:49232 -; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:16448 -; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:49216 -; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:16496 -; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:49264 -; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:16480 -; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:49248 +; GCN-ILP-NEXT: buffer_store_dword a0, off, s[12:15], 0 offset:132 ; 4-byte Folded Spill +; GCN-ILP-NEXT: v_accvgpr_read_b32 v63, a4 ; Reload Reuse +; GCN-ILP-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) -; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-ILP-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v10, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v11, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v12, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v13, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v14, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:52 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:56 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:60 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:64 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:68 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:72 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v22, off, s[12:15], 0 offset:76 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v23, off, s[12:15], 0 offset:80 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v24, off, s[12:15], 0 offset:84 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v25, off, s[12:15], 0 offset:88 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v26, off, s[12:15], 0 offset:92 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v27, off, s[12:15], 0 offset:96 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v28, off, s[12:15], 0 offset:100 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v29, off, s[12:15], 0 offset:104 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v30, off, s[12:15], 0 offset:108 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v31, off, s[12:15], 0 offset:112 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v32, off, s[12:15], 0 offset:116 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v33, off, s[12:15], 0 offset:120 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v34, off, s[12:15], 0 offset:124 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v35, off, s[12:15], 0 offset:128 ; 4-byte Folded Spill +; GCN-ILP-NEXT: ds_read_b128 v[32:35], v3 offset:24688 +; GCN-ILP-NEXT: ds_read_b128 v[28:31], v3 offset:24672 +; GCN-ILP-NEXT: ds_read_b128 v[24:27], v3 offset:24656 +; GCN-ILP-NEXT: ds_read_b128 v[20:23], v3 offset:24640 +; GCN-ILP-NEXT: ds_read_b128 v[16:19], v3 offset:24624 +; GCN-ILP-NEXT: ds_read_b128 v[12:15], v3 offset:24608 +; GCN-ILP-NEXT: ds_read_b128 v[8:11], v3 offset:24592 +; GCN-ILP-NEXT: ds_read_b128 v[4:7], v3 offset:24576 +; GCN-ILP-NEXT: ds_read_b128 a[60:63], v3 offset:49264 +; GCN-ILP-NEXT: ds_read_b128 a[56:59], v3 offset:49248 +; GCN-ILP-NEXT: ds_read_b128 a[52:55], v3 offset:49232 +; GCN-ILP-NEXT: ds_read_b128 a[48:51], v3 offset:49216 +; GCN-ILP-NEXT: ds_read_b128 a[44:47], v3 offset:49200 +; GCN-ILP-NEXT: ds_read_b128 a[40:43], v3 offset:49184 +; GCN-ILP-NEXT: ds_read_b128 a[36:39], v3 offset:49168 +; GCN-ILP-NEXT: ds_read_b128 a[32:35], v3 offset:49152 ; GCN-ILP-NEXT: v_add_u32_e32 v3, 0x6000, v3 -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 1 -; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:24592 -; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:57360 -; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:24576 -; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:57344 -; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:24624 +; GCN-ILP-NEXT: buffer_store_dword a1, off, s[12:15], 0 offset:136 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword a2, off, s[12:15], 0 offset:140 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword a3, off, s[12:15], 0 offset:144 ; 4-byte Folded Spill +; GCN-ILP-NEXT: v_accvgpr_read_b32 v62, a5 ; Reload Reuse +; GCN-ILP-NEXT: v_accvgpr_read_b32 v61, a6 ; Reload Reuse +; GCN-ILP-NEXT: v_accvgpr_read_b32 v60, a7 ; Reload Reuse +; GCN-ILP-NEXT: v_accvgpr_read_b32 v59, a8 ; Reload Reuse +; GCN-ILP-NEXT: v_accvgpr_read_b32 v58, a9 ; Reload Reuse +; GCN-ILP-NEXT: v_accvgpr_read_b32 v57, a10 ; Reload Reuse +; GCN-ILP-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse +; GCN-ILP-NEXT: v_accvgpr_read_b32 v55, a12 ; Reload Reuse +; GCN-ILP-NEXT: v_accvgpr_read_b32 v54, a13 ; Reload Reuse +; GCN-ILP-NEXT: v_accvgpr_read_b32 v53, a14 ; Reload Reuse +; GCN-ILP-NEXT: v_accvgpr_read_b32 v52, a15 ; Reload Reuse +; GCN-ILP-NEXT: v_accvgpr_read_b32 v51, a16 ; Reload Reuse +; GCN-ILP-NEXT: v_accvgpr_read_b32 v50, a17 ; Reload Reuse +; GCN-ILP-NEXT: v_accvgpr_read_b32 v49, a18 ; Reload Reuse +; GCN-ILP-NEXT: v_accvgpr_read_b32 v48, a19 ; Reload Reuse +; GCN-ILP-NEXT: v_accvgpr_read_b32 v47, a20 ; Reload Reuse +; GCN-ILP-NEXT: v_accvgpr_read_b32 v46, a21 ; Reload Reuse +; GCN-ILP-NEXT: v_accvgpr_read_b32 v45, a22 ; Reload Reuse +; GCN-ILP-NEXT: v_accvgpr_read_b32 v44, a23 ; Reload Reuse +; GCN-ILP-NEXT: v_accvgpr_read_b32 v43, a24 ; Reload Reuse +; GCN-ILP-NEXT: v_accvgpr_read_b32 v42, a25 ; Reload Reuse +; GCN-ILP-NEXT: v_accvgpr_read_b32 v41, a26 ; Reload Reuse +; GCN-ILP-NEXT: v_accvgpr_read_b32 v40, a27 ; Reload Reuse +; GCN-ILP-NEXT: v_accvgpr_read_b32 v39, a28 ; Reload Reuse +; GCN-ILP-NEXT: v_accvgpr_read_b32 v38, a29 ; Reload Reuse +; GCN-ILP-NEXT: v_accvgpr_read_b32 v37, a30 ; Reload Reuse +; GCN-ILP-NEXT: v_accvgpr_read_b32 v36, a31 ; Reload Reuse ; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:57392 -; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:24608 ; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:57376 -; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:24656 -; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:57424 -; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:24640 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:57360 +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:57344 ; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:57408 -; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:24688 -; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:57456 -; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:24672 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:57424 ; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:57440 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:57456 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) -; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) -; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-ILP-NEXT: v_add_u32_e32 v0, s1, v2 +; GCN-ILP-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:264 ; 4-byte Folded Spill +; GCN-ILP-NEXT: s_nop 0 +; GCN-ILP-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN-ILP-NEXT: v_mov_b32_e32 v0, s1 +; GCN-ILP-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:268 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:272 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:276 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:280 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:284 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v10, off, s[12:15], 0 offset:288 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v11, off, s[12:15], 0 offset:292 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v12, off, s[12:15], 0 offset:296 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v13, off, s[12:15], 0 offset:300 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v14, off, s[12:15], 0 offset:304 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:308 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:312 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:316 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:320 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:324 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:328 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:332 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v22, off, s[12:15], 0 offset:336 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v23, off, s[12:15], 0 offset:340 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v24, off, s[12:15], 0 offset:344 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v25, off, s[12:15], 0 offset:348 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v26, off, s[12:15], 0 offset:352 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v27, off, s[12:15], 0 offset:356 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v28, off, s[12:15], 0 offset:360 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v29, off, s[12:15], 0 offset:364 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v30, off, s[12:15], 0 offset:368 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v31, off, s[12:15], 0 offset:372 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v35, off, s[12:15], 0 offset:388 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:260 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:132 ; 4-byte Folded Reload +; GCN-ILP-NEXT: v_mov_b32_e32 v4, v63 +; GCN-ILP-NEXT: buffer_store_dword v32, off, s[12:15], 0 offset:376 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:136 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:140 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:144 ; 4-byte Folded Reload +; GCN-ILP-NEXT: v_mov_b32_e32 v5, v62 +; GCN-ILP-NEXT: buffer_load_dword v32, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-ILP-NEXT: v_mov_b32_e32 v6, v61 +; GCN-ILP-NEXT: v_mov_b32_e32 v7, v60 +; GCN-ILP-NEXT: v_mov_b32_e32 v8, v59 +; GCN-ILP-NEXT: v_mov_b32_e32 v9, v58 +; GCN-ILP-NEXT: v_mov_b32_e32 v10, v57 +; GCN-ILP-NEXT: v_mov_b32_e32 v11, v56 +; GCN-ILP-NEXT: v_mov_b32_e32 v12, v55 +; GCN-ILP-NEXT: v_mov_b32_e32 v13, v54 +; GCN-ILP-NEXT: v_mov_b32_e32 v14, v53 +; GCN-ILP-NEXT: v_mov_b32_e32 v15, v52 +; GCN-ILP-NEXT: v_mov_b32_e32 v16, v51 +; GCN-ILP-NEXT: v_mov_b32_e32 v17, v50 +; GCN-ILP-NEXT: v_mov_b32_e32 v18, v49 +; GCN-ILP-NEXT: v_mov_b32_e32 v19, v48 +; GCN-ILP-NEXT: v_mov_b32_e32 v20, v47 +; GCN-ILP-NEXT: v_mov_b32_e32 v21, v46 +; GCN-ILP-NEXT: v_mov_b32_e32 v22, v45 +; GCN-ILP-NEXT: v_mov_b32_e32 v23, v44 +; GCN-ILP-NEXT: v_mov_b32_e32 v24, v43 +; GCN-ILP-NEXT: v_mov_b32_e32 v25, v42 +; GCN-ILP-NEXT: v_mov_b32_e32 v26, v41 +; GCN-ILP-NEXT: v_mov_b32_e32 v27, v40 +; GCN-ILP-NEXT: v_mov_b32_e32 v28, v39 +; GCN-ILP-NEXT: v_mov_b32_e32 v29, v38 +; GCN-ILP-NEXT: v_mov_b32_e32 v30, v37 +; GCN-ILP-NEXT: s_waitcnt vmcnt(1) +; GCN-ILP-NEXT: v_mov_b32_e32 v31, v36 +; GCN-ILP-NEXT: buffer_store_dword v33, off, s[12:15], 0 offset:380 ; 4-byte Folded Spill +; GCN-ILP-NEXT: buffer_store_dword v34, off, s[12:15], 0 offset:384 ; 4-byte Folded Spill +; GCN-ILP-NEXT: s_waitcnt vmcnt(2) +; GCN-ILP-NEXT: ds_write_b128 v32, v[0:3] +; GCN-ILP-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:260 ; 4-byte Folded Reload +; GCN-ILP-NEXT: v_mov_b32_e32 v1, 2.0 +; GCN-ILP-NEXT: s_waitcnt vmcnt(1) +; GCN-ILP-NEXT: ds_write_b128 v0, v[4:7] offset:16 +; GCN-ILP-NEXT: ds_write_b128 v0, v[8:11] offset:32 +; GCN-ILP-NEXT: ds_write_b128 v0, v[12:15] offset:48 +; GCN-ILP-NEXT: ds_write_b128 v0, v[16:19] offset:64 +; GCN-ILP-NEXT: ds_write_b128 v0, v[20:23] offset:80 +; GCN-ILP-NEXT: ds_write_b128 v0, v[24:27] offset:96 +; GCN-ILP-NEXT: ds_write_b128 v0, v[28:31] offset:112 +; GCN-ILP-NEXT: v_mov_b32_e32 v0, 1.0 +; GCN-ILP-NEXT: s_nop 1 +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[32:63] +; GCN-ILP-NEXT: s_waitcnt vmcnt(0) +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 1 +; GCN-ILP-NEXT: ds_write_b128 v2, a[56:59] offset:24672 +; GCN-ILP-NEXT: ds_write_b128 v2, a[60:63] offset:24688 +; GCN-ILP-NEXT: ds_write_b128 v2, a[48:51] offset:24640 +; GCN-ILP-NEXT: ds_write_b128 v2, a[52:55] offset:24656 +; GCN-ILP-NEXT: ds_write_b128 v2, a[40:43] offset:24608 +; GCN-ILP-NEXT: ds_write_b128 v2, a[44:47] offset:24624 +; GCN-ILP-NEXT: ds_write_b128 v2, a[32:35] offset:24576 +; GCN-ILP-NEXT: ds_write_b128 v2, a[36:39] offset:24592 +; GCN-ILP-NEXT: buffer_load_dword a32, off, s[12:15], 0 offset:264 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a33, off, s[12:15], 0 offset:268 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a34, off, s[12:15], 0 offset:272 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a35, off, s[12:15], 0 offset:276 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a36, off, s[12:15], 0 offset:280 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a37, off, s[12:15], 0 offset:284 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a38, off, s[12:15], 0 offset:288 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a39, off, s[12:15], 0 offset:292 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a40, off, s[12:15], 0 offset:296 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a41, off, s[12:15], 0 offset:300 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a42, off, s[12:15], 0 offset:304 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a43, off, s[12:15], 0 offset:308 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a44, off, s[12:15], 0 offset:312 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a45, off, s[12:15], 0 offset:316 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a46, off, s[12:15], 0 offset:320 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a47, off, s[12:15], 0 offset:324 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a48, off, s[12:15], 0 offset:328 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a49, off, s[12:15], 0 offset:332 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a50, off, s[12:15], 0 offset:336 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a51, off, s[12:15], 0 offset:340 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a52, off, s[12:15], 0 offset:344 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a53, off, s[12:15], 0 offset:348 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a54, off, s[12:15], 0 offset:352 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a55, off, s[12:15], 0 offset:356 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a56, off, s[12:15], 0 offset:360 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a57, off, s[12:15], 0 offset:364 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a58, off, s[12:15], 0 offset:368 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a59, off, s[12:15], 0 offset:372 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a60, off, s[12:15], 0 offset:376 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a61, off, s[12:15], 0 offset:380 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a62, off, s[12:15], 0 offset:384 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a63, off, s[12:15], 0 offset:388 ; 4-byte Folded Reload +; GCN-ILP-NEXT: s_waitcnt vmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[32:63] +; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:32864 +; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:32880 +; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:32832 +; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:32848 +; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:32800 +; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:32816 +; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:32768 +; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:32784 +; GCN-ILP-NEXT: buffer_load_dword a0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a3, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a4, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a5, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a6, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a7, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a8, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a9, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a10, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a11, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a12, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a13, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a14, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a15, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a16, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a17, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a18, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a19, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a20, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a21, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a22, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a23, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a24, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a25, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a26, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a27, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a28, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a29, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a30, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload +; GCN-ILP-NEXT: buffer_load_dword a31, off, s[12:15], 0 offset:128 ; 4-byte Folded Reload +; GCN-ILP-NEXT: s_waitcnt vmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-ILP-NEXT: s_nop 7 ; GCN-ILP-NEXT: s_nop 7 ; GCN-ILP-NEXT: s_nop 2 -; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:32864 -; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:32880 -; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:32832 -; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:32848 -; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:32800 -; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:32816 -; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:32768 -; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:32784 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v35, a31 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v34, a30 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v33, a29 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v32, a28 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v31, a27 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v30, a26 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v29, a25 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v28, a24 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v27, a23 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v26, a22 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v25, a21 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v24, a20 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v23, a19 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v22, a18 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v21, a17 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v20, a16 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v19, a15 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v18, a14 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v17, a13 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v16, a12 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v15, a11 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v14, a10 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v13, a9 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v12, a8 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v11, a7 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v10, a6 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v9, a5 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v8, a4 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v7, a3 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v6, a2 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v5, a1 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v4, a0 +; GCN-ILP-NEXT: ds_write_b128 v2, v[28:31] offset:8288 +; GCN-ILP-NEXT: ds_write_b128 v2, v[32:35] offset:8304 +; GCN-ILP-NEXT: ds_write_b128 v2, v[20:23] offset:8256 +; GCN-ILP-NEXT: ds_write_b128 v2, v[24:27] offset:8272 +; GCN-ILP-NEXT: ds_write_b128 v2, v[12:15] offset:8224 +; GCN-ILP-NEXT: ds_write_b128 v2, v[16:19] offset:8240 +; GCN-ILP-NEXT: ds_write_b128 v2, v[4:7] offset:8192 +; GCN-ILP-NEXT: ds_write_b128 v2, v[8:11] offset:8208 +; GCN-ILP-NEXT: ds_write_b128 v2, a[56:59] offset:16480 +; GCN-ILP-NEXT: ds_write_b128 v2, a[60:63] offset:16496 +; GCN-ILP-NEXT: ds_write_b128 v2, a[48:51] offset:16448 +; GCN-ILP-NEXT: ds_write_b128 v2, a[52:55] offset:16464 +; GCN-ILP-NEXT: ds_write_b128 v2, a[40:43] offset:16416 +; GCN-ILP-NEXT: ds_write_b128 v2, a[44:47] offset:16432 +; GCN-ILP-NEXT: ds_write_b128 v2, a[32:35] offset:16384 +; GCN-ILP-NEXT: ds_write_b128 v2, a[36:39] offset:16400 ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-ILP-NEXT: s_endpgm entry: @@ -485,12 +1413,20 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:48 ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) ; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GCN-MINREG-NEXT: ds_read_b128 a[60:63], v3 offset:8304 +; GCN-MINREG-NEXT: ds_read_b128 a[56:59], v3 offset:8288 +; GCN-MINREG-NEXT: ds_read_b128 a[52:55], v3 offset:8272 +; GCN-MINREG-NEXT: ds_read_b128 a[48:51], v3 offset:8256 +; GCN-MINREG-NEXT: ds_read_b128 a[44:47], v3 offset:8240 +; GCN-MINREG-NEXT: ds_read_b128 a[40:43], v3 offset:8224 +; GCN-MINREG-NEXT: ds_read_b128 a[36:39], v3 offset:8208 +; GCN-MINREG-NEXT: ds_read_b128 a[32:35], v3 offset:8192 ; GCN-MINREG-NEXT: v_add_u32_e32 v2, s1, v2 ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v0, a[32:63] ; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 1 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:112 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:96 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:80 @@ -499,31 +1435,19 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:32 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:16 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] -; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:8304 -; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:8288 -; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:8272 -; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:8256 -; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:8240 -; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:8224 -; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:8208 -; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:8192 -; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GCN-MINREG-NEXT: v_mov_b32_e32 v2, s1 +; GCN-MINREG-NEXT: s_nop 1 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[56:59] offset:8288 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[60:63] offset:8304 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[48:51] offset:8256 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[52:55] offset:8272 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[40:43] offset:8224 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[44:47] offset:8240 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[32:35] offset:8192 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[36:39] offset:8208 ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 1 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:8288 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:8304 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:8256 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:8272 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:8224 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:8240 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:8192 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:8208 ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_barrier mask(0x00000000) ; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:24688 @@ -536,44 +1460,54 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:24624 ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) ; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GCN-MINREG-NEXT: ds_read_b128 a[60:63], v3 offset:49264 +; GCN-MINREG-NEXT: ds_read_b128 a[56:59], v3 offset:49248 +; GCN-MINREG-NEXT: ds_read_b128 a[52:55], v3 offset:49232 +; GCN-MINREG-NEXT: ds_read_b128 a[48:51], v3 offset:49216 +; GCN-MINREG-NEXT: ds_read_b128 a[44:47], v3 offset:49200 +; GCN-MINREG-NEXT: ds_read_b128 a[40:43], v3 offset:49184 +; GCN-MINREG-NEXT: ds_read_b128 a[36:39], v3 offset:49168 +; GCN-MINREG-NEXT: ds_read_b128 a[32:35], v3 offset:49152 ; GCN-MINREG-NEXT: v_add_u32_e32 v4, 0x6000, v3 ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 1 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:16496 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:16480 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:16464 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:16448 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:16432 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:16416 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:16400 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:16384 -; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:49264 -; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:49248 -; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:49232 -; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:49216 -; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:49200 -; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:49184 -; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:49168 -; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:49152 -; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) -; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v0, a[32:63] ; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 2 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:24688 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:24672 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:24656 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:24640 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:24624 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:24608 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:24592 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:24576 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v37, a31 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v36, a30 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v35, a29 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v34, a28 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v33, a27 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v32, a26 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v31, a25 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v30, a24 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v29, a23 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v28, a22 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v27, a21 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v26, a20 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v25, a19 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v24, a18 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v23, a17 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v22, a16 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v21, a15 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v20, a14 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v19, a13 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v18, a12 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v17, a11 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v16, a10 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v15, a9 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v14, a8 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v13, a7 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v12, a6 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v11, a5 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v10, a4 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v9, a3 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v8, a2 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v7, a1 +; GCN-MINREG-NEXT: v_accvgpr_read_b32 v6, a0 ; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:57456 ; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:57440 ; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:57424 @@ -582,12 +1516,28 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:57360 ; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:57376 ; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:57392 -; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: ds_write_b128 v2, a[60:63] offset:24688 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[56:59] offset:24672 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[52:55] offset:24656 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[48:51] offset:24640 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[44:47] offset:24624 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[40:43] offset:24608 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[36:39] offset:24592 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[32:35] offset:24576 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(8) ; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: ds_write_b128 v2, v[34:37] offset:16496 +; GCN-MINREG-NEXT: ds_write_b128 v2, v[30:33] offset:16480 +; GCN-MINREG-NEXT: ds_write_b128 v2, v[26:29] offset:16464 +; GCN-MINREG-NEXT: ds_write_b128 v2, v[22:25] offset:16448 +; GCN-MINREG-NEXT: ds_write_b128 v2, v[18:21] offset:16432 +; GCN-MINREG-NEXT: ds_write_b128 v2, v[14:17] offset:16416 +; GCN-MINREG-NEXT: ds_write_b128 v2, v[10:13] offset:16400 +; GCN-MINREG-NEXT: ds_write_b128 v2, v[6:9] offset:16384 ; GCN-MINREG-NEXT: s_nop 7 ; GCN-MINREG-NEXT: s_nop 2 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:32880 @@ -605,134 +1555,210 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MAXOCC: ; %bb.0: ; %entry ; GCN-MAXOCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-MAXOCC-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v3, 7, v0 -; GCN-MAXOCC-NEXT: v_mov_b32_e32 v1, 1.0 -; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v2, 7, v0 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v0, 1.0 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, s0, v3 -; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:112 -; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:96 -; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:80 -; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:64 -; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 -; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:16 -; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:32 -; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:48 +; GCN-MAXOCC-NEXT: v_add_u32_e32 v3, s0, v2 +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v3 offset:112 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v3 offset:96 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v3 offset:80 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v3 offset:64 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v3 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v3 offset:16 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v3 offset:32 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v3 offset:48 ; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; GCN-MAXOCC-NEXT: v_add_u32_e32 v3, s1, v3 +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-MAXOCC-NEXT: ds_read_b128 a[60:63], v3 offset:8304 +; GCN-MAXOCC-NEXT: ds_read_b128 a[56:59], v3 offset:8288 +; GCN-MAXOCC-NEXT: ds_read_b128 a[52:55], v3 offset:8272 +; GCN-MAXOCC-NEXT: ds_read_b128 a[48:51], v3 offset:8256 +; GCN-MAXOCC-NEXT: ds_read_b128 a[44:47], v3 offset:8240 +; GCN-MAXOCC-NEXT: ds_read_b128 a[40:43], v3 offset:8224 +; GCN-MAXOCC-NEXT: ds_read_b128 a[36:39], v3 offset:8208 +; GCN-MAXOCC-NEXT: ds_read_b128 a[32:35], v3 offset:8192 +; GCN-MAXOCC-NEXT: v_add_u32_e32 v2, s1, v2 ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[32:63] ; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[28:31] offset:112 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[24:27] offset:96 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[20:23] offset:80 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[16:19] offset:64 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[12:15] offset:48 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[8:11] offset:32 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[4:7] offset:16 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[0:3] +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, s1 ; GCN-MAXOCC-NEXT: s_nop 1 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:112 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:96 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:80 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:64 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:48 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:32 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:16 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] -; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:8304 -; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:8288 -; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:8272 -; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:8256 -; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:8240 -; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:8224 -; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:8208 -; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:8192 -; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; GCN-MAXOCC-NEXT: v_mov_b32_e32 v3, s1 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[56:59] offset:8288 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[60:63] offset:8304 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[48:51] offset:8256 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[52:55] offset:8272 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[40:43] offset:8224 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[44:47] offset:8240 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[32:35] offset:8192 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[36:39] offset:8208 ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 1 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:8288 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:8304 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:8256 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:8272 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:8224 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:8240 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:8192 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:8208 ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_barrier mask(0x00000000) -; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:24688 -; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:24672 -; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:24656 -; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:24640 -; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:24576 -; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:24592 -; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:24608 -; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:24624 +; GCN-MAXOCC-NEXT: ds_read_b128 v[4:7], v3 offset:24576 +; GCN-MAXOCC-NEXT: ds_read_b128 v[32:35], v3 offset:24688 +; GCN-MAXOCC-NEXT: ds_read_b128 v[28:31], v3 offset:24672 +; GCN-MAXOCC-NEXT: ds_read_b128 v[24:27], v3 offset:24656 +; GCN-MAXOCC-NEXT: ds_read_b128 v[20:23], v3 offset:24640 +; GCN-MAXOCC-NEXT: ds_read_b128 v[8:11], v3 offset:24592 +; GCN-MAXOCC-NEXT: ds_read_b128 v[12:15], v3 offset:24608 +; GCN-MAXOCC-NEXT: ds_read_b128 v[16:19], v3 offset:24624 ; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a0, v4 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a1, v5 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a2, v6 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a3, v7 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a4, v8 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a5, v9 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a6, v10 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a7, v11 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a8, v12 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a9, v13 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a10, v14 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a11, v15 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a12, v16 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a13, v17 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a14, v18 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a15, v19 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a16, v20 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a17, v21 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a18, v22 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a19, v23 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a20, v24 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a21, v25 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a22, v26 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a23, v27 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a24, v28 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a25, v29 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a26, v30 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a27, v31 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a28, v32 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a29, v33 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a30, v34 +; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a31, v35 +; GCN-MAXOCC-NEXT: v_add_u32_e32 v4, 0x6000, v3 ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 0 +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-MAXOCC-NEXT: ds_read_b128 a[60:63], v3 offset:49264 +; GCN-MAXOCC-NEXT: ds_read_b128 a[56:59], v3 offset:49248 +; GCN-MAXOCC-NEXT: ds_read_b128 a[52:55], v3 offset:49232 +; GCN-MAXOCC-NEXT: ds_read_b128 a[48:51], v3 offset:49216 +; GCN-MAXOCC-NEXT: ds_read_b128 a[44:47], v3 offset:49200 +; GCN-MAXOCC-NEXT: ds_read_b128 a[40:43], v3 offset:49184 +; GCN-MAXOCC-NEXT: ds_read_b128 a[36:39], v3 offset:49168 +; GCN-MAXOCC-NEXT: ds_read_b128 a[32:35], v3 offset:49152 ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 7 ; GCN-MAXOCC-NEXT: s_nop 2 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:16496 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:16480 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:16464 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:16448 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:16432 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:16416 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:16400 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:16384 -; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:49264 -; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:49248 -; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:49232 -; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:49216 -; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:49200 -; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:49184 -; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:49168 -; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:49152 -; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, 0x6000, v0 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v37, a31 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v35, a29 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v34, a28 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v33, a27 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v32, a26 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v31, a25 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v30, a24 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v29, a23 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v28, a22 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v27, a21 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v26, a20 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v25, a19 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v24, a18 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v23, a17 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v22, a16 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v21, a15 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v20, a14 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v19, a13 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v18, a12 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v17, a11 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v16, a10 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v15, a9 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v14, a8 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v13, a7 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v12, a6 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v11, a5 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v10, a4 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v9, a3 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v8, a2 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v7, a1 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v6, a0 +; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v36, a30 +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v4 offset:57456 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v4 offset:57440 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v4 offset:57424 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v4 offset:57408 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v4 offset:57344 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v4 offset:57360 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v4 offset:57376 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v4 offset:57392 +; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] +; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[8:9], v[10:11], v[10:11] op_sel:[0,1] +; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[10:11], v[12:13], v[12:13] op_sel:[0,1] +; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[12:13], v[14:15], v[14:15] op_sel:[0,1] +; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[14:15], v[16:17], v[16:17] op_sel:[0,1] +; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[16:17], v[18:19], v[18:19] op_sel:[0,1] +; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[18:19], v[20:21], v[20:21] op_sel:[0,1] +; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[20:21], v[22:23], v[22:23] op_sel:[0,1] +; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[22:23], v[24:25], v[24:25] op_sel:[0,1] +; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[24:25], v[26:27], v[26:27] op_sel:[0,1] +; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[26:27], v[28:29], v[28:29] op_sel:[0,1] +; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[28:29], v[30:31], v[30:31] op_sel:[0,1] +; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[30:31], v[32:33], v[32:33] op_sel:[0,1] +; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[32:33], v[34:35], v[34:35] op_sel:[0,1] +; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[34:35], v[36:37], v[36:37] op_sel:[0,1] +; GCN-MAXOCC-NEXT: ds_write_b128 v2, v[32:35] offset:16496 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, v[28:31] offset:16480 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, v[24:27] offset:16464 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, v[20:23] offset:16448 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, v[16:19] offset:16432 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, v[12:15] offset:16416 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, v[8:11] offset:16400 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, v[4:7] offset:16384 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(14) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[32:63] ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MAXOCC-NEXT: s_nop 7 ; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 1 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:24688 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:24672 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:24656 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:24640 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:24624 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:24608 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:24592 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:24576 -; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:57456 -; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:57440 -; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:57424 -; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:57408 -; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:57344 -; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:57360 -; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:57376 -; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:57392 -; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-MAXOCC-NEXT: s_nop 2 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[60:63] offset:24688 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[56:59] offset:24672 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[52:55] offset:24656 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[48:51] offset:24640 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[44:47] offset:24624 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[40:43] offset:24608 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[36:39] offset:24592 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[32:35] offset:24576 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(14) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MAXOCC-NEXT: s_nop 7 ; GCN-MAXOCC-NEXT: s_nop 7 ; GCN-MAXOCC-NEXT: s_nop 2 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:32880 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:32864 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:32848 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:32832 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:32816 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:32800 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:32784 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:32768 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[28:31] offset:32880 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[24:27] offset:32864 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[20:23] offset:32848 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[16:19] offset:32832 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[12:15] offset:32816 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[8:11] offset:32800 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[4:7] offset:32784 +; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[0:3] offset:32768 ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: s_endpgm ; @@ -745,47 +1771,40 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-ILP-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) ; GCN-ILP-NEXT: v_add_u32_e32 v3, s0, v2 -; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:48 -; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:32 -; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:16 -; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 -; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:64 -; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:80 -; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:96 -; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:112 +; GCN-ILP-NEXT: ds_read_b128 a[44:47], v3 offset:48 +; GCN-ILP-NEXT: ds_read_b128 a[40:43], v3 offset:32 +; GCN-ILP-NEXT: ds_read_b128 a[36:39], v3 offset:16 +; GCN-ILP-NEXT: ds_read_b128 a[32:35], v3 +; GCN-ILP-NEXT: ds_read_b128 a[48:51], v3 offset:64 +; GCN-ILP-NEXT: ds_read_b128 a[52:55], v3 offset:80 +; GCN-ILP-NEXT: ds_read_b128 a[56:59], v3 offset:96 +; GCN-ILP-NEXT: ds_read_b128 a[60:63], v3 offset:112 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) -; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; GCN-ILP-NEXT: v_add_u32_e32 v2, s1, v2 -; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) -; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 1 -; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[32:63] ; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:8192 -; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:16 ; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:8208 -; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:32 ; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:8224 -; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:48 ; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:8240 -; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:64 ; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:8256 -; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:80 ; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:8272 -; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:96 ; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:8288 -; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:112 ; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:8304 -; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) -; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-ILP-NEXT: v_add_u32_e32 v4, s1, v2 ; GCN-ILP-NEXT: v_mov_b32_e32 v2, s1 ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) -; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 1 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-ILP-NEXT: s_nop 6 +; GCN-ILP-NEXT: ds_write_b128 v4, a[32:35] +; GCN-ILP-NEXT: ds_write_b128 v4, a[36:39] offset:16 +; GCN-ILP-NEXT: ds_write_b128 v4, a[40:43] offset:32 +; GCN-ILP-NEXT: ds_write_b128 v4, a[44:47] offset:48 +; GCN-ILP-NEXT: ds_write_b128 v4, a[48:51] offset:64 +; GCN-ILP-NEXT: ds_write_b128 v4, a[52:55] offset:80 +; GCN-ILP-NEXT: ds_write_b128 v4, a[56:59] offset:96 +; GCN-ILP-NEXT: ds_write_b128 v4, a[60:63] offset:112 +; GCN-ILP-NEXT: s_nop 3 ; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:8288 ; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:8304 ; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:8256 @@ -795,6 +1814,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:8192 ; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:8208 ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_barrier mask(0x00000000) ; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:24624 ; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:24608 @@ -806,53 +1828,82 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:24688 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) ; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-ILP-NEXT: ds_read_b128 a[60:63], v3 offset:49264 +; GCN-ILP-NEXT: ds_read_b128 a[56:59], v3 offset:49248 +; GCN-ILP-NEXT: ds_read_b128 a[52:55], v3 offset:49232 +; GCN-ILP-NEXT: ds_read_b128 a[48:51], v3 offset:49216 +; GCN-ILP-NEXT: ds_read_b128 a[44:47], v3 offset:49200 +; GCN-ILP-NEXT: ds_read_b128 a[40:43], v3 offset:49184 +; GCN-ILP-NEXT: ds_read_b128 a[36:39], v3 offset:49168 +; GCN-ILP-NEXT: ds_read_b128 a[32:35], v3 offset:49152 +; GCN-ILP-NEXT: v_add_u32_e32 v3, 0x6000, v3 ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 2 -; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:16496 -; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:16480 -; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:16464 -; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:16448 -; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:16432 -; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:16416 -; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:16400 -; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:16384 -; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:49152 -; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:49168 -; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:49184 -; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:49200 -; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:49216 -; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:49232 -; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:49248 -; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:49264 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) -; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; GCN-ILP-NEXT: v_add_u32_e32 v3, 0x6000, v3 -; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[32:63] ; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 1 -; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:24576 -; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:57344 -; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:24592 -; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:57360 -; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:24608 -; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:57376 -; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:24624 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v35, a31 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v34, a30 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v33, a29 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v32, a28 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v31, a27 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v30, a26 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v29, a25 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v28, a24 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v27, a23 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v26, a22 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v25, a21 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v24, a20 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v23, a19 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v22, a18 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v21, a17 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v20, a16 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v19, a15 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v18, a14 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v17, a13 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v16, a12 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v15, a11 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v14, a10 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v13, a9 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v12, a8 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v11, a7 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v10, a6 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v9, a5 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v8, a4 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v7, a3 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v6, a2 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v5, a1 +; GCN-ILP-NEXT: v_accvgpr_read_b32 v4, a0 ; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:57392 -; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:24640 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:57376 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:57360 +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:57344 ; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:57408 -; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:24656 ; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:57424 -; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:24672 ; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:57440 -; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:24688 ; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:57456 -; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: ds_write_b128 v2, a[60:63] offset:24688 +; GCN-ILP-NEXT: ds_write_b128 v2, a[56:59] offset:24672 +; GCN-ILP-NEXT: ds_write_b128 v2, a[52:55] offset:24656 +; GCN-ILP-NEXT: ds_write_b128 v2, a[48:51] offset:24640 +; GCN-ILP-NEXT: ds_write_b128 v2, a[44:47] offset:24624 +; GCN-ILP-NEXT: ds_write_b128 v2, a[40:43] offset:24608 +; GCN-ILP-NEXT: ds_write_b128 v2, a[36:39] offset:24592 +; GCN-ILP-NEXT: ds_write_b128 v2, a[32:35] offset:24576 +; GCN-ILP-NEXT: ds_write_b128 v2, v[32:35] offset:16496 +; GCN-ILP-NEXT: ds_write_b128 v2, v[28:31] offset:16480 +; GCN-ILP-NEXT: ds_write_b128 v2, v[24:27] offset:16464 +; GCN-ILP-NEXT: ds_write_b128 v2, v[20:23] offset:16448 +; GCN-ILP-NEXT: ds_write_b128 v2, v[16:19] offset:16432 +; GCN-ILP-NEXT: ds_write_b128 v2, v[12:15] offset:16416 +; GCN-ILP-NEXT: ds_write_b128 v2, v[8:11] offset:16400 +; GCN-ILP-NEXT: ds_write_b128 v2, v[4:7] offset:16384 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(14) ; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-ILP-NEXT: s_nop 7 @@ -867,8 +1918,6 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:32784 ; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:32768 ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) -; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-ILP-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index 73586b1243376..266df5d56b5c2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -387,88 +387,87 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 7, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; GCN-NEXT: ; kill: killed $sgpr0_sgpr1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32 +; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:16 +; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:112 +; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96 ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 -; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 -; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 -; GCN-NEXT: v_mul_lo_u32 v14, v14, v14 -; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] -; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_lo_u32 v7, v7, v7 +; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 +; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:80 +; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 +; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 +; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 +; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:48 +; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:32 +; GCN-NEXT: s_waitcnt vmcnt(4) +; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 +; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 +; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 +; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 +; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:112 ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 +; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:96 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112 +; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 -; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 -; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 -; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:112 -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:96 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 -; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 -; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 -; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96 -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:80 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 -; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 -; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 -; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:80 -; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: s_waitcnt vmcnt(5) +; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 +; GCN-NEXT: v_mul_lo_u32 v14, v14, v14 +; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 +; GCN-NEXT: s_waitcnt vmcnt(4) +; GCN-NEXT: v_mul_lo_u32 v19, v19, v19 +; GCN-NEXT: v_mul_lo_u32 v18, v18, v18 +; GCN-NEXT: v_mul_lo_u32 v17, v17, v17 +; GCN-NEXT: v_mul_lo_u32 v16, v16, v16 +; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_mul_lo_u32 v23, v23, v23 +; GCN-NEXT: v_mul_lo_u32 v22, v22, v22 +; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:80 +; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64 +; GCN-NEXT: v_mul_lo_u32 v21, v21, v21 +; GCN-NEXT: v_mul_lo_u32 v20, v20, v20 +; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:48 +; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_lo_u32 v7, v7, v7 -; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 -; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 -; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 -; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 -; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 -; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 -; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:64 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mul_lo_u32 v3, v31, v31 +; GCN-NEXT: v_mul_lo_u32 v2, v30, v30 +; GCN-NEXT: v_mul_lo_u32 v1, v29, v29 +; GCN-NEXT: v_mul_lo_u32 v0, v28, v28 +; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] offset:16 +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: v_mul_lo_u32 v1, v25, v25 +; GCN-NEXT: v_mul_lo_u32 v0, v24, v24 +; GCN-NEXT: v_mul_lo_u32 v3, v27, v27 +; GCN-NEXT: v_mul_lo_u32 v2, v26, v26 +; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] offset:32 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 -; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 -; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 -; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:64 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: s_endpgm ; @@ -476,88 +475,87 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF: ; %bb.0: ; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v16, 7, v0 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; EXACTCUTOFF-NEXT: ; kill: killed $sgpr0_sgpr1 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:16 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:112 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v14, v14, v14 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v7, v7, v7 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:80 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:48 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:32 +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(4) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:112 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:96 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:112 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:96 -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:80 -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:80 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(5) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v14, v14, v14 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(4) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v19, v19, v19 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v18, v18, v18 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v17, v17, v17 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v16, v16, v16 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(3) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v23, v23, v23 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v22, v22, v22 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:80 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v21, v21, v21 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v20, v20, v20 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:48 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v7, v7, v7 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:64 +; EXACTCUTOFF-NEXT: s_nop 0 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v31, v31 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v30, v30 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v29, v29 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v28, v28 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] offset:16 +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(7) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v25, v25 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v24, v24 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v27, v27 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v26, v26 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] offset:32 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:64 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #2 @@ -887,12 +885,44 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:8304 +; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:8288 +; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:8272 +; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:8256 +; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:8240 +; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:8224 +; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:8208 +; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:8192 +; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v1 ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] +; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:24688 +; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24672 +; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:24656 +; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:24640 +; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:24624 +; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:24608 +; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592 +; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576 +; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:49264 +; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:49248 +; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:49232 +; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:49216 +; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:49200 +; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:49184 +; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:49168 +; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:49152 +; GCN-NEXT: ds_read_b128 a[156:159], v4 offset:57456 +; GCN-NEXT: ds_read_b128 a[152:155], v4 offset:57440 +; GCN-NEXT: ds_read_b128 a[148:151], v4 offset:57424 +; GCN-NEXT: ds_read_b128 a[144:147], v4 offset:57408 +; GCN-NEXT: ds_read_b128 a[128:131], v4 offset:57344 +; GCN-NEXT: ds_read_b128 a[132:135], v4 offset:57360 +; GCN-NEXT: ds_read_b128 a[136:139], v4 offset:57376 +; GCN-NEXT: ds_read_b128 a[140:143], v4 offset:57392 ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96 ; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80 @@ -901,104 +931,64 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[0:3] -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:8304 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:8272 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:8208 -; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:8192 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] ; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:8288 +; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:8304 +; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:8256 +; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:8272 +; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:8224 +; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:8240 +; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:8192 +; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:8208 +; GCN-NEXT: s_waitcnt lgkmcnt(14) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95] ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:8288 -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:8304 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:8256 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:8272 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:8224 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:8240 -; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:8192 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:8208 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:24688 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:24608 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:24576 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-NEXT: s_nop 2 +; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480 +; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496 +; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448 +; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464 +; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416 +; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432 +; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384 +; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127] ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 2 -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:16480 -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:16496 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:16448 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:16464 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:16416 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:16432 -; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:16384 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16400 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:49264 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:49248 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:49232 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:49216 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:49200 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:49184 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:49168 -; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:49152 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: v_add_u32_e32 v1, 0x6000, v1 +; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:24672 +; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:24688 +; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:24640 +; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:24656 +; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:24608 +; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:24624 +; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:24576 +; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:24592 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159] ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:24672 -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:24688 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:24640 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:24656 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:24608 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:24624 -; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:24576 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:24592 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:57456 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:57440 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:57424 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:57408 -; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:57344 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:57360 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:57376 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:57392 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-NEXT: s_nop 2 +; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:32864 +; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:32880 +; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:32832 +; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:32848 +; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32800 +; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:32816 +; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:32768 +; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:32784 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864 -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816 -; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:32768 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:32784 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: s_endpgm ; @@ -1021,12 +1011,44 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:48 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v1 offset:8304 +; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v1 offset:8288 +; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v1 offset:8272 +; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v1 offset:8256 +; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v1 offset:8240 +; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v1 offset:8224 +; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v1 offset:8208 +; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v1 offset:8192 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v4, 0x6000, v1 ; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 1 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] +; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v1 offset:24688 +; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v1 offset:24672 +; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v1 offset:24656 +; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v1 offset:24640 +; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v1 offset:24624 +; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v1 offset:24608 +; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v1 offset:24592 +; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v1 offset:24576 +; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:49264 +; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:49248 +; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:49232 +; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:49216 +; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:49200 +; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:49184 +; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:49168 +; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1 offset:49152 +; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v4 offset:57456 +; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v4 offset:57440 +; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v4 offset:57424 +; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v4 offset:57408 +; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v4 offset:57344 +; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v4 offset:57360 +; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v4 offset:57376 +; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v4 offset:57392 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:96 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:80 @@ -1035,104 +1057,64 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:8304 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:8288 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:8272 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:8256 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:8240 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:8224 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:8208 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:8192 -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:8288 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:8304 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:8256 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[52:55] offset:8272 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[40:43] offset:8224 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[44:47] offset:8240 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[32:35] offset:8192 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[36:39] offset:8208 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 1 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:8288 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:8304 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:8256 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:8272 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:8224 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:8240 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:8192 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:8208 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:24688 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:24672 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:24656 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:24640 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:24624 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:24608 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:24592 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:24576 -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; EXACTCUTOFF-NEXT: s_nop 2 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[88:91] offset:16480 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[92:95] offset:16496 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[80:83] offset:16448 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[84:87] offset:16464 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[72:75] offset:16416 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[76:79] offset:16432 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[64:67] offset:16384 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[68:71] offset:16400 +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 2 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:16480 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:16496 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:16448 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:16464 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:16416 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:16432 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:16384 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16400 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:49264 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:49248 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:49232 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:49216 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:49200 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:49184 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:49168 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:49152 -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, 0x6000, v1 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[120:123] offset:24672 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[124:127] offset:24688 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[112:115] offset:24640 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[116:119] offset:24656 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[104:107] offset:24608 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[108:111] offset:24624 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[96:99] offset:24576 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[100:103] offset:24592 +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 1 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:24672 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:24688 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:24640 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:24656 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:24608 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:24624 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:24576 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:24592 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:57456 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:57440 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:57424 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:57408 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:57344 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:57360 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:57376 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:57392 -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; EXACTCUTOFF-NEXT: s_nop 2 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:32864 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:32880 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[144:147] offset:32832 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:32848 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[136:139] offset:32800 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[140:143] offset:32816 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[128:131] offset:32768 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[132:135] offset:32784 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 2 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:32864 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:32880 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:32832 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:32848 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32800 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:32816 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:32768 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:32784 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll b/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll index 190384255bf23..efece9d02950d 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll @@ -1119,21 +1119,44 @@ define amdgpu_kernel void @kern_align32_global_ptr(ptr addrspace(1) align 1024 % } define amdgpu_kernel void @kern_noalias_global_ptr(ptr addrspace(1) noalias %ptr) #0 { -; GCN-LABEL: @kern_noalias_global_ptr( -; GCN-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; GCN-NEXT: store volatile ptr addrspace(1) [[PTR:%.*]], ptr addrspace(1) poison, align 8 -; GCN-NEXT: ret void +; HSA-LABEL: @kern_noalias_global_ptr( +; HSA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; HSA-NEXT: store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META5:![0-9]+]] +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_noalias_global_ptr( +; MESA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; MESA-NEXT: store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META5:![0-9]+]] +; MESA-NEXT: ret void ; store volatile ptr addrspace(1) %ptr, ptr addrspace(1) poison ret void } define amdgpu_kernel void @kern_noalias_global_ptr_x2(ptr addrspace(1) noalias %ptr0, ptr addrspace(1) noalias %ptr1) #0 { -; GCN-LABEL: @kern_noalias_global_ptr_x2( -; GCN-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; GCN-NEXT: store volatile ptr addrspace(1) [[PTR0:%.*]], ptr addrspace(1) poison, align 8 -; GCN-NEXT: store volatile ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(1) poison, align 8 -; GCN-NEXT: ret void +; HSA-LABEL: @kern_noalias_global_ptr_x2( +; HSA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[PTR0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[PTR0_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR0_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; HSA-NEXT: [[PTR1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[PTR1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] +; HSA-NEXT: store volatile ptr addrspace(1) [[PTR0_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META8:![0-9]+]] +; HSA-NEXT: store volatile ptr addrspace(1) [[PTR1_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META8]] +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_noalias_global_ptr_x2( +; MESA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[PTR0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[PTR0_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR0_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; MESA-NEXT: [[PTR1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[PTR1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR1_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; MESA-NEXT: store volatile ptr addrspace(1) [[PTR0_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META8:![0-9]+]] +; MESA-NEXT: store volatile ptr addrspace(1) [[PTR1_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META8]] +; MESA-NEXT: ret void ; store volatile ptr addrspace(1) %ptr0, ptr addrspace(1) poison store volatile ptr addrspace(1) %ptr1, ptr addrspace(1) poison @@ -1855,10 +1878,24 @@ attributes #2 = { nounwind "target-cpu"="tahiti" } ; HSA: [[META2]] = !{i64 42} ; HSA: [[META3]] = !{i64 128} ; HSA: [[META4]] = !{i64 1024} +; HSA: [[META5]] = !{[[META6:![0-9]+]]} +; HSA: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]], !"kern_noalias_global_ptr: %ptr"} +; HSA: [[META7]] = distinct !{[[META7]], !"kern_noalias_global_ptr"} +; HSA: [[META8]] = !{[[META9:![0-9]+]], [[META11:![0-9]+]]} +; HSA: [[META9]] = distinct !{[[META9]], [[META10:![0-9]+]], !"kern_noalias_global_ptr_x2: %ptr0"} +; HSA: [[META10]] = distinct !{[[META10]], !"kern_noalias_global_ptr_x2"} +; HSA: [[META11]] = distinct !{[[META11]], [[META10]], !"kern_noalias_global_ptr_x2: %ptr1"} ;. ; MESA: [[META0]] = !{} ; MESA: [[RNG1]] = !{i32 0, i32 8} ; MESA: [[META2]] = !{i64 42} ; MESA: [[META3]] = !{i64 128} ; MESA: [[META4]] = !{i64 1024} +; MESA: [[META5]] = !{[[META6:![0-9]+]]} +; MESA: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]], !"kern_noalias_global_ptr: %ptr"} +; MESA: [[META7]] = distinct !{[[META7]], !"kern_noalias_global_ptr"} +; MESA: [[META8]] = !{[[META9:![0-9]+]], [[META11:![0-9]+]]} +; MESA: [[META9]] = distinct !{[[META9]], [[META10:![0-9]+]], !"kern_noalias_global_ptr_x2: %ptr0"} +; MESA: [[META10]] = distinct !{[[META10]], !"kern_noalias_global_ptr_x2"} +; MESA: [[META11]] = distinct !{[[META11]], [[META10]], !"kern_noalias_global_ptr_x2: %ptr1"} ;. diff --git a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll index 0ac3d652050d3..ba59b94b6d141 100644 --- a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll +++ b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll @@ -7,26 +7,44 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr a ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0 +; SDAG-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0 +; SDAG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 +; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:8 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0 +; SDAG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:8 +; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:12 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0 -; SDAG-NEXT: v_mul_f32_e32 v1, v1, v1 -; SDAG-NEXT: v_mul_f32_e32 v2, v2, v2 -; SDAG-NEXT: v_mul_f32_e32 v3, v3, v3 -; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SDAG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:12 ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: buffers_dont_alias: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GISEL-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0 +; GISEL-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; GISEL-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0 +; GISEL-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 +; GISEL-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:8 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0 +; GISEL-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:8 +; GISEL-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:12 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0 -; GISEL-NEXT: v_mul_f32_e32 v1, v1, v1 -; GISEL-NEXT: v_mul_f32_e32 v2, v2, v2 -; GISEL-NEXT: v_mul_f32_e32 v3, v3, v3 -; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GISEL-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:12 ; GISEL-NEXT: s_endpgm %l0 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %a, i32 0, i32 0, i32 0) %s0 = fmul float %l0, %l0 @@ -56,15 +74,26 @@ define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: s_and_b32 s5, s1, 0xffff ; SDAG-NEXT: s_mov_b32 s4, s0 -; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; SDAG-NEXT: s_and_b32 s5, s3, 0xffff -; SDAG-NEXT: s_mov_b32 s4, s2 +; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SDAG-NEXT: s_and_b32 s1, s3, 0xffff +; SDAG-NEXT: s_mov_b32 s0, s2 +; SDAG-NEXT: s_mov_b32 s2, s6 +; SDAG-NEXT: s_mov_b32 s3, s7 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0 +; SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:4 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0 -; SDAG-NEXT: v_mul_f32_e32 v1, v1, v1 -; SDAG-NEXT: v_mul_f32_e32 v2, v2, v2 -; SDAG-NEXT: v_mul_f32_e32 v3, v3, v3 -; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0 +; SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 +; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:12 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0 +; SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12 ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: buffers_from_flat_dont_alias: @@ -72,18 +101,29 @@ define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-NEXT: s_mov_b32 s7, 0 ; GISEL-NEXT: s_mov_b32 s6, 16 +; GISEL-NEXT: s_mov_b32 s10, s6 +; GISEL-NEXT: s_mov_b32 s11, s7 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_and_b32 s5, s1, 0xffff ; GISEL-NEXT: s_mov_b32 s4, s0 -; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; GISEL-NEXT: s_and_b32 s5, s3, 0xffff -; GISEL-NEXT: s_mov_b32 s4, s2 +; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GISEL-NEXT: s_and_b32 s9, s3, 0xffff +; GISEL-NEXT: s_mov_b32 s8, s2 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0 +; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:4 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0 +; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 +; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0 +; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:8 +; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:12 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0 -; GISEL-NEXT: v_mul_f32_e32 v1, v1, v1 -; GISEL-NEXT: v_mul_f32_e32 v2, v2, v2 -; GISEL-NEXT: v_mul_f32_e32 v3, v3, v3 -; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:12 ; GISEL-NEXT: s_endpgm %a = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %a.flat, i16 0, i32 16, i32 0) %b = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %b.flat, i16 0, i32 16, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index ec065b4daa376..73438a7462531 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -647,13 +647,15 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 %b) nounwind { ; GFX6-LABEL: s_sub_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_sub_u32 s0, s0, s2 -; GFX6-NEXT: s_subb_u32 s1, s1, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_sub_u32 s0, s2, s8 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_subb_u32 s1, s3, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -661,41 +663,41 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 ; ; GFX8-LABEL: s_sub_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_u32 s0, s0, s2 -; GFX8-NEXT: s_subb_u32 s1, s1, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_sub_u32 s0, s2, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_subb_u32 s1, s3, s5 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_sub_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s0, s0, s2 -; GFX9-NEXT: s_subb_u32 s1, s1, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: s_sub_u32 s2, s2, s6 +; GFX9-NEXT: s_subb_u32 s3, s3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: s_sub_i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[0:1], s[2:3] +; GFX12-NEXT: s_sub_nc_u64 s[2:3], s[2:3], s[4:5] ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm %result = sub i64 %a, %b store i64 %result, ptr addrspace(1) %out, align 8 @@ -740,12 +742,12 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: v_sub_i64: @@ -832,14 +834,14 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v8, s0 +; GFX8-NEXT: v_mov_b32_e32 v9, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_sub_v2i64: