diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 5633d4c4254c..26cb6439c9e2 100644 --- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -37,13 +37,34 @@ // multiple scalar registers, similar to a GPU vectorized load. In theory ARM // could use this pass (with some modifications), but currently it implements // its own pass to do something similar to what we do here. +// +// Overview of the algorithm and terminology in this pass: +// +// - Break up each basic block into pseudo-BBs, composed of instructions which +// are guaranteed to transfer control to their successors. +// - Within a single pseudo-BB, find all loads, and group them into +// "equivalence classes" according to getUnderlyingObject() and loaded +// element size. Do the same for stores. +// - For each equivalence class, greedily build "chains". Each chain has a +// leader instruction, and every other member of the chain has a known +// constant offset from the first instr in the chain. +// - Break up chains so that they contain only contiguous accesses of legal +// size with no intervening may-alias instrs. +// - Convert each chain to vector instructions. +// +// The O(n^2) behavior of this pass comes from initially building the chains. +// In the worst case we have to compare each new instruction to all of those +// that came before. To limit this, we only calculate the offset to the leaders +// of the N most recently-used chains. #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Sequence.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -57,6 +78,7 @@ #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" @@ -67,22 +89,33 @@ #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/ModRef.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Vectorize.h" #include #include +#include #include +#include +#include +#include +#include #include +#include #include +#include using namespace llvm; @@ -91,21 +124,114 @@ using namespace llvm; STATISTIC(NumVectorInstructions, "Number of vector accesses generated"); STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized"); +namespace { + +// Equivalence class key, the initial tuple by which we group loads/stores. +// Loads/stores with different EqClassKeys are never merged. +// +// (We could in theory remove element-size from the this tuple. We'd just need +// to fix up the vector packing/unpacking code.) 
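+//
+// For example (illustrative only), given
+//   %p = alloca [8 x i32]
+//   %q = getelementptr i8, ptr %p, i64 4
+//   %a = load i32, ptr %p
+//   %b = load <2 x i32>, ptr %q
+//   store i32 0, ptr %p
+// the two loads share one equivalence class (same underlying object %p, same
+// address space, 32-bit scalar elements), while the store lands in a separate
+// class because IsLoad differs.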
+using EqClassKey = + std::tuple; +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const EqClassKey &K) { + const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K; + return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject + << " of element size " << ElementSize << " bits in addrspace " + << AddrSpace; +} + +// A Chain is a set of instructions such that: +// - All instructions have the same equivalence class, so in particular all are +// loads, or all are stores. +// - We know the address accessed by the i'th chain elem relative to the +// chain's leader instruction, which is the first instr of the chain in BB +// order. +// +// Chains have two canonical orderings: +// - BB order, sorted by Instr->comesBefore. +// - Offset order, sorted by OffsetFromLeader. +// This pass switches back and forth between these orders. +struct ChainElem { + Instruction *Inst; + APInt OffsetFromLeader; +}; +using Chain = SmallVector; + +void sortChainInBBOrder(Chain &C) { + sort(C, [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); }); +} + +void sortChainInOffsetOrder(Chain &C) { + sort(C, [](const auto &A, const auto &B) { + if (A.OffsetFromLeader != B.OffsetFromLeader) + return A.OffsetFromLeader.slt(B.OffsetFromLeader); + return A.Inst->comesBefore(B.Inst); // stable tiebreaker + }); +} + +void dumpChain(ArrayRef C) { + for (const auto &E : C) { + dbgs() << " " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n"; + } +} + +using EquivalenceClassMap = + MapVector>; + // FIXME: Assuming stack alignment of 4 is always good enough -static const unsigned StackAdjustedAlignment = 4; +constexpr unsigned StackAdjustedAlignment = 4; -namespace { +Instruction *propagateMetadata(Instruction *I, const Chain &C) { + SmallVector Values; + for (const ChainElem &E : C) + Values.push_back(E.Inst); + return propagateMetadata(I, Values); +} + +bool isInvariantLoad(const Instruction *I) { + const LoadInst *LI = dyn_cast(I); + return LI != nullptr && LI->hasMetadata(LLVMContext::MD_invariant_load); +} + +/// Reorders the instructions that I depends on (the instructions defining its +/// operands), to ensure they dominate I. +void reorder(Instruction *I) { + SmallPtrSet InstructionsToMove; + SmallVector Worklist; + + Worklist.push_back(I); + while (!Worklist.empty()) { + Instruction *IW = Worklist.pop_back_val(); + int NumOperands = IW->getNumOperands(); + for (int i = 0; i < NumOperands; i++) { + Instruction *IM = dyn_cast(IW->getOperand(i)); + if (!IM || IM->getOpcode() == Instruction::PHI) + continue; + + // If IM is in another BB, no need to move it, because this pass only + // vectorizes instructions within one BB. + if (IM->getParent() != I->getParent()) + continue; + + if (!IM->comesBefore(I)) { + InstructionsToMove.insert(IM); + Worklist.push_back(IM); + } + } + } -/// ChainID is an arbitrary token that is allowed to be different only for the -/// accesses that are guaranteed to be considered non-consecutive by -/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions -/// together and reducing the number of instructions the main search operates on -/// at a time, i.e. this is to reduce compile time and nothing else as the main -/// search has O(n^2) time complexity. The underlying type of ChainID should not -/// be relied upon. -using ChainID = const Value *; -using InstrList = SmallVector; -using InstrListMap = MapVector; + // All instructions to move should follow I. Start from I, not from begin(). 
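+  // (BBI is advanced before the move so that moving IM does not derail the
+  // iteration.)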
+ for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) { + Instruction *IM = &*(BBI++); + if (!InstructionsToMove.count(IM)) + continue; + IM->moveBefore(I); + } +} class Vectorizer { Function &F; @@ -117,6 +243,12 @@ class Vectorizer { const DataLayout &DL; IRBuilder<> Builder; + // We could erase instrs right after vectorizing them, but that can mess up + // our BB iterators, and also can make the equivalence class keys point to + // freed memory. This is fixable, but it's simpler just to wait until we're + // done with the BB and erase all at once. + SmallVector ToErase; + public: Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC, DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI) @@ -126,70 +258,80 @@ class Vectorizer { bool run(); private: - unsigned getPointerAddressSpace(Value *I); - static const unsigned MaxDepth = 3; - bool isConsecutiveAccess(Value *A, Value *B); - bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta, - unsigned Depth = 0) const; - bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta, - unsigned Depth) const; - bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta, - unsigned Depth) const; - - /// After vectorization, reorder the instructions that I depends on - /// (the instructions defining its operands), to ensure they dominate I. - void reorder(Instruction *I); - - /// Returns the first and the last instructions in Chain. - std::pair - getBoundaryInstrs(ArrayRef Chain); - - /// Erases the original instructions after vectorizing. - void eraseInstructions(ArrayRef Chain); - - /// "Legalize" the vector type that would be produced by combining \p - /// ElementSizeBits elements in \p Chain. Break into two pieces such that the - /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is - /// expected to have more than 4 elements. - std::pair, ArrayRef> - splitOddVectorElts(ArrayRef Chain, unsigned ElementSizeBits); - - /// Finds the largest prefix of Chain that's vectorizable, checking for - /// intervening instructions which may affect the memory accessed by the - /// instructions within Chain. + /// Runs the vectorizer on a "pseudo basic block", which is a range of + /// instructions [Begin, End) within one BB all of which have + /// isGuaranteedToTransferExecutionToSuccessor(I) == true. + bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End); + + /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores + /// in the same BB with the same value for getUnderlyingObject() etc. + bool runOnEquivalenceClass(const EqClassKey &EqClassKey, + ArrayRef EqClass); + + /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class + /// where all instructions access a known, constant offset from the first + /// instruction. + bool runOnChain(Chain &C); + + /// Splits the chain into subchains of instructions which read/write a + /// contiguous block of memory. Discards any length-1 subchains (because + /// there's nothing to vectorize in there). + std::vector splitChainByContiguity(Chain &C); + + /// Splits the chain into subchains where it's safe to hoist loads up to the + /// beginning of the sub-chain and it's safe to sink loads up to the end of + /// the sub-chain. Discards any length-1 subchains. + std::vector splitChainByMayAliasInstrs(Chain &C); + + /// Splits the chain into subchains that make legal, aligned accesses. + /// Discards any length-1 subchains. 
+ std::vector splitChainByAlignment(Chain &C); + + /// Converts the instrs in the chain into a single vectorized load or store. + /// Adds the old scalar loads/stores to ToErase. + bool vectorizeChain(Chain &C); + + /// Tries to compute the offset in bytes PtrB - PtrA. + std::optional getConstantOffset(Value *PtrA, Value *PtrB, + unsigned Depth = 0); + std::optional gtConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB, + unsigned Depth); + std::optional getConstantOffsetSelects(Value *PtrA, Value *PtrB, + unsigned Depth); + + /// Gets the element type of the vector that the chain will load or store. + /// This is nontrivial because the chain may contain elements of different + /// types; e.g. it's legal to have a chain that contains both i32 and float. + Type *getChainElemTy(const Chain &C); + + /// Determines whether ChainElem can be moved up (if IsLoad) or down (if + /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias + /// instructions. + /// + /// The map ChainElemOffsets must contain all of the elements in + /// [ChainBegin, ChainElem] and their offsets from some arbitrary base + /// address. It's ok if it contains additional entries. + template + bool isSafeToMove( + Instruction *ChainElem, Instruction *ChainBegin, + const DenseMap &ChainOffsets); + + /// Collects loads and stores grouped by "equivalence class", where: + /// - all elements in an eq class are a load or all are a store, + /// - they all load/store the same element size (it's OK to have e.g. i8 and + /// <4 x i8> in the same class, but not i32 and <4 x i8>), and + /// - they all have the same value for getUnderlyingObject(). + EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin, + BasicBlock::iterator End); + + /// Partitions Instrs into "chains" where every instruction has a known + /// constant offset from the first instr in the chain. /// - /// The elements of \p Chain must be all loads or all stores and must be in - /// address order. - ArrayRef getVectorizablePrefix(ArrayRef Chain); - - /// Collects load and store instructions to vectorize. - std::pair collectInstructions(BasicBlock *BB); - - /// Processes the collected instructions, the \p Map. The values of \p Map - /// should be all loads or all stores. - bool vectorizeChains(InstrListMap &Map); - - /// Finds the load/stores to consecutive memory addresses and vectorizes them. - bool vectorizeInstructions(ArrayRef Instrs); - - /// Vectorizes the load instructions in Chain. - bool - vectorizeLoadChain(ArrayRef Chain, - SmallPtrSet *InstructionsProcessed); - - /// Vectorizes the store instructions in Chain. - bool - vectorizeStoreChain(ArrayRef Chain, - SmallPtrSet *InstructionsProcessed); - - /// Check if this load/store access is misaligned accesses. - /// Returns a \p RelativeSpeed of an operation if allowed suitable to - /// compare to another result for the same \p AddressSpace and potentially - /// different \p Alignment and \p SzInBytes. - bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace, - Align Alignment, unsigned &RelativeSpeed); + /// Postcondition: For all i, ret[i][0].second == 0, because the first instr + /// in the chain is the leader, and an instr touches distance 0 from itself. 
+ std::vector gatherChains(ArrayRef Instrs); }; class LoadStoreVectorizerLegacyPass : public FunctionPass { @@ -197,7 +339,8 @@ class LoadStoreVectorizerLegacyPass : public FunctionPass { static char ID; LoadStoreVectorizerLegacyPass() : FunctionPass(ID) { - initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry()); + initializeLoadStoreVectorizerLegacyPassPass( + *PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; @@ -249,11 +392,11 @@ bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) { AssumptionCache &AC = getAnalysis().getAssumptionCache(F); - Vectorizer V(F, AA, AC, DT, SE, TTI); - return V.run(); + return Vectorizer(F, AA, AC, DT, SE, TTI).run(); } -PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) { +PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, + FunctionAnalysisManager &AM) { // Don't vectorize when the attribute NoImplicitFloat is used. if (F.hasFnAttribute(Attribute::NoImplicitFloat)) return PreservedAnalyses::all(); @@ -264,125 +407,678 @@ PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisMana TargetTransformInfo &TTI = AM.getResult(F); AssumptionCache &AC = AM.getResult(F); - Vectorizer V(F, AA, AC, DT, SE, TTI); - bool Changed = V.run(); + bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run(); PreservedAnalyses PA; PA.preserveSet(); return Changed ? PA : PreservedAnalyses::all(); } -// The real propagateMetadata expects a SmallVector, but we deal in -// vectors of Instructions. -static void propagateMetadata(Instruction *I, ArrayRef IL) { - SmallVector VL(IL.begin(), IL.end()); - propagateMetadata(I, VL); -} - -// Vectorizer Implementation bool Vectorizer::run() { bool Changed = false; - - // Scan the blocks in the function in post order. + // Break up the BB if there are any instrs which aren't guaranteed to transfer + // execution to their successor. + // + // Consider, for example: + // + // def assert_arr_len(int n) { if (n < 2) exit(); } + // + // load arr[0] + // call assert_array_len(arr.length) + // load arr[1] + // + // Even though assert_arr_len does not read or write any memory, we can't + // speculate the second load before the call. More info at + // https://github.com/llvm/llvm-project/issues/52950. for (BasicBlock *BB : post_order(&F)) { - InstrListMap LoadRefs, StoreRefs; - std::tie(LoadRefs, StoreRefs) = collectInstructions(BB); - Changed |= vectorizeChains(LoadRefs); - Changed |= vectorizeChains(StoreRefs); + // BB must at least have a terminator. 
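+    // A barrier instruction becomes the first instruction of the pseudo-BB
+    // that follows it (the ranges built below are [Barriers[i],
+    // Barriers[i+1])), so no access is ever hoisted or sunk across one.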
+ assert(!BB->empty()); + + SmallVector Barriers; + Barriers.push_back(BB->begin()); + for (Instruction &I : *BB) + if (!isGuaranteedToTransferExecutionToSuccessor(&I)) + Barriers.push_back(I.getIterator()); + Barriers.push_back(BB->end()); + + for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End; + ++It) + Changed |= runOnPseudoBB(*It, *std::next(It)); + + for (Instruction *I : ToErase) { + auto *PtrOperand = getLoadStorePointerOperand(I); + if (I->use_empty()) + I->eraseFromParent(); + RecursivelyDeleteTriviallyDeadInstructions(PtrOperand); + } + ToErase.clear(); } return Changed; } -unsigned Vectorizer::getPointerAddressSpace(Value *I) { - if (LoadInst *L = dyn_cast(I)) - return L->getPointerAddressSpace(); - if (StoreInst *S = dyn_cast(I)) - return S->getPointerAddressSpace(); - return -1; +bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin, + BasicBlock::iterator End) { + LLVM_DEBUG({ + dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... "; + if (End != Begin->getParent()->end()) + dbgs() << *End; + else + dbgs() << ""; + dbgs() << ")\n"; + }); + + bool Changed = false; + for (const auto &[EqClassKey, EqClass] : + collectEquivalenceClasses(Begin, End)) + Changed |= runOnEquivalenceClass(EqClassKey, EqClass); + + return Changed; } -// FIXME: Merge with llvm::isConsecutiveAccess -bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { - Value *PtrA = getLoadStorePointerOperand(A); - Value *PtrB = getLoadStorePointerOperand(B); - unsigned ASA = getPointerAddressSpace(A); - unsigned ASB = getPointerAddressSpace(B); +bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey, + ArrayRef EqClass) { + bool Changed = false; - // Check that the address spaces match and that the pointers are valid. - if (!PtrA || !PtrB || (ASA != ASB)) - return false; + LLVM_DEBUG({ + dbgs() << "LSV: Running on equivalence class of size " << EqClass.size() + << " keyed on " << EqClassKey << ":\n"; + for (Instruction *I : EqClass) + dbgs() << " " << *I << "\n"; + }); - // Make sure that A and B are different pointers of the same size type. - Type *PtrATy = getLoadStoreType(A); - Type *PtrBTy = getLoadStoreType(B); - if (PtrA == PtrB || - PtrATy->isVectorTy() != PtrBTy->isVectorTy() || - DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) || - DL.getTypeStoreSize(PtrATy->getScalarType()) != - DL.getTypeStoreSize(PtrBTy->getScalarType())) - return false; + std::vector Chains = gatherChains(EqClass); + LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size() + << " nontrivial chains.\n";); + for (Chain &C : Chains) + Changed |= runOnChain(C); + return Changed; +} - unsigned PtrOffsetWidth = DL.getIndexSizeInBits(ASA); - APInt Size(PtrOffsetWidth, DL.getTypeStoreSize(PtrATy)); +bool Vectorizer::runOnChain(Chain &C) { + LLVM_DEBUG({ + dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n"; + dumpChain(C); + }); - return areConsecutivePointers(PtrA, PtrB, Size); + // Split up the chain into increasingly smaller chains, until we can finally + // vectorize the chains. + // + // (Don't be scared by the depth of the loop nest here. These operations are + // all at worst O(n lg n) in the number of instructions, and splitting chains + // doesn't change the number of instrs. So the whole loop nest is O(n lg n).) 
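+  //
+  // For example (illustrative only): a chain of i32 loads at offsets
+  // {0, 4, 8, 16} from its leader, with a may-aliasing store sitting between
+  // the accesses at 8 and 16, is first split into {0, 4, 8} and {16}; the
+  // singleton {16} is dropped, {0, 4, 8} is already contiguous, and
+  // splitChainByAlignment then keeps whatever legally-sized piece the target
+  // can actually load, e.g. the 8 bytes at {0, 4}.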
+ bool Changed = false; + for (auto &C : splitChainByMayAliasInstrs(C)) + for (auto &C : splitChainByContiguity(C)) + for (auto &C : splitChainByAlignment(C)) + Changed |= vectorizeChain(C); + return Changed; } -bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB, - APInt PtrDelta, unsigned Depth) const { - unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType()); - APInt OffsetA(OffsetBitWidth, 0); - APInt OffsetB(OffsetBitWidth, 0); - PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); - PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); +std::vector Vectorizer::splitChainByMayAliasInstrs(Chain &C) { + if (C.empty()) + return {}; - unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType()); + sortChainInBBOrder(C); - if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType())) + LLVM_DEBUG({ + dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n"; + dumpChain(C); + }); + + // We know that elements in the chain with nonverlapping offsets can't + // alias, but AA may not be smart enough to figure this out. Use a + // hashtable. + DenseMap ChainOffsets; + for (const auto &E : C) + ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader}); + + // Loads get hoisted up to the first load in the chain. Stores get sunk + // down to the last store in the chain. Our algorithm for loads is: + // + // - Take the first element of the chain. This is the start of a new chain. + // - Take the next element of `Chain` and check for may-alias instructions + // up to the start of NewChain. If no may-alias instrs, add it to + // NewChain. Otherwise, start a new NewChain. + // + // For stores it's the same except in the reverse direction. + // + // We expect IsLoad to be an std::bool_constant. + auto Impl = [&](auto IsLoad) { + // MSVC is unhappy if IsLoad is a capture, so pass it as an arg. + auto [ChainBegin, ChainEnd] = [&](auto IsLoad) { + if constexpr (IsLoad()) + return std::make_pair(C.begin(), C.end()); + else + return std::make_pair(C.rbegin(), C.rend()); + }(IsLoad); + assert(ChainBegin != ChainEnd); + + std::vector Chains; + SmallVector NewChain; + NewChain.push_back(*ChainBegin); + for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) { + if (isSafeToMove(ChainIt->Inst, NewChain.front().Inst, + ChainOffsets)) { + LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge " + << *ChainIt->Inst << " into " << *ChainBegin->Inst + << "\n"); + NewChain.push_back(*ChainIt); + } else { + LLVM_DEBUG( + dbgs() << "LSV: Found intervening may-alias instrs; cannot merge " + << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n"); + if (NewChain.size() > 1) { + LLVM_DEBUG({ + dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n"; + dumpChain(NewChain); + }); + Chains.push_back(std::move(NewChain)); + } + + // Start a new chain. 
+ NewChain = SmallVector({*ChainIt}); + } + } + if (NewChain.size() > 1) { + LLVM_DEBUG({ + dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n"; + dumpChain(NewChain); + }); + Chains.push_back(std::move(NewChain)); + } + return Chains; + }; + + if (isa(C[0].Inst)) + return Impl(/*IsLoad=*/std::bool_constant()); + + assert(isa(C[0].Inst)); + return Impl(/*IsLoad=*/std::bool_constant()); +} + +std::vector Vectorizer::splitChainByContiguity(Chain &C) { + if (C.empty()) + return {}; + + sortChainInOffsetOrder(C); + + LLVM_DEBUG({ + dbgs() << "LSV: splitChainByContiguity considering chain:\n"; + dumpChain(C); + }); + + std::vector Ret; + Ret.push_back({C.front()}); + + for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) { + // `prev` accesses offsets [PrevDistFromBase, PrevReadEnd). + auto &CurChain = Ret.back(); + const ChainElem &Prev = CurChain.back(); + unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst)); + assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by " + "collectEquivalenceClass"); + APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8; + + // Add this instruction to the end of the current chain, or start a new one. + bool AreContiguous = It->OffsetFromLeader == PrevReadEnd; + LLVM_DEBUG(dbgs() << "LSV: Instructions are " + << (AreContiguous ? "" : "not ") << "contiguous: " + << *Prev.Inst << " (ends at offset " << PrevReadEnd + << ") -> " << *It->Inst << " (starts at offset " + << It->OffsetFromLeader << ")\n"); + if (AreContiguous) + CurChain.push_back(*It); + else + Ret.push_back({*It}); + } + + // Filter out length-1 chains, these are uninteresting. + llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; }); + return Ret; +} + +Type *Vectorizer::getChainElemTy(const Chain &C) { + assert(!C.empty()); + // The rules are: + // - If there are any pointer types in the chain, use an integer type. + // - Prefer an integer type if it appears in the chain. + // - Otherwise, use the first type in the chain. + // + // The rule about pointer types is a simplification when we merge e.g. a load + // of a ptr and a double. There's no direct conversion from a ptr to a + // double; it requires a ptrtoint followed by a bitcast. + // + // It's unclear to me if the other rules have any practical effect, but we do + // it to match this pass's previous behavior. + if (any_of(C, [](const ChainElem &E) { + return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy(); + })) { + return Type::getIntNTy( + F.getContext(), + DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType())); + } + + for (const ChainElem &E : C) + if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy()) + return T; + return getLoadStoreType(C[0].Inst)->getScalarType(); +} + +std::vector Vectorizer::splitChainByAlignment(Chain &C) { + // We use a simple greedy algorithm. + // - Given a chain of length N, find all prefixes that + // (a) are not longer than the max register length, and + // (b) are a power of 2. + // - Starting from the longest prefix, try to create a vector of that length. + // - If one of them works, great. Repeat the algorithm on any remaining + // elements in the chain. + // - If none of them work, discard the first element and repeat on a chain + // of length N-1. 
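+  //
+  // For example (roughly): with a 16-byte vector register and a contiguous
+  // run of six i32 accesses, the candidate windows starting at element 0
+  // cover 8, 12, and 16 bytes. They are tried longest-first, so if the
+  // 16-byte <4 x i32> access is legal and fast enough it is emitted, and the
+  // search resumes at element 4 with the remaining 8 bytes.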
+ if (C.empty()) + return {}; + + sortChainInOffsetOrder(C); + + LLVM_DEBUG({ + dbgs() << "LSV: splitChainByAlignment considering chain:\n"; + dumpChain(C); + }); + + bool IsLoadChain = isa(C[0].Inst); + auto getVectorFactor = [&](unsigned VF, unsigned LoadStoreSize, + unsigned ChainSizeBytes, VectorType *VecTy) { + return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize, + ChainSizeBytes, VecTy) + : TTI.getStoreVectorFactor(VF, LoadStoreSize, + ChainSizeBytes, VecTy); + }; + +#ifndef NDEBUG + for (const auto &E : C) { + Type *Ty = getLoadStoreType(E.Inst)->getScalarType(); + assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) && + "Should have filtered out non-power-of-two elements in " + "collectEquivalenceClasses."); + } +#endif + + unsigned AS = getLoadStoreAddressSpace(C[0].Inst); + unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8; + + std::vector Ret; + for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) { + // Find candidate chains of size not greater than the largest vector reg. + // These chains are over the closed interval [CBegin, CEnd]. + SmallVector, 8> + CandidateChains; + for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) { + APInt Sz = C[CEnd].OffsetFromLeader + + DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst)) - + C[CBegin].OffsetFromLeader; + if (Sz.sgt(VecRegBytes)) + break; + CandidateChains.push_back( + {CEnd, static_cast(Sz.getLimitedValue())}); + } + + // Consider the longest chain first. + for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend(); + It != End; ++It) { + auto [CEnd, SizeBytes] = *It; + LLVM_DEBUG( + dbgs() << "LSV: splitChainByAlignment considering candidate chain [" + << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n"); + + Type *VecElemTy = getChainElemTy(C); + // Note, VecElemTy is a power of 2, but might be less than one byte. For + // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case + // VecElemTy would be i4. + unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy); + + // SizeBytes and VecElemBits are powers of 2, so they divide evenly. + assert((8 * SizeBytes) % VecElemBits == 0); + unsigned NumVecElems = 8 * SizeBytes / VecElemBits; + FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems); + unsigned VF = 8 * VecRegBytes / VecElemBits; + + // Check that TTI is happy with this vectorization factor. + unsigned TargetVF = getVectorFactor(VF, VecElemBits, + VecElemBits * NumVecElems / 8, VecTy); + if (TargetVF != VF && TargetVF < NumVecElems) { + LLVM_DEBUG( + dbgs() << "LSV: splitChainByAlignment discarding candidate chain " + "because TargetVF=" + << TargetVF << " != VF=" << VF + << " and TargetVF < NumVecElems=" << NumVecElems << "\n"); + continue; + } + + // Is a load/store with this alignment allowed by TTI and at least as fast + // as an unvectorized load/store? + // + // TTI and F are passed as explicit captures to WAR an MSVC misparse (??). 
+ auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI, + &F = F](Align Alignment) { + if (Alignment.value() % SizeBytes == 0) + return true; + unsigned VectorizedSpeed = 0; + bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses( + F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed); + if (!AllowsMisaligned) { + LLVM_DEBUG(dbgs() + << "LSV: Access of " << SizeBytes << "B in addrspace " + << AS << " with alignment " << Alignment.value() + << " is misaligned, and therefore can't be vectorized.\n"); + return false; + } + + unsigned ElementwiseSpeed = 0; + (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS, + Alignment, &ElementwiseSpeed); + if (VectorizedSpeed < ElementwiseSpeed) { + LLVM_DEBUG(dbgs() + << "LSV: Access of " << SizeBytes << "B in addrspace " + << AS << " with alignment " << Alignment.value() + << " has relative speed " << VectorizedSpeed + << ", which is lower than the elementwise speed of " + << ElementwiseSpeed + << ". Therefore this access won't be vectorized.\n"); + return false; + } + return true; + }; + + // If we're loading/storing from an alloca, align it if possible. + // + // FIXME: We eagerly upgrade the alignment, regardless of whether TTI + // tells us this is beneficial. This feels a bit odd, but it matches + // existing tests. This isn't *so* bad, because at most we align to 4 + // bytes (current value of StackAdjustedAlignment). + // + // FIXME: We will upgrade the alignment of the alloca even if it turns out + // we can't vectorize for some other reason. + Align Alignment = getLoadStoreAlignment(C[CBegin].Inst); + if (AS == DL.getAllocaAddrSpace() && Alignment.value() % SizeBytes != 0 && + IsAllowedAndFast(Align(StackAdjustedAlignment))) { + Align NewAlign = getOrEnforceKnownAlignment( + getLoadStorePointerOperand(C[CBegin].Inst), + Align(StackAdjustedAlignment), DL, C[CBegin].Inst, nullptr, &DT); + if (NewAlign >= Alignment) { + LLVM_DEBUG(dbgs() + << "LSV: splitByChain upgrading alloca alignment from " + << Alignment.value() << " to " << NewAlign.value() + << "\n"); + Alignment = NewAlign; + } + } + + if (!IsAllowedAndFast(Alignment)) { + LLVM_DEBUG( + dbgs() << "LSV: splitChainByAlignment discarding candidate chain " + "because its alignment is not AllowedAndFast: " + << Alignment.value() << "\n"); + continue; + } + + if ((IsLoadChain && + !TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) || + (!IsLoadChain && + !TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) { + LLVM_DEBUG( + dbgs() << "LSV: splitChainByAlignment discarding candidate chain " + "because !isLegalToVectorizeLoad/StoreChain."); + continue; + } + + // Hooray, we can vectorize this chain! + Chain &NewChain = Ret.emplace_back(); + for (unsigned I = CBegin; I <= CEnd; ++I) + NewChain.push_back(C[I]); + CBegin = CEnd; // Skip over the instructions we've added to the chain. + break; + } + } + return Ret; +} + +bool Vectorizer::vectorizeChain(Chain &C) { + if (C.size() < 2) return false; - // In case if we have to shrink the pointer - // stripAndAccumulateInBoundsConstantOffsets should properly handle a - // possible overflow and the value should fit into a smallest data type - // used in the cast/gep chain. 
- assert(OffsetA.getSignificantBits() <= NewPtrBitWidth && - OffsetB.getSignificantBits() <= NewPtrBitWidth); + sortChainInOffsetOrder(C); - OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth); - OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth); - PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth); + LLVM_DEBUG({ + dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n"; + dumpChain(C); + }); - APInt OffsetDelta = OffsetB - OffsetA; + Type *VecElemTy = getChainElemTy(C); + bool IsLoadChain = isa(C[0].Inst); + unsigned AS = getLoadStoreAddressSpace(C[0].Inst); + unsigned ChainBytes = std::accumulate( + C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) { + return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst)); + }); + assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0); + // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller + // than 1 byte (e.g. VecTy == <32 x i1>). + Type *VecTy = FixedVectorType::get( + VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy)); + + Align Alignment = getLoadStoreAlignment(C[0].Inst); + // If this is a load/store of an alloca, we might have upgraded the alloca's + // alignment earlier. Get the new alignment. + if (AS == DL.getAllocaAddrSpace()) { + Alignment = std::max( + Alignment, + getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst), + MaybeAlign(), DL, C[0].Inst, nullptr, &DT)); + } - // Check if they are based on the same pointer. That makes the offsets - // sufficient. - if (PtrA == PtrB) - return OffsetDelta == PtrDelta; - - // Compute the necessary base pointer delta to have the necessary final delta - // equal to the pointer delta requested. - APInt BaseDelta = PtrDelta - OffsetDelta; - - // Compute the distance with SCEV between the base pointers. - const SCEV *PtrSCEVA = SE.getSCEV(PtrA); - const SCEV *PtrSCEVB = SE.getSCEV(PtrB); - const SCEV *C = SE.getConstant(BaseDelta); - const SCEV *X = SE.getAddExpr(PtrSCEVA, C); - if (X == PtrSCEVB) + // All elements of the chain must have the same scalar-type size. +#ifndef NDEBUG + for (const ChainElem &E : C) + assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) == + DL.getTypeStoreSize(VecElemTy)); +#endif + + Instruction *VecInst; + if (IsLoadChain) { + // Loads get hoisted to the location of the first load in the chain. We may + // also need to hoist the (transitive) operands of the loads. + Builder.SetInsertPoint( + std::min_element(C.begin(), C.end(), [](const auto &A, const auto &B) { + return A.Inst->comesBefore(B.Inst); + })->Inst); + + // Chain is in offset order, so C[0] is the instr with the lowest offset, + // i.e. the root of the vector. 
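+    //
+    // E.g. for two contiguous i32 loads this emits, roughly:
+    //   %vptr = bitcast ptr %p0 to <2 x i32>*
+    //   %vec  = load <2 x i32>, ptr %vptr
+    //   %l0   = extractelement <2 x i32> %vec, i32 0
+    //   %l1   = extractelement <2 x i32> %vec, i32 1
+    // with bit/pointer casts added wherever a scalar's type differs from the
+    // vector element type.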
+ Value *Bitcast = Builder.CreateBitCast( + getLoadStorePointerOperand(C[0].Inst), VecTy->getPointerTo(AS)); + VecInst = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment); + + unsigned VecIdx = 0; + for (const ChainElem &E : C) { + Instruction *I = E.Inst; + Value *V; + Type *T = getLoadStoreType(I); + if (auto *VT = dyn_cast(T)) { + auto Mask = llvm::to_vector<8>( + llvm::seq(VecIdx, VecIdx + VT->getNumElements())); + V = Builder.CreateShuffleVector(VecInst, Mask, I->getName()); + VecIdx += VT->getNumElements(); + } else { + V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx), + I->getName()); + ++VecIdx; + } + if (V->getType() != I->getType()) + V = Builder.CreateBitOrPointerCast(V, I->getType()); + I->replaceAllUsesWith(V); + } + + // Finally, we need to reorder the instrs in the BB so that the (transitive) + // operands of VecInst appear before it. To see why, suppose we have + // vectorized the following code: + // + // ptr1 = gep a, 1 + // load1 = load i32 ptr1 + // ptr0 = gep a, 0 + // load0 = load i32 ptr0 + // + // We will put the vectorized load at the location of the earliest load in + // the BB, i.e. load1. We get: + // + // ptr1 = gep a, 1 + // loadv = load <2 x i32> ptr0 + // load0 = extractelement loadv, 0 + // load1 = extractelement loadv, 1 + // ptr0 = gep a, 0 + // + // Notice that loadv uses ptr0, which is defined *after* it! + reorder(VecInst); + } else { + // Stores get sunk to the location of the last store in the chain. + Builder.SetInsertPoint( + std::max_element(C.begin(), C.end(), [](auto &A, auto &B) { + return A.Inst->comesBefore(B.Inst); + })->Inst); + + // Build the vector to store. + Value *Vec = PoisonValue::get(VecTy); + unsigned VecIdx = 0; + auto InsertElem = [&](Value *V) { + if (V->getType() != VecElemTy) + V = Builder.CreateBitOrPointerCast(V, VecElemTy); + Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++)); + }; + for (const ChainElem &E : C) { + auto I = cast(E.Inst); + if (FixedVectorType *VT = + dyn_cast(getLoadStoreType(I))) { + for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) { + InsertElem(Builder.CreateExtractElement(I->getValueOperand(), + Builder.getInt32(J))); + } + } else { + InsertElem(I->getValueOperand()); + } + } + + // Chain is in offset order, so C[0] is the instr with the lowest offset, + // i.e. the root of the vector. + VecInst = Builder.CreateAlignedStore( + Vec, + Builder.CreateBitCast(getLoadStorePointerOperand(C[0].Inst), + VecTy->getPointerTo(AS)), + Alignment); + } + + propagateMetadata(VecInst, C); + + for (const ChainElem &E : C) + ToErase.push_back(E.Inst); + + ++NumVectorInstructions; + NumScalarsVectorized += C.size(); + return true; +} + +template +bool Vectorizer::isSafeToMove( + Instruction *ChainElem, Instruction *ChainBegin, + const DenseMap &ChainOffsets) { + LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> " + << *ChainBegin << ")\n"); + + assert(isa(ChainElem) == IsLoadChain); + if (ChainElem == ChainBegin) return true; - // The above check will not catch the cases where one of the pointers is - // factorized but the other one is not, such as (C + (S * (A + B))) vs - // (AS + BS). Get the minus scev. That will allow re-combining the expresions - // and getting the simplified difference. - const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA); - if (C == Dist) + // Invariant loads can always be reordered; by definition they are not + // clobbered by stores. 
+ if (isInvariantLoad(ChainElem)) return true; - // Sometimes even this doesn't work, because SCEV can't always see through - // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking - // things the hard way. - return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth); + auto BBIt = std::next([&] { + if constexpr (IsLoadChain) + return BasicBlock::reverse_iterator(ChainElem); + else + return BasicBlock::iterator(ChainElem); + }()); + auto BBItEnd = std::next([&] { + if constexpr (IsLoadChain) + return BasicBlock::reverse_iterator(ChainBegin); + else + return BasicBlock::iterator(ChainBegin); + }()); + + const APInt &ChainElemOffset = ChainOffsets.at(ChainElem); + const unsigned ChainElemSize = + DL.getTypeStoreSize(getLoadStoreType(ChainElem)); + + for (; BBIt != BBItEnd; ++BBIt) { + Instruction *I = &*BBIt; + + if (!I->mayReadOrWriteMemory()) + continue; + + // Loads can be reordered with other loads. + if (IsLoadChain && isa(I)) + continue; + + // Stores can be sunk below invariant loads. + if (!IsLoadChain && isInvariantLoad(I)) + continue; + + // If I is in the chain, we can tell whether it aliases ChainIt by checking + // what offset ChainIt accesses. This may be better than AA is able to do. + // + // We should really only have duplicate offsets for stores (the duplicate + // loads should be CSE'ed), but in case we have a duplicate load, we'll + // split the chain so we don't have to handle this case specially. + if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) { + // I and ChainElem overlap if: + // - I and ChainElem have the same offset, OR + // - I's offset is less than ChainElem's, but I touches past the + // beginning of ChainElem, OR + // - ChainElem's offset is less than I's, but ChainElem touches past the + // beginning of I. + const APInt &IOffset = OffsetIt->second; + unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I)); + if (IOffset == ChainElemOffset || + (IOffset.sle(ChainElemOffset) && + (IOffset + IElemSize).sgt(ChainElemOffset)) || + (ChainElemOffset.sle(IOffset) && + (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) { + LLVM_DEBUG({ + // Double check that AA also sees this alias. If not, we probably + // have a bug. + ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem)); + assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)); + dbgs() << "LSV: Found alias in chain: " << *I << "\n"; + }); + return false; // We found an aliasing instruction; bail. + } + + continue; // We're confident there's no alias. + } + + LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n"); + ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem)); + if (IsLoadChain ? 
isModSet(MR) : isModOrRefSet(MR)) { + LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n" + << " Aliasing instruction:\n" + << " " << *I << '\n' + << " Aliased instruction and pointer:\n" + << " " << *ChainElem << '\n' + << " " << *getLoadStorePointerOperand(ChainElem) + << '\n'); + + return false; + } + } + return true; } static bool checkNoWrapFlags(Instruction *I, bool Signed) { @@ -394,10 +1090,14 @@ static bool checkNoWrapFlags(Instruction *I, bool Signed) { static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA, unsigned MatchingOpIdxA, Instruction *AddOpB, unsigned MatchingOpIdxB, bool Signed) { - // If both OpA and OpB is an add with NSW/NUW and with - // one of the operands being the same, we can guarantee that the - // transformation is safe if we can prove that OpA won't overflow when - // IdxDiff added to the other operand of OpA. + LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff + << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA=" + << MatchingOpIdxA << ", AddOpB=" << *AddOpB + << ", MatchingOpIdxB=" << MatchingOpIdxB + << ", Signed=" << Signed << "\n"); + // If both OpA and OpB are adds with NSW/NUW and with one of the operands + // being the same, we can guarantee that the transformation is safe if we can + // prove that OpA won't overflow when Ret added to the other operand of OpA. // For example: // %tmp7 = add nsw i32 %tmp2, %v0 // %tmp8 = sext i32 %tmp7 to i64 @@ -406,10 +1106,9 @@ static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA, // %tmp12 = add nsw i32 %tmp2, %tmp11 // %tmp13 = sext i32 %tmp12 to i64 // - // Both %tmp7 and %tmp2 has the nsw flag and the first operand - // is %tmp2. It's guaranteed that adding 1 to %tmp7 won't overflow - // because %tmp11 adds 1 to %v0 and both %tmp11 and %tmp12 has the - // nsw flag. + // Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2. + // It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds + // 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag. assert(AddOpA->getOpcode() == Instruction::Add && AddOpB->getOpcode() == Instruction::Add && checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed)); @@ -460,24 +1159,26 @@ static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA, return false; } -bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB, - APInt PtrDelta, - unsigned Depth) const { +std::optional Vectorizer::gtConstantOffsetComplexAddrs(Value *PtrA, + Value *PtrB, + unsigned Depth) { + LLVM_DEBUG(dbgs() << "LSV: gtConstantOffsetComplexAddrs PtrA=" << *PtrA + << " PtrB=" << *PtrB << " Depth=" << Depth << "\n"); auto *GEPA = dyn_cast(PtrA); auto *GEPB = dyn_cast(PtrB); if (!GEPA || !GEPB) - return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth); + return getConstantOffsetSelects(PtrA, PtrB, Depth); // Look through GEPs after checking they're the same except for the last // index. 
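+  // For example (roughly), this handles pairs like
+  //   %gepA = getelementptr i32, ptr %base, i64 %x
+  //   %gepB = getelementptr i32, ptr %base, i64 %y
+  // where %x and %y are sext/zext of values whose difference SCEV can prove
+  // to be a constant; the result is that constant times the stride of the
+  // indexed type (4 bytes here), provided adding the difference to the
+  // narrow index cannot overflow.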
if (GEPA->getNumOperands() != GEPB->getNumOperands() || GEPA->getPointerOperand() != GEPB->getPointerOperand()) - return false; + return std::nullopt; gep_type_iterator GTIA = gep_type_begin(GEPA); gep_type_iterator GTIB = gep_type_begin(GEPB); for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) { if (GTIA.getOperand() != GTIB.getOperand()) - return false; + return std::nullopt; ++GTIA; ++GTIB; } @@ -486,23 +1187,13 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB, Instruction *OpB = dyn_cast(GTIB.getOperand()); if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() || OpA->getType() != OpB->getType()) - return false; + return std::nullopt; - if (PtrDelta.isNegative()) { - if (PtrDelta.isMinSignedValue()) - return false; - PtrDelta.negate(); - std::swap(OpA, OpB); - } uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType()); - if (PtrDelta.urem(Stride) != 0) - return false; - unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits(); - APInt IdxDiff = PtrDelta.udiv(Stride).zext(IdxBitWidth); // Only look through a ZExt/SExt. if (!isa(OpA) && !isa(OpA)) - return false; + return std::nullopt; bool Signed = isa(OpA); @@ -510,7 +1201,21 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB, Value *ValA = OpA->getOperand(0); OpB = dyn_cast(OpB->getOperand(0)); if (!OpB || ValA->getType() != OpB->getType()) - return false; + return std::nullopt; + + const SCEV *OffsetSCEVA = SE.getSCEV(ValA); + const SCEV *OffsetSCEVB = SE.getSCEV(OpB); + const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA); + if (IdxDiffSCEV == SE.getCouldNotCompute()) + return std::nullopt; + + ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV); + if (!IdxDiffRange.isSingleElement()) + return std::nullopt; + APInt IdxDiff = *IdxDiffRange.getSingleElement(); + + LLVM_DEBUG(dbgs() << "LSV: gtConstantOffsetComplexAddrs IdxDiff=" << IdxDiff + << "\n"); // Now we need to prove that adding IdxDiff to ValA won't overflow. bool Safe = false; @@ -529,10 +1234,9 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB, if (!Safe && OpA && OpA->getOpcode() == Instruction::Add && OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) && checkNoWrapFlags(OpB, Signed)) { - // In the checks below a matching operand in OpA and OpB is - // an operand which is the same in those two instructions. - // Below we account for possible orders of the operands of - // these add instructions. + // In the checks below a matching operand in OpA and OpB is an operand which + // is the same in those two instructions. Below we account for possible + // orders of the operands of these add instructions. for (unsigned MatchingOpIdxA : {0, 1}) for (unsigned MatchingOpIdxB : {0, 1}) if (!Safe) @@ -543,804 +1247,255 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB, unsigned BitWidth = ValA->getType()->getScalarSizeInBits(); // Third attempt: - // If all set bits of IdxDiff or any higher order bit other than the sign bit - // are known to be zero in ValA, we can add Diff to it while guaranteeing no - // overflow of any sort. + // + // Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher + // order bit other than the sign bit are known to be zero in ValA, we can add + // Diff to it while guaranteeing no overflow of any sort. + // + // If IdxDiff is negative, do the same, but swap ValA and ValB. 
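+  // For example (roughly): if IdxDiff is 1 and the low bit of ValA is known
+  // to be zero, adding 1 only sets that low bit and cannot carry into higher
+  // bits, so neither signed nor unsigned overflow is possible.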
if (!Safe) { + // When computing known bits, use the GEPs as context instructions, since + // they likely are in the same BB as the load/store. + Instruction *ContextInst = GEPA->comesBefore(GEPB) ? GEPB : GEPA; KnownBits Known(BitWidth); - computeKnownBits(ValA, Known, DL, 0, &AC, OpB, &DT); + computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, 0, &AC, + ContextInst, &DT); APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth()); if (Signed) BitsAllowedToBeSet.clearBit(BitWidth - 1); - if (BitsAllowedToBeSet.ult(IdxDiff)) - return false; + if (BitsAllowedToBeSet.ult(IdxDiff.abs())) + return std::nullopt; + Safe = true; } - const SCEV *OffsetSCEVA = SE.getSCEV(ValA); - const SCEV *OffsetSCEVB = SE.getSCEV(OpB); - const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth)); - const SCEV *X = SE.getAddExpr(OffsetSCEVA, C); - return X == OffsetSCEVB; + if (Safe) + return IdxDiff * Stride; + return std::nullopt; } -bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB, - const APInt &PtrDelta, - unsigned Depth) const { +std::optional +Vectorizer::getConstantOffsetSelects(Value *PtrA, Value *PtrB, unsigned Depth) { if (Depth++ == MaxDepth) - return false; + return std::nullopt; if (auto *SelectA = dyn_cast(PtrA)) { if (auto *SelectB = dyn_cast(PtrB)) { - return SelectA->getCondition() == SelectB->getCondition() && - areConsecutivePointers(SelectA->getTrueValue(), - SelectB->getTrueValue(), PtrDelta, Depth) && - areConsecutivePointers(SelectA->getFalseValue(), - SelectB->getFalseValue(), PtrDelta, Depth); + if (SelectA->getCondition() != SelectB->getCondition()) + return std::nullopt; + LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA + << ", PtrB=" << *PtrB << ", Depth=" << Depth << "\n"); + std::optional TrueDiff = getConstantOffset( + SelectA->getTrueValue(), SelectB->getTrueValue(), Depth); + if (!TrueDiff.has_value()) + return std::nullopt; + std::optional FalseDiff = getConstantOffset( + SelectA->getFalseValue(), SelectB->getFalseValue(), Depth); + if (TrueDiff == FalseDiff) + return TrueDiff; } } - return false; + return std::nullopt; } -void Vectorizer::reorder(Instruction *I) { - SmallPtrSet InstructionsToMove; - SmallVector Worklist; - - Worklist.push_back(I); - while (!Worklist.empty()) { - Instruction *IW = Worklist.pop_back_val(); - int NumOperands = IW->getNumOperands(); - for (int i = 0; i < NumOperands; i++) { - Instruction *IM = dyn_cast(IW->getOperand(i)); - if (!IM || IM->getOpcode() == Instruction::PHI) - continue; - - // If IM is in another BB, no need to move it, because this pass only - // vectorizes instructions within one BB. - if (IM->getParent() != I->getParent()) - continue; - - if (!IM->comesBefore(I)) { - InstructionsToMove.insert(IM); - Worklist.push_back(IM); - } +EquivalenceClassMap +Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin, + BasicBlock::iterator End) { + EquivalenceClassMap Ret; + + auto getUnderlyingObject = [](const Value *Ptr) -> const Value * { + const Value *ObjPtr = llvm::getUnderlyingObject(Ptr); + if (const auto *Sel = dyn_cast(ObjPtr)) { + // The select's themselves are distinct instructions even if they share + // the same condition and evaluate to consecutive pointers for true and + // false values of the condition. Therefore using the select's themselves + // for grouping instructions would put consecutive accesses into different + // lists and they won't be even checked for being consecutive, and won't + // be vectorized. 
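+      //
+      // For example (roughly):
+      //   %sel0 = select i1 %c, ptr %a0, ptr %b0
+      //   %sel1 = select i1 %c, ptr %a1, ptr %b1
+      //   %v0 = load i32, ptr %sel0
+      //   %v1 = load i32, ptr %sel1
+      // Keying both loads on %c keeps them in one class, so they can still
+      // be recognized as consecutive for either value of %c.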
+ return Sel->getCondition(); } - } + return ObjPtr; + }; - // All instructions to move should follow I. Start from I, not from begin(). - for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E; - ++BBI) { - if (!InstructionsToMove.count(&*BBI)) + for (Instruction &I : make_range(Begin, End)) { + auto *LI = dyn_cast(&I); + auto *SI = dyn_cast(&I); + if (!LI && !SI) continue; - Instruction *IM = &*BBI; - --BBI; - IM->removeFromParent(); - IM->insertBefore(I); - } -} - -std::pair -Vectorizer::getBoundaryInstrs(ArrayRef Chain) { - Instruction *C0 = Chain[0]; - BasicBlock::iterator FirstInstr = C0->getIterator(); - BasicBlock::iterator LastInstr = C0->getIterator(); - BasicBlock *BB = C0->getParent(); - unsigned NumFound = 0; - for (Instruction &I : *BB) { - if (!is_contained(Chain, &I)) + if ((LI && !LI->isSimple()) || (SI && !SI->isSimple())) continue; - ++NumFound; - if (NumFound == 1) { - FirstInstr = I.getIterator(); - } - if (NumFound == Chain.size()) { - LastInstr = I.getIterator(); - break; - } - } - - // Range is [first, last). - return std::make_pair(FirstInstr, ++LastInstr); -} - -void Vectorizer::eraseInstructions(ArrayRef Chain) { - SmallVector Instrs; - for (Instruction *I : Chain) { - Value *PtrOperand = getLoadStorePointerOperand(I); - assert(PtrOperand && "Instruction must have a pointer operand."); - Instrs.push_back(I); - if (GetElementPtrInst *GEP = dyn_cast(PtrOperand)) - Instrs.push_back(GEP); - } - - // Erase instructions. - for (Instruction *I : Instrs) - if (I->use_empty()) - I->eraseFromParent(); -} - -std::pair, ArrayRef> -Vectorizer::splitOddVectorElts(ArrayRef Chain, - unsigned ElementSizeBits) { - unsigned ElementSizeBytes = ElementSizeBits / 8; - unsigned SizeBytes = ElementSizeBytes * Chain.size(); - unsigned LeftBytes = (SizeBytes - (SizeBytes % 4)); - // If we're already a multiple of 4 bytes or the whole chain is shorter than 4 - // bytes, then try splitting down on power-of-2 boundary. - if (LeftBytes == SizeBytes || LeftBytes == 0) - LeftBytes = PowerOf2Ceil(SizeBytes) / 2; - unsigned NumLeft = LeftBytes / ElementSizeBytes; - if (NumLeft == 0) - NumLeft = 1; - LLVM_DEBUG(dbgs() << "LSV: Splitting the chain into " << NumLeft << "+" - << Chain.size() - NumLeft << " elements\n"); - return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft)); -} - -ArrayRef -Vectorizer::getVectorizablePrefix(ArrayRef Chain) { - // These are in BB order, unlike Chain, which is in address order. - SmallVector MemoryInstrs; - SmallVector ChainInstrs; - - bool IsLoadChain = isa(Chain[0]); - LLVM_DEBUG({ - for (Instruction *I : Chain) { - if (IsLoadChain) - assert(isa(I) && - "All elements of Chain must be loads, or all must be stores."); - else - assert(isa(I) && - "All elements of Chain must be loads, or all must be stores."); - } - }); - - for (Instruction &I : make_range(getBoundaryInstrs(Chain))) { - if ((isa(I) || isa(I)) && is_contained(Chain, &I)) { - ChainInstrs.push_back(&I); + if ((LI && !TTI.isLegalToVectorizeLoad(LI)) || + (SI && !TTI.isLegalToVectorizeStore(SI))) continue; - } - if (!isGuaranteedToTransferExecutionToSuccessor(&I)) { - LLVM_DEBUG(dbgs() << "LSV: Found instruction may not transfer execution: " - << I << '\n'); - break; - } - if (I.mayReadOrWriteMemory()) - MemoryInstrs.push_back(&I); - } - - // Loop until we find an instruction in ChainInstrs that we can't vectorize. 
- unsigned ChainInstrIdx = 0; - Instruction *BarrierMemoryInstr = nullptr; - - for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) { - Instruction *ChainInstr = ChainInstrs[ChainInstrIdx]; - - // If a barrier memory instruction was found, chain instructions that follow - // will not be added to the valid prefix. - if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(ChainInstr)) - break; - - // Check (in BB order) if any instruction prevents ChainInstr from being - // vectorized. Find and store the first such "conflicting" instruction. - for (Instruction *MemInstr : MemoryInstrs) { - // If a barrier memory instruction was found, do not check past it. - if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(MemInstr)) - break; - auto *MemLoad = dyn_cast(MemInstr); - auto *ChainLoad = dyn_cast(ChainInstr); - if (MemLoad && ChainLoad) - continue; - - // We can ignore the alias if the we have a load store pair and the load - // is known to be invariant. The load cannot be clobbered by the store. - auto IsInvariantLoad = [](const LoadInst *LI) -> bool { - return LI->hasMetadata(LLVMContext::MD_invariant_load); - }; - - if (IsLoadChain) { - // We can ignore the alias as long as the load comes before the store, - // because that means we won't be moving the load past the store to - // vectorize it (the vectorized load is inserted at the location of the - // first load in the chain). - if (ChainInstr->comesBefore(MemInstr) || - (ChainLoad && IsInvariantLoad(ChainLoad))) - continue; - } else { - // Same case, but in reverse. - if (MemInstr->comesBefore(ChainInstr) || - (MemLoad && IsInvariantLoad(MemLoad))) - continue; - } - - ModRefInfo MR = - AA.getModRefInfo(MemInstr, MemoryLocation::get(ChainInstr)); - if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) { - LLVM_DEBUG({ - dbgs() << "LSV: Found alias:\n" - " Aliasing instruction:\n" - << " " << *MemInstr << '\n' - << " Aliased instruction and pointer:\n" - << " " << *ChainInstr << '\n' - << " " << *getLoadStorePointerOperand(ChainInstr) << '\n'; - }); - // Save this aliasing memory instruction as a barrier, but allow other - // instructions that precede the barrier to be vectorized with this one. - BarrierMemoryInstr = MemInstr; - break; - } - } - // Continue the search only for store chains, since vectorizing stores that - // precede an aliasing load is valid. Conversely, vectorizing loads is valid - // up to an aliasing store, but should not pull loads from further down in - // the basic block. - if (IsLoadChain && BarrierMemoryInstr) { - // The BarrierMemoryInstr is a store that precedes ChainInstr. - assert(BarrierMemoryInstr->comesBefore(ChainInstr)); - break; - } - } - - // Find the largest prefix of Chain whose elements are all in - // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of - // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB - // order.) 
- SmallPtrSet VectorizableChainInstrs( - ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx); - unsigned ChainIdx = 0; - for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) { - if (!VectorizableChainInstrs.count(Chain[ChainIdx])) - break; - } - return Chain.slice(0, ChainIdx); -} - -static ChainID getChainID(const Value *Ptr) { - const Value *ObjPtr = getUnderlyingObject(Ptr); - if (const auto *Sel = dyn_cast(ObjPtr)) { - // The select's themselves are distinct instructions even if they share the - // same condition and evaluate to consecutive pointers for true and false - // values of the condition. Therefore using the select's themselves for - // grouping instructions would put consecutive accesses into different lists - // and they won't be even checked for being consecutive, and won't be - // vectorized. - return Sel->getCondition(); - } - return ObjPtr; -} - -std::pair -Vectorizer::collectInstructions(BasicBlock *BB) { - InstrListMap LoadRefs; - InstrListMap StoreRefs; - - for (Instruction &I : *BB) { - if (!I.mayReadOrWriteMemory()) + Type *Ty = getLoadStoreType(&I); + if (!VectorType::isValidElementType(Ty->getScalarType())) continue; - if (LoadInst *LI = dyn_cast(&I)) { - if (!LI->isSimple()) - continue; - - // Skip if it's not legal. - if (!TTI.isLegalToVectorizeLoad(LI)) - continue; - - Type *Ty = LI->getType(); - if (!VectorType::isValidElementType(Ty->getScalarType())) - continue; - - // Skip weird non-byte sizes. They probably aren't worth the effort of - // handling correctly. - unsigned TySize = DL.getTypeSizeInBits(Ty); - if ((TySize % 8) != 0) - continue; - - // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain - // functions are currently using an integer type for the vectorized - // load/store, and does not support casting between the integer type and a - // vector of pointers (e.g. i64 to <2 x i16*>) - if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy()) - continue; - - Value *Ptr = LI->getPointerOperand(); - unsigned AS = Ptr->getType()->getPointerAddressSpace(); - unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); - - unsigned VF = VecRegSize / TySize; - VectorType *VecTy = dyn_cast(Ty); - - // No point in looking at these if they're too big to vectorize. - if (TySize > VecRegSize / 2 || - (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) - continue; - - // Save the load locations. - const ChainID ID = getChainID(Ptr); - LoadRefs[ID].push_back(LI); - } else if (StoreInst *SI = dyn_cast(&I)) { - if (!SI->isSimple()) - continue; - - // Skip if it's not legal. - if (!TTI.isLegalToVectorizeStore(SI)) - continue; - - Type *Ty = SI->getValueOperand()->getType(); - if (!VectorType::isValidElementType(Ty->getScalarType())) - continue; - - // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain - // functions are currently using an integer type for the vectorized - // load/store, and does not support casting between the integer type and a - // vector of pointers (e.g. i64 to <2 x i16*>) - if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy()) - continue; - - // Skip weird non-byte sizes. They probably aren't worth the effort of - // handling correctly. 
- unsigned TySize = DL.getTypeSizeInBits(Ty); - if ((TySize % 8) != 0) - continue; - - Value *Ptr = SI->getPointerOperand(); - unsigned AS = Ptr->getType()->getPointerAddressSpace(); - unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); - - unsigned VF = VecRegSize / TySize; - VectorType *VecTy = dyn_cast(Ty); - - // No point in looking at these if they're too big to vectorize. - if (TySize > VecRegSize / 2 || - (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) - continue; - - // Save store location. - const ChainID ID = getChainID(Ptr); - StoreRefs[ID].push_back(SI); - } - } - - return {LoadRefs, StoreRefs}; -} - -bool Vectorizer::vectorizeChains(InstrListMap &Map) { - bool Changed = false; - - for (const std::pair &Chain : Map) { - unsigned Size = Chain.second.size(); - if (Size < 2) + // Skip weird non-byte sizes. They probably aren't worth the effort of + // handling correctly. + unsigned TySize = DL.getTypeSizeInBits(Ty); + if ((TySize % 8) != 0) continue; - LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n"); - - // Process the stores in chunks of 64. - for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) { - unsigned Len = std::min(CE - CI, 64); - ArrayRef Chunk(&Chain.second[CI], Len); - Changed |= vectorizeInstructions(Chunk); - } - } - - return Changed; -} - -bool Vectorizer::vectorizeInstructions(ArrayRef Instrs) { - LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size() - << " instructions.\n"); - SmallVector Heads, Tails; - int ConsecutiveChain[64]; - - // Do a quadratic search on all of the given loads/stores and find all of the - // pairs of loads/stores that follow each other. - for (int i = 0, e = Instrs.size(); i < e; ++i) { - ConsecutiveChain[i] = -1; - for (int j = e - 1; j >= 0; --j) { - if (i == j) - continue; - - if (isConsecutiveAccess(Instrs[i], Instrs[j])) { - if (ConsecutiveChain[i] != -1) { - int CurDistance = std::abs(ConsecutiveChain[i] - i); - int NewDistance = std::abs(ConsecutiveChain[i] - j); - if (j < i || NewDistance > CurDistance) - continue; // Should not insert. - } + // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain + // functions are currently using an integer type for the vectorized + // load/store, and does not support casting between the integer type and a + // vector of pointers (e.g. i64 to <2 x i16*>) + if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy()) + continue; - Tails.push_back(j); - Heads.push_back(i); - ConsecutiveChain[i] = j; - } - } - } + Value *Ptr = getLoadStorePointerOperand(&I); + unsigned AS = Ptr->getType()->getPointerAddressSpace(); + unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); - bool Changed = false; - SmallPtrSet InstructionsProcessed; + unsigned VF = VecRegSize / TySize; + VectorType *VecTy = dyn_cast(Ty); - for (int Head : Heads) { - if (InstructionsProcessed.count(Instrs[Head])) - continue; - bool LongerChainExists = false; - for (unsigned TIt = 0; TIt < Tails.size(); TIt++) - if (Head == Tails[TIt] && - !InstructionsProcessed.count(Instrs[Heads[TIt]])) { - LongerChainExists = true; - break; - } - if (LongerChainExists) + // Only handle power-of-two sized elements. + if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) || + (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType())))) continue; - // We found an instr that starts a chain. Now follow the chain and try to - // vectorize it. 
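(Illustrative sketch, not part of the patch.) The filtering checks above feed the equivalence-class map filled in the next hunk, which buckets accesses by a tuple key (underlying object, address space, element width, load vs. store) so that only accesses agreeing on all four components are ever considered for the same chain. A minimal standalone version of that bucketing step might look like the following; ToyAccessDesc, ToyKey, and groupAccesses are hypothetical names invented here, and the flag is kept as char so the tuple works as a DenseMap key.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Value.h"
#include <tuple>

// One access, already reduced to the properties the pass keys on.
struct ToyAccessDesc {
  const llvm::Value *UnderlyingObj; // result of getUnderlyingObject()
  unsigned AddrSpace;
  unsigned ElementBits;
  bool IsLoad;
};

// Same shape of key as the pass uses; char rather than bool for the last
// component so the tuple has a DenseMapInfo.
using ToyKey = std::tuple<const llvm::Value *, unsigned, unsigned, char>;

llvm::MapVector<ToyKey, llvm::SmallVector<ToyAccessDesc, 8>>
groupAccesses(llvm::ArrayRef<ToyAccessDesc> Accesses) {
  llvm::MapVector<ToyKey, llvm::SmallVector<ToyAccessDesc, 8>> Buckets;
  for (const ToyAccessDesc &A : Accesses)
    // Accesses that differ in any key component never share a bucket, so
    // they are never even compared when chains are built later.
    Buckets[{A.UnderlyingObj, A.AddrSpace, A.ElementBits,
             static_cast<char>(A.IsLoad)}]
        .push_back(A);
  return Buckets;
}

An insertion-ordered MapVector (rather than a plain DenseMap) keeps the downstream per-class processing deterministic across runs.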
- SmallVector Operands; - int I = Head; - while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) { - if (InstructionsProcessed.count(Instrs[I])) - break; - - Operands.push_back(Instrs[I]); - I = ConsecutiveChain[I]; - } - - bool Vectorized = false; - if (isa(*Operands.begin())) - Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed); - else - Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed); + // No point in looking at these if they're too big to vectorize. + if (TySize > VecRegSize / 2 || + (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) + continue; - Changed |= Vectorized; + Ret[{getUnderlyingObject(Ptr), AS, + DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()), + /*IsLoad=*/LI != nullptr}] + .push_back(&I); } - return Changed; + return Ret; } -bool Vectorizer::vectorizeStoreChain( - ArrayRef Chain, - SmallPtrSet *InstructionsProcessed) { - StoreInst *S0 = cast(Chain[0]); +std::vector Vectorizer::gatherChains(ArrayRef Instrs) { + if (Instrs.empty()) + return {}; - // If the vector has an int element, default to int for the whole store. - Type *StoreTy = nullptr; - for (Instruction *I : Chain) { - StoreTy = cast(I)->getValueOperand()->getType(); - if (StoreTy->isIntOrIntVectorTy()) - break; - - if (StoreTy->isPtrOrPtrVectorTy()) { - StoreTy = Type::getIntNTy(F.getParent()->getContext(), - DL.getTypeSizeInBits(StoreTy)); - break; - } - } - assert(StoreTy && "Failed to find store type"); - - unsigned Sz = DL.getTypeSizeInBits(StoreTy); - unsigned AS = S0->getPointerAddressSpace(); - unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); - unsigned VF = VecRegSize / Sz; - unsigned ChainSize = Chain.size(); - Align Alignment = S0->getAlign(); - - if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) { - InstructionsProcessed->insert(Chain.begin(), Chain.end()); - return false; - } - - ArrayRef NewChain = getVectorizablePrefix(Chain); - if (NewChain.empty()) { - // No vectorization possible. - InstructionsProcessed->insert(Chain.begin(), Chain.end()); - return false; - } - if (NewChain.size() == 1) { - // Failed after the first instruction. Discard it and try the smaller chain. - InstructionsProcessed->insert(NewChain.front()); - return false; - } + unsigned AS = getLoadStoreAddressSpace(Instrs[0]); + unsigned ASPtrBits = DL.getIndexSizeInBits(AS); - // Update Chain to the valid vectorizable subchain. - Chain = NewChain; - ChainSize = Chain.size(); - - // Check if it's legal to vectorize this chain. If not, split the chain and - // try again. - unsigned EltSzInBytes = Sz / 8; - unsigned SzInBytes = EltSzInBytes * ChainSize; - - FixedVectorType *VecTy; - auto *VecStoreTy = dyn_cast(StoreTy); - if (VecStoreTy) - VecTy = FixedVectorType::get(StoreTy->getScalarType(), - Chain.size() * VecStoreTy->getNumElements()); - else - VecTy = FixedVectorType::get(StoreTy, Chain.size()); - - // If it's more than the max vector size or the target has a better - // vector factor, break it into two pieces. - unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy); - if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) { - LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor." 
- " Creating two separate arrays.\n"); - bool Vectorized = false; - Vectorized |= - vectorizeStoreChain(Chain.slice(0, TargetVF), InstructionsProcessed); - Vectorized |= - vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed); - return Vectorized; +#ifndef NDEBUG + // Check that Instrs is in BB order and all have the same addr space. + for (size_t I = 1; I < Instrs.size(); ++I) { + assert(Instrs[I - 1]->comesBefore(Instrs[I])); + assert(getLoadStoreAddressSpace(Instrs[I]) == AS); } +#endif - LLVM_DEBUG({ - dbgs() << "LSV: Stores to vectorize:\n"; - for (Instruction *I : Chain) - dbgs() << " " << *I << "\n"; - }); - - // We won't try again to vectorize the elements of the chain, regardless of - // whether we succeed below. - InstructionsProcessed->insert(Chain.begin(), Chain.end()); - - // If the store is going to be misaligned, don't vectorize it. - unsigned RelativeSpeed; - if (accessIsMisaligned(SzInBytes, AS, Alignment, RelativeSpeed)) { - if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) { - unsigned SpeedBefore; - accessIsMisaligned(EltSzInBytes, AS, Alignment, SpeedBefore); - if (SpeedBefore > RelativeSpeed) - return false; - - auto Chains = splitOddVectorElts(Chain, Sz); - bool Vectorized = false; - Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed); - Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed); - return Vectorized; + // Machinery to build an MRU-hashtable of Chains. + // + // (Ideally this could be done with MapVector, but as currently implemented, + // moving an element to the front of a MapVector is O(n).) + struct InstrListElem : ilist_node, + std::pair { + explicit InstrListElem(Instruction *I) + : std::pair(I, {}) {} + }; + struct InstrListElemDenseMapInfo { + using PtrInfo = DenseMapInfo; + using IInfo = DenseMapInfo; + static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); } + static InstrListElem *getTombstoneKey() { + return PtrInfo::getTombstoneKey(); } - - Align NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(), - Align(StackAdjustedAlignment), - DL, S0, nullptr, &DT); - if (NewAlign >= Alignment) - Alignment = NewAlign; - else - return false; - } - - if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) { - auto Chains = splitOddVectorElts(Chain, Sz); - bool Vectorized = false; - Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed); - Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed); - return Vectorized; - } - - BasicBlock::iterator First, Last; - std::tie(First, Last) = getBoundaryInstrs(Chain); - Builder.SetInsertPoint(&*Last); - - Value *Vec = PoisonValue::get(VecTy); - - if (VecStoreTy) { - unsigned VecWidth = VecStoreTy->getNumElements(); - for (unsigned I = 0, E = Chain.size(); I != E; ++I) { - StoreInst *Store = cast(Chain[I]); - for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) { - unsigned NewIdx = J + I * VecWidth; - Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(), - Builder.getInt32(J)); - if (Extract->getType() != StoreTy->getScalarType()) - Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType()); - - Value *Insert = - Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx)); - Vec = Insert; - } + static unsigned getHashValue(const InstrListElem *E) { + return IInfo::getHashValue(E->first); } - } else { - for (unsigned I = 0, E = Chain.size(); I != E; ++I) { - StoreInst *Store = cast(Chain[I]); - Value *Extract = Store->getValueOperand(); - if 
(Extract->getType() != StoreTy->getScalarType()) - Extract = - Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType()); - - Value *Insert = - Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I)); - Vec = Insert; + static bool isEqual(const InstrListElem *A, const InstrListElem *B) { + if (A == getEmptyKey() || B == getEmptyKey()) + return A == getEmptyKey() && B == getEmptyKey(); + if (A == getTombstoneKey() || B == getTombstoneKey()) + return A == getTombstoneKey() && B == getTombstoneKey(); + return IInfo::isEqual(A->first, B->first); } - } - - StoreInst *SI = Builder.CreateAlignedStore( - Vec, - Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)), - Alignment); - propagateMetadata(SI, Chain); - - eraseInstructions(Chain); - ++NumVectorInstructions; - NumScalarsVectorized += Chain.size(); - return true; -} - -bool Vectorizer::vectorizeLoadChain( - ArrayRef Chain, - SmallPtrSet *InstructionsProcessed) { - LoadInst *L0 = cast(Chain[0]); - - // If the vector has an int element, default to int for the whole load. - Type *LoadTy = nullptr; - for (const auto &V : Chain) { - LoadTy = cast(V)->getType(); - if (LoadTy->isIntOrIntVectorTy()) - break; - - if (LoadTy->isPtrOrPtrVectorTy()) { - LoadTy = Type::getIntNTy(F.getParent()->getContext(), - DL.getTypeSizeInBits(LoadTy)); - break; + }; + SpecificBumpPtrAllocator Allocator; + simple_ilist MRU; + DenseSet Chains; + + // Compare each instruction in `instrs` to leader of the N most recently-used + // chains. This limits the O(n^2) behavior of this pass while also allowing + // us to build arbitrarily long chains. + for (Instruction *I : Instrs) { + constexpr size_t MaxChainsToTry = 64; + + bool MatchFound = false; + auto ChainIter = MRU.begin(); + for (int J = 0; J < MaxChainsToTry && ChainIter != MRU.end(); + ++J, ++ChainIter) { + std::optional Offset = + getConstantOffset(getLoadStorePointerOperand(ChainIter->first), + getLoadStorePointerOperand(I)); + if (Offset.has_value()) { + // `Offset` might not have the expected number of bits, if e.g. AS has a + // different number of bits than opaque pointers. + ChainIter->second.push_back( + ChainElem{I, Offset.value().sextOrTrunc(ASPtrBits)}); + // Move ChainIter to the front of the MRU list. + MRU.remove(*ChainIter); + MRU.push_front(*ChainIter); + MatchFound = true; + break; + } } - } - assert(LoadTy && "Can't determine LoadInst type from chain"); - unsigned Sz = DL.getTypeSizeInBits(LoadTy); - unsigned AS = L0->getPointerAddressSpace(); - unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); - unsigned VF = VecRegSize / Sz; - unsigned ChainSize = Chain.size(); - Align Alignment = L0->getAlign(); - - if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) { - InstructionsProcessed->insert(Chain.begin(), Chain.end()); - return false; - } - - ArrayRef NewChain = getVectorizablePrefix(Chain); - if (NewChain.empty()) { - // No vectorization possible. - InstructionsProcessed->insert(Chain.begin(), Chain.end()); - return false; - } - if (NewChain.size() == 1) { - // Failed after the first instruction. Discard it and try the smaller chain. - InstructionsProcessed->insert(NewChain.front()); - return false; - } - - // Update Chain to the valid vectorizable subchain. - Chain = NewChain; - ChainSize = Chain.size(); - - // Check if it's legal to vectorize this chain. If not, split the chain and - // try again. 
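(Illustrative sketch, not part of the patch.) The loop above, together with the new-chain branch that follows, is the core of gatherChains(): each access is compared only against the leaders of the most recently used chains, which caps the quadratic search while still letting hot chains grow arbitrarily long. A stripped-down standalone version on toy accesses is shown below; ToyAccess, ToyChain, and gatherToyChains are hypothetical names, and "same base" stands in for getConstantOffset() succeeding.

#include <cstdint>
#include <list>
#include <utility>
#include <vector>

struct ToyAccess { unsigned Base; int64_t Addr; };
struct ToyChain {
  ToyAccess Leader;                        // first access seen for this chain
  std::vector<int64_t> OffsetsFromLeader;  // includes 0 for the leader itself
};

std::vector<ToyChain> gatherToyChains(const std::vector<ToyAccess> &Accesses,
                                      unsigned MaxChainsToTry = 64) {
  std::list<ToyChain> MRU; // front = most recently extended chain
  for (const ToyAccess &A : Accesses) {
    bool Matched = false;
    unsigned Tried = 0;
    for (auto It = MRU.begin(); It != MRU.end() && Tried < MaxChainsToTry;
         ++It, ++Tried) {
      if (It->Leader.Base != A.Base)
        continue; // no constant offset between unrelated bases
      It->OffsetsFromLeader.push_back(A.Addr - It->Leader.Addr);
      MRU.splice(MRU.begin(), MRU, It); // keep hot chains cheap to find
      Matched = true;
      break;
    }
    if (!Matched)
      MRU.push_front(ToyChain{A, {0}}); // start a new chain led by A
  }
  // Singleton chains are useless for vectorization, so drop them.
  std::vector<ToyChain> Out;
  for (ToyChain &C : MRU)
    if (C.OffsetsFromLeader.size() > 1)
      Out.push_back(std::move(C));
  return Out;
}

The MRU move-to-front is what keeps the common case cheap: a long run of consecutive accesses keeps hitting the same chain at the head of the list, so the per-access cost stays far below the MaxChainsToTry worst case.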
- unsigned EltSzInBytes = Sz / 8; - unsigned SzInBytes = EltSzInBytes * ChainSize; - VectorType *VecTy; - auto *VecLoadTy = dyn_cast(LoadTy); - if (VecLoadTy) - VecTy = FixedVectorType::get(LoadTy->getScalarType(), - Chain.size() * VecLoadTy->getNumElements()); - else - VecTy = FixedVectorType::get(LoadTy, Chain.size()); - - // If it's more than the max vector size or the target has a better - // vector factor, break it into two pieces. - unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy); - if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) { - LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor." - " Creating two separate arrays.\n"); - bool Vectorized = false; - Vectorized |= - vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed); - Vectorized |= - vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed); - return Vectorized; - } - - // We won't try again to vectorize the elements of the chain, regardless of - // whether we succeed below. - InstructionsProcessed->insert(Chain.begin(), Chain.end()); - - // If the load is going to be misaligned, don't vectorize it. - unsigned RelativeSpeed; - if (accessIsMisaligned(SzInBytes, AS, Alignment, RelativeSpeed)) { - if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) { - unsigned SpeedBefore; - accessIsMisaligned(EltSzInBytes, AS, Alignment, SpeedBefore); - if (SpeedBefore > RelativeSpeed) - return false; - - auto Chains = splitOddVectorElts(Chain, Sz); - bool Vectorized = false; - Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed); - Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed); - return Vectorized; + if (!MatchFound) { + APInt ZeroOffset(ASPtrBits, 0); + InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I); + E->second.push_back(ChainElem{I, ZeroOffset}); + MRU.push_front(*E); + Chains.insert(E); } - - Align NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(), - Align(StackAdjustedAlignment), - DL, L0, nullptr, &DT); - if (NewAlign >= Alignment) - Alignment = NewAlign; - else - return false; } - if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) { - auto Chains = splitOddVectorElts(Chain, Sz); - bool Vectorized = false; - Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed); - Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed); - return Vectorized; - } - - LLVM_DEBUG({ - dbgs() << "LSV: Loads to vectorize:\n"; - for (Instruction *I : Chain) - I->dump(); - }); + std::vector Ret; + Ret.reserve(Chains.size()); + // Iterate over MRU rather than Chains so the order is deterministic. + for (auto &E : MRU) + if (E.second.size() > 1) + Ret.push_back(std::move(E.second)); + return Ret; +} - // getVectorizablePrefix already computed getBoundaryInstrs. The value of - // Last may have changed since then, but the value of First won't have. If it - // matters, we could compute getBoundaryInstrs only once and reuse it here. - BasicBlock::iterator First, Last; - std::tie(First, Last) = getBoundaryInstrs(Chain); - Builder.SetInsertPoint(&*First); - - Value *Bitcast = - Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS)); - LoadInst *LI = - Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment)); - propagateMetadata(LI, Chain); - - for (unsigned I = 0, E = Chain.size(); I != E; ++I) { - Value *CV = Chain[I]; - Value *V; - if (VecLoadTy) { - // Extract a subvector using shufflevector. 
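(Illustrative sketch, not part of the patch.) gatherChains() above leans entirely on getConstantOffset(), whose full implementation appears further down in the patch: it first folds away in-bounds constant GEP offsets from both pointers and, if the stripped pointers still differ, asks ScalarEvolution whether their difference is a single compile-time constant. The trimmed standalone version below shows only that core idea; it skips the bit-width fix-ups and the recursive complex-address fallback the patch performs, assumes both pointers live in the same address space, and toyConstantPtrDiff is a name invented here.

#include "llvm/ADT/APInt.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Value.h"
#include <optional>

using namespace llvm;

// Returns the constant byte offset PtrB - PtrA if it can be proven, else
// std::nullopt.
static std::optional<APInt> toyConstantPtrDiff(Value *PtrA, Value *PtrB,
                                               ScalarEvolution &SE,
                                               const DataLayout &DL) {
  unsigned IdxWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
  APInt OffA(IdxWidth, 0), OffB(IdxWidth, 0);
  // Peel constant in-bounds GEPs off both pointers, accumulating the bytes.
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffA);
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffB);
  if (PtrA == PtrB) // Same base: the accumulated constants decide it.
    return OffB - OffA;

  // Different bases: ask SCEV whether (PtrB - PtrA) folds to one constant.
  const SCEV *Dist = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
  if (Dist == SE.getCouldNotCompute())
    return std::nullopt;
  ConstantRange DistRange = SE.getSignedRange(Dist);
  if (const APInt *D = DistRange.getSingleElement())
    return OffB - OffA + D->sextOrTrunc(IdxWidth);
  return std::nullopt;
}

Keeping the cheap constant-stripping path before the SCEV query matters for compile time, since this routine runs once per (new access, candidate chain leader) pair.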
- unsigned VecWidth = VecLoadTy->getNumElements(); - auto Mask = - llvm::to_vector<8>(llvm::seq(I * VecWidth, (I + 1) * VecWidth)); - V = Builder.CreateShuffleVector(LI, Mask, CV->getName()); - } else { - V = Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName()); - } +std::optional Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB, + unsigned Depth) { + LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA + << ", PtrB=" << *PtrB << ", Depth=" << Depth << "\n"); + unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType()); + APInt OffsetA(OffsetBitWidth, 0); + APInt OffsetB(OffsetBitWidth, 0); + PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); + PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); + unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType()); + if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType())) + return std::nullopt; - if (V->getType() != CV->getType()) { - V = Builder.CreateBitOrPointerCast(V, CV->getType()); - } + // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets + // should properly handle a possible overflow and the value should fit into + // the smallest data type used in the cast/gep chain. + assert(OffsetA.getSignificantBits() <= NewPtrBitWidth && + OffsetB.getSignificantBits() <= NewPtrBitWidth); - // Replace the old instruction. - CV->replaceAllUsesWith(V); + OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth); + OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth); + if (PtrA == PtrB) + return OffsetB - OffsetA; + + // Try to compute B - A. + const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA)); + if (DistScev != SE.getCouldNotCompute()) { + LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n"); + ConstantRange DistRange = SE.getSignedRange(DistScev); + if (DistRange.isSingleElement()) + return OffsetB - OffsetA + *DistRange.getSingleElement(); } - - // Since we might have opaque pointers we might end up using the pointer - // operand of the first load (wrt. memory loaded) for the vector load. Since - // this first load might not be the first in the block we potentially need to - // reorder the pointer operand (and its operands). If we have a bitcast though - // it might be before the load and should be the reorder start instruction. - // "Might" because for opaque pointers the "bitcast" is just the first loads - // pointer operand, as oppposed to something we inserted at the right position - // ourselves. - Instruction *BCInst = dyn_cast(Bitcast); - reorder((BCInst && BCInst != L0->getPointerOperand()) ? BCInst : LI); - - eraseInstructions(Chain); - - ++NumVectorInstructions; - NumScalarsVectorized += Chain.size(); - return true; -} - -bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace, - Align Alignment, unsigned &RelativeSpeed) { - RelativeSpeed = 0; - if (Alignment.value() % SzInBytes == 0) - return false; - - bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(), - SzInBytes * 8, AddressSpace, - Alignment, &RelativeSpeed); - LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? 
" << Allows - << " with relative speed = " << RelativeSpeed << '\n';); - return !Allows || !RelativeSpeed; + std::optional Diff = gtConstantOffsetComplexAddrs(PtrA, PtrB, Depth); + if (Diff.has_value()) + return OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth()); + return std::nullopt; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index ac06fa465b54..6c7e4fe3f01c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -1277,26 +1277,26 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) { ; GFX8-LABEL: sdivrem_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_ashr_i32 s2, s9, 31 -; GFX8-NEXT: s_ashr_i32 s16, s13, 31 -; GFX8-NEXT: s_add_u32 s0, s8, s2 -; GFX8-NEXT: s_addc_u32 s1, s9, s2 -; GFX8-NEXT: s_add_u32 s6, s12, s16 +; GFX8-NEXT: s_ashr_i32 s4, s13, 31 +; GFX8-NEXT: s_ashr_i32 s16, s1, 31 +; GFX8-NEXT: s_add_u32 s12, s12, s4 +; GFX8-NEXT: s_addc_u32 s13, s13, s4 +; GFX8-NEXT: s_add_u32 s0, s0, s16 ; GFX8-NEXT: s_mov_b32 s17, s16 -; GFX8-NEXT: s_addc_u32 s7, s13, s16 -; GFX8-NEXT: s_xor_b64 s[8:9], s[6:7], s[16:17] -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s8 -; GFX8-NEXT: s_mov_b32 s3, s2 -; GFX8-NEXT: s_xor_b64 s[12:13], s[0:1], s[2:3] +; GFX8-NEXT: s_addc_u32 s1, s1, s16 +; GFX8-NEXT: s_xor_b64 s[6:7], s[0:1], s[16:17] +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX8-NEXT: s_mov_b32 s5, s4 +; GFX8-NEXT: s_xor_b64 s[12:13], s[12:13], s[4:5] ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_sub_u32 s6, 0, s8 -; GFX8-NEXT: s_subb_u32 s7, 0, s9 -; GFX8-NEXT: s_xor_b64 s[18:19], s[2:3], s[16:17] +; GFX8-NEXT: s_sub_u32 s18, 0, s6 +; GFX8-NEXT: s_subb_u32 s19, 0, s7 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 @@ -1304,12 +1304,10 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: s_ashr_i32 s16, s15, 31 -; GFX8-NEXT: s_mov_b32 s17, s16 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v4, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v3, 0 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v4, v[1:2] ; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v3, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v3, v[1:2] ; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 ; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 @@ -1332,14 +1330,16 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v4, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v3, 0 +; GFX8-NEXT: 
v_mad_u64_u32 v[1:2], s[0:1], s18, v4, v[1:2] ; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v3, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v3, v[1:2] ; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 ; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX8-NEXT: s_xor_b64 s[18:19], s[4:5], s[16:17] +; GFX8-NEXT: s_ashr_i32 s16, s3, 31 +; GFX8-NEXT: s_mov_b32 s17, s16 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 @@ -1377,46 +1377,46 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v4, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2] ; GFX8-NEXT: v_mov_b32_e32 v6, s13 ; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s12, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NEXT: s_ashr_i32 s12, s11, 31 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v4, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_ashr_i32 s12, s15, 31 ; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v1, vcc ; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s13, v1 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v7 ; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s8, v7 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v6 +; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s6, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] ; GFX8-NEXT: v_subbrev_u32_e64 v9, s[0:1], 0, v0, vcc ; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], 1, v4 ; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v9 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v9 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v1 ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] -; GFX8-NEXT: s_add_u32 s0, s10, s12 -; GFX8-NEXT: s_addc_u32 s1, s11, s12 -; GFX8-NEXT: s_add_u32 s10, s14, s16 -; GFX8-NEXT: s_addc_u32 s11, s15, s16 -; GFX8-NEXT: s_xor_b64 s[10:11], s[10:11], s[16:17] -; GFX8-NEXT: v_cvt_f32_u32_e32 v14, s11 +; GFX8-NEXT: s_add_u32 s0, s14, s12 +; GFX8-NEXT: s_addc_u32 s1, s15, s12 +; GFX8-NEXT: s_add_u32 s2, s2, s16 +; GFX8-NEXT: s_addc_u32 s3, s3, s16 +; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[16:17] +; GFX8-NEXT: v_cvt_f32_u32_e32 v14, s3 ; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s10 -; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v8 +; GFX8-NEXT: v_cvt_f32_u32_e32 
v5, s2 +; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s6, v8 ; GFX8-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v0, vcc ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v14 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v5 @@ -1431,15 +1431,15 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v13, v0 ; GFX8-NEXT: s_mov_b32 s13, s12 -; GFX8-NEXT: s_xor_b64 s[8:9], s[0:1], s[12:13] -; GFX8-NEXT: s_sub_u32 s3, 0, s10 +; GFX8-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] +; GFX8-NEXT: s_sub_u32 s5, 0, s2 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v13, 0 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v13, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v12 -; GFX8-NEXT: s_subb_u32 s20, 0, s11 +; GFX8-NEXT: s_subb_u32 s20, 0, s3 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v3, v10, vcc -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v5, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v5, v[1:2] ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v8, v15, s[0:1] ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[14:15], s20, v13, v[1:2] @@ -1468,22 +1468,22 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v13, v0 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v8, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v8, 0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc ; GFX8-NEXT: v_xor_b32_e32 v1, s18, v4 ; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v5, v[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v5, v[0:1] ; GFX8-NEXT: v_xor_b32_e32 v9, s19, v10 ; GFX8-NEXT: v_mov_b32_e32 v10, s19 ; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v8, v[3:4] ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s18, v1 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v9, v10, vcc -; GFX8-NEXT: v_xor_b32_e32 v4, s2, v7 +; GFX8-NEXT: v_xor_b32_e32 v4, s4, v7 ; GFX8-NEXT: v_mul_lo_u32 v7, v5, v2 ; GFX8-NEXT: v_mul_lo_u32 v9, v8, v3 ; GFX8-NEXT: v_mul_hi_u32 v11, v8, v2 ; GFX8-NEXT: v_mul_hi_u32 v2, v5, v2 -; GFX8-NEXT: v_xor_b32_e32 v6, s2, v6 +; GFX8-NEXT: v_xor_b32_e32 v6, s4, v6 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v11 @@ -1503,56 +1503,56 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GFX8-NEXT: v_mov_b32_e32 v10, s2 -; GFX8-NEXT: v_mul_lo_u32 v7, s9, v2 -; GFX8-NEXT: v_mul_lo_u32 v8, s8, v3 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s2, v4 +; GFX8-NEXT: v_mov_b32_e32 v10, s4 +; GFX8-NEXT: v_mul_lo_u32 v7, s7, v2 +; GFX8-NEXT: v_mul_lo_u32 v8, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v10, vcc -; GFX8-NEXT: v_mul_hi_u32 v6, s8, v2 +; GFX8-NEXT: v_mul_hi_u32 v6, s6, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s9, v3 -; GFX8-NEXT: v_mul_hi_u32 v2, s9, v2 +; GFX8-NEXT: v_mul_lo_u32 v7, s7, v3 +; GFX8-NEXT: v_mul_hi_u32 v2, s7, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 -; GFX8-NEXT: v_mul_hi_u32 v8, s8, v3 
+; GFX8-NEXT: v_mul_hi_u32 v8, s6, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v2, v6 -; GFX8-NEXT: v_mul_hi_u32 v9, s9, v3 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v8, 0 +; GFX8-NEXT: v_mul_hi_u32 v9, s7, v3 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v8, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s10, v9, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v10, s9 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v2 -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s11, v8, v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v9, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v10, s7 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v8, v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v10, v6, vcc -; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s9, v6 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v7 +; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s7, v6 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v7 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7 ; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s10, v2 +; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s2, v2 ; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v6, vcc ; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v8 ; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v9, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v12 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v11 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v12 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s10, v11 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s2, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v13 ; GFX8-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc @@ -1578,38 +1578,37 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mov_b32_e32 v8, s12 ; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s12, v6 ; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v8, vcc -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, s5 -; GFX8-NEXT: v_mov_b32_e32 v8, s4 +; GFX8-NEXT: v_mov_b32_e32 v8, s8 +; GFX8-NEXT: v_mov_b32_e32 v9, s9 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NEXT: v_mov_b32_e32 v1, s11 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sdivrem_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s9, 31 -; GFX9-NEXT: s_ashr_i32 s16, s13, 31 -; GFX9-NEXT: s_add_u32 s0, s8, s2 -; GFX9-NEXT: s_addc_u32 s1, s9, s2 -; GFX9-NEXT: s_add_u32 s6, s12, s16 +; GFX9-NEXT: s_ashr_i32 s4, s13, 31 +; GFX9-NEXT: s_ashr_i32 s16, s1, 31 +; GFX9-NEXT: s_add_u32 s12, s12, s4 +; GFX9-NEXT: s_addc_u32 s13, s13, s4 +; GFX9-NEXT: s_add_u32 s0, s0, s16 ; GFX9-NEXT: s_mov_b32 s17, s16 -; GFX9-NEXT: s_addc_u32 s7, s13, s16 -; GFX9-NEXT: s_xor_b64 s[8:9], s[6:7], s[16:17] -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s8 -; GFX9-NEXT: s_mov_b32 s3, s2 -; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[2:3] +; GFX9-NEXT: s_addc_u32 s1, s1, s16 +; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[16:17] +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_xor_b64 s[12:13], s[12:13], s[4:5] ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_u32 s6, 0, s8 -; GFX9-NEXT: s_subb_u32 s7, 0, s9 -; GFX9-NEXT: s_xor_b64 s[18:19], s[2:3], s[16:17] +; GFX9-NEXT: s_sub_u32 s18, 0, s6 +; GFX9-NEXT: s_subb_u32 s19, 0, s7 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 @@ -1617,12 +1616,10 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: s_ashr_i32 s16, s15, 31 -; GFX9-NEXT: s_mov_b32 s17, s16 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v4, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v3, 0 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v3, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v3, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 ; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 @@ -1644,15 +1641,17 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, s9 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v4, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, s7 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v3, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v3, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX9-NEXT: s_xor_b64 s[18:19], s[4:5], s[16:17] +; GFX9-NEXT: s_ashr_i32 s16, s3, 31 +; GFX9-NEXT: s_mov_b32 s17, s16 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 @@ -1688,47 +1687,47 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v5, 0 +; 
GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v5, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_add3_u32 v4, v3, v0, v6 ; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v4, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s6, v4, v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v6, s13 ; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, s12, v1 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s9, v5, v[2:3] -; GFX9-NEXT: s_ashr_i32 s12, s11, 31 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s7, v5, v[2:3] +; GFX9-NEXT: s_ashr_i32 s12, s15, 31 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v2, vcc ; GFX9-NEXT: v_sub_u32_e32 v1, s13, v2 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 -; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s8, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v6 +; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s6, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[0:1] ; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v5 ; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v4, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v2 ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] -; GFX9-NEXT: s_add_u32 s0, s10, s12 -; GFX9-NEXT: s_addc_u32 s1, s11, s12 -; GFX9-NEXT: s_add_u32 s10, s14, s16 -; GFX9-NEXT: s_addc_u32 s11, s15, s16 -; GFX9-NEXT: s_xor_b64 s[10:11], s[10:11], s[16:17] -; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s11 +; GFX9-NEXT: s_add_u32 s0, s14, s12 +; GFX9-NEXT: s_addc_u32 s1, s15, s12 +; GFX9-NEXT: s_add_u32 s2, s2, s16 +; GFX9-NEXT: s_addc_u32 s3, s3, s16 +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[16:17] +; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s3 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s10 -; GFX9-NEXT: v_subrev_co_u32_e32 v16, vcc, s8, v9 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2 +; GFX9-NEXT: v_subrev_co_u32_e32 v16, vcc, s6, v9 ; GFX9-NEXT: v_subbrev_co_u32_e32 v17, vcc, 0, v1, vcc ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v15 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v7 @@ -1743,14 +1742,14 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v1 ; GFX9-NEXT: s_mov_b32 s13, s12 -; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[12:13] -; GFX9-NEXT: s_sub_u32 s3, 0, s10 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v14, 0 +; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] +; GFX9-NEXT: s_sub_u32 s5, 0, s2 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v14, 0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v13 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-NEXT: s_subb_u32 s14, 0, s11 +; GFX9-NEXT: s_subb_u32 s14, 0, s3 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, 
vcc -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v13, v[2:3] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v13, v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v11, vcc ; GFX9-NEXT: v_mul_hi_u32 v11, v14, v1 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v14, v[2:3] @@ -1778,23 +1777,23 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add3_u32 v2, v4, v3, v2 ; GFX9-NEXT: v_add_co_u32_e64 v11, s[0:1], v14, v1 ; GFX9-NEXT: v_addc_co_u32_e64 v12, s[0:1], v13, v2, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v11, 0 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v11, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; GFX9-NEXT: v_xor_b32_e32 v9, s18, v5 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v12, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v12, v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc ; GFX9-NEXT: v_xor_b32_e32 v7, s19, v7 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s14, v11, v[1:2] ; GFX9-NEXT: v_mov_b32_e32 v10, s19 ; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s18, v9 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v7, v10, vcc -; GFX9-NEXT: v_xor_b32_e32 v5, s2, v8 +; GFX9-NEXT: v_xor_b32_e32 v5, s4, v8 ; GFX9-NEXT: v_mul_lo_u32 v7, v12, v3 ; GFX9-NEXT: v_mul_lo_u32 v8, v11, v4 ; GFX9-NEXT: v_mul_hi_u32 v9, v11, v3 ; GFX9-NEXT: v_mul_hi_u32 v3, v12, v3 -; GFX9-NEXT: v_xor_b32_e32 v6, s2, v6 +; GFX9-NEXT: v_xor_b32_e32 v6, s4, v6 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 @@ -1813,55 +1812,55 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add3_u32 v4, v8, v7, v4 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v11, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v12, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, s9, v3 -; GFX9-NEXT: v_mul_lo_u32 v8, s8, v4 -; GFX9-NEXT: v_mul_hi_u32 v10, s8, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, s9, v3 -; GFX9-NEXT: v_mul_hi_u32 v12, s9, v4 +; GFX9-NEXT: v_mul_lo_u32 v7, s7, v3 +; GFX9-NEXT: v_mul_lo_u32 v8, s6, v4 +; GFX9-NEXT: v_mul_hi_u32 v10, s6, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 +; GFX9-NEXT: v_mul_hi_u32 v12, s7, v4 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, s9, v4 +; GFX9-NEXT: v_mul_lo_u32 v10, s7, v4 ; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 -; GFX9-NEXT: v_mul_hi_u32 v8, s8, v4 -; GFX9-NEXT: v_mov_b32_e32 v9, s2 +; GFX9-NEXT: v_mul_hi_u32 v8, s6, v4 +; GFX9-NEXT: v_mov_b32_e32 v9, s4 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v10, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v3, v7 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s10, v11, 0 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v11, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s2, v5 +; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s4, v5 ; GFX9-NEXT: v_add_u32_e32 v8, v10, v8 ; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v9, vcc ; GFX9-NEXT: v_add3_u32 v9, v8, v7, v12 -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s10, v9, v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v10, s9 -; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s8, v3 -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s11, v11, v[7:8] -; GFX9-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-NEXT: 
v_mad_u64_u32 v[7:8], s[0:1], s2, v9, v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v10, s7 +; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s3, v11, v[7:8] +; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v10, v7, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v8 -; GFX9-NEXT: v_sub_u32_e32 v7, s9, v7 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v8 +; GFX9-NEXT: v_sub_u32_e32 v7, s7, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v3 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v8 ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v12, vcc, s10, v3 +; GFX9-NEXT: v_subrev_co_u32_e32 v12, vcc, s2, v3 ; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v11 ; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v9, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v13 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v13 ; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v12 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v12 ; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v7, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v13 -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s10, v12 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v13 +; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s2, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v17, s[0:1], 1, v14 ; GFX9-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v4, vcc @@ -1887,45 +1886,46 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mov_b32_e32 v9, s12 ; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s12, v7 ; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v8, v9, vcc -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[4:5] -; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[6:7] +; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[8:9] +; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_v2i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s2, s9, 31 -; GFX10-NEXT: s_ashr_i32 s6, s13, 31 -; GFX10-NEXT: s_add_u32 s0, s8, s2 -; GFX10-NEXT: s_addc_u32 s1, s9, s2 -; GFX10-NEXT: s_add_u32 s8, s12, s6 -; GFX10-NEXT: s_mov_b32 s7, s6 -; GFX10-NEXT: s_addc_u32 s9, s13, s6 -; GFX10-NEXT: s_mov_b32 s3, s2 -; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[6:7] -; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX10-NEXT: s_sub_u32 s20, 0, s8 -; GFX10-NEXT: s_subb_u32 s21, 0, s9 -; GFX10-NEXT: s_ashr_i32 s12, s11, 31 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX10-NEXT: s_xor_b64 s[18:19], s[2:3], s[6:7] -; GFX10-NEXT: s_ashr_i32 s16, s15, 31 +; GFX10-NEXT: s_ashr_i32 s16, s1, 31 +; GFX10-NEXT: s_ashr_i32 s4, s13, 31 +; GFX10-NEXT: s_mov_b32 s17, s16 +; GFX10-NEXT: s_add_u32 s12, s12, s4 +; GFX10-NEXT: s_addc_u32 s13, s13, s4 +; GFX10-NEXT: s_add_u32 s0, s0, s16 +; GFX10-NEXT: s_addc_u32 s1, s1, s16 +; GFX10-NEXT: s_mov_b32 s5, s4 +; 
GFX10-NEXT: s_xor_b64 s[6:7], s[0:1], s[16:17] +; GFX10-NEXT: s_xor_b64 s[0:1], s[12:13], s[4:5] +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX10-NEXT: s_sub_u32 s20, 0, s6 +; GFX10-NEXT: s_subb_u32 s21, 0, s7 +; GFX10-NEXT: s_ashr_i32 s12, s15, 31 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX10-NEXT: s_xor_b64 s[18:19], s[4:5], s[16:17] +; GFX10-NEXT: s_ashr_i32 s16, s3, 31 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 -; GFX10-NEXT: s_add_u32 s6, s10, s12 -; GFX10-NEXT: s_addc_u32 s7, s11, s12 -; GFX10-NEXT: s_add_u32 s10, s14, s16 +; GFX10-NEXT: s_add_u32 s14, s14, s12 +; GFX10-NEXT: s_addc_u32 s15, s15, s12 +; GFX10-NEXT: s_add_u32 s2, s2, s16 ; GFX10-NEXT: s_mov_b32 s17, s16 -; GFX10-NEXT: s_addc_u32 s11, s15, s16 +; GFX10-NEXT: s_addc_u32 s3, s3, s16 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX10-NEXT: s_xor_b64 s[10:11], s[10:11], s[16:17] +; GFX10-NEXT: s_xor_b64 s[2:3], s[2:3], s[16:17] ; GFX10-NEXT: s_mov_b32 s13, s12 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s10 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_xor_b64 s[14:15], s[6:7], s[12:13] +; GFX10-NEXT: s_xor_b64 s[14:15], s[14:15], s[12:13] ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1941,62 +1941,62 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_trunc_f32_e32 v4, v4 ; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v0 ; GFX10-NEXT: v_mul_f32_e32 v2, 0xcf800000, v4 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s3, s20, v6, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s20, v6, 0 ; GFX10-NEXT: v_mul_lo_u32 v8, s21, v6 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v4 -; GFX10-NEXT: s_sub_u32 s3, 0, s10 -; GFX10-NEXT: s_subb_u32 s6, 0, s11 +; GFX10-NEXT: s_sub_u32 s5, 0, s2 +; GFX10-NEXT: s_subb_u32 s22, 0, s3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX10-NEXT: v_mul_lo_u32 v9, s3, v3 +; GFX10-NEXT: v_mul_lo_u32 v9, s5, v3 ; GFX10-NEXT: v_add3_u32 v7, v1, v7, v8 ; GFX10-NEXT: v_mul_lo_u32 v10, v5, v0 ; GFX10-NEXT: v_mul_hi_u32 v11, v6, v0 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s7, s3, v4, 0 -; GFX10-NEXT: v_mul_lo_u32 v8, s6, v4 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s23, s5, v4, 0 +; GFX10-NEXT: v_mul_lo_u32 v8, s22, v4 ; GFX10-NEXT: v_mul_lo_u32 v12, v6, v7 ; GFX10-NEXT: v_mul_hi_u32 v0, v5, v0 ; GFX10-NEXT: v_mul_lo_u32 v13, v5, v7 ; GFX10-NEXT: v_mul_hi_u32 v14, v6, v7 ; GFX10-NEXT: v_mul_hi_u32 v7, v5, v7 ; GFX10-NEXT: v_add3_u32 v2, v2, v9, v8 -; GFX10-NEXT: v_add_co_u32 v10, s7, v10, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s7 -; GFX10-NEXT: v_add_co_u32 v0, s7, v13, v0 +; GFX10-NEXT: v_add_co_u32 v10, s23, v10, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s23 +; GFX10-NEXT: v_add_co_u32 v0, s23, v13, v0 ; GFX10-NEXT: v_mul_lo_u32 v8, v3, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s23 ; GFX10-NEXT: v_mul_lo_u32 v15, v4, v2 -; GFX10-NEXT: v_add_co_u32 v10, s7, v10, v11 +; GFX10-NEXT: v_add_co_u32 v10, s23, v10, v11 ; GFX10-NEXT: v_mul_hi_u32 v9, v4, v1 ; GFX10-NEXT: v_mul_hi_u32 v1, v3, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s7 -; GFX10-NEXT: v_add_co_u32 v0, s7, v0, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s23 +; GFX10-NEXT: v_add_co_u32 v0, s23, v0, v14 ; GFX10-NEXT: v_mul_lo_u32 v14, v3, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s7 +; GFX10-NEXT: v_cndmask_b32_e64 
v11, 0, 1, s23 ; GFX10-NEXT: v_add_nc_u32_e32 v10, v12, v10 -; GFX10-NEXT: v_add_co_u32 v8, s7, v8, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s7 +; GFX10-NEXT: v_add_co_u32 v8, s23, v8, v15 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s23 ; GFX10-NEXT: v_mul_hi_u32 v16, v4, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v11, v13, v11 -; GFX10-NEXT: v_add_co_u32 v1, s7, v14, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s7 -; GFX10-NEXT: v_add_co_u32 v0, s7, v0, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s7 -; GFX10-NEXT: v_add_co_u32 v8, s7, v8, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s7 -; GFX10-NEXT: v_add_co_u32 v9, s7, v1, v16 +; GFX10-NEXT: v_add_co_u32 v1, s23, v14, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s23 +; GFX10-NEXT: v_add_co_u32 v0, s23, v0, v10 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s23 +; GFX10-NEXT: v_add_co_u32 v8, s23, v8, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s23 +; GFX10-NEXT: v_add_co_u32 v9, s23, v1, v16 ; GFX10-NEXT: v_add3_u32 v7, v11, v10, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s23 ; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v6, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v8, v12, v8 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo ; GFX10-NEXT: v_mul_hi_u32 v2, v3, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v10, v13, v1 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s7, s20, v6, 0 -; GFX10-NEXT: v_add_co_u32 v7, s7, v9, v8 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s20, v6, 0 +; GFX10-NEXT: v_add_co_u32 v7, s23, v9, v8 ; GFX10-NEXT: v_mul_lo_u32 v9, s21, v6 ; GFX10-NEXT: v_mul_lo_u32 v11, s20, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s23 ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v7 ; GFX10-NEXT: v_add3_u32 v2, v10, v8, v2 ; GFX10-NEXT: v_mul_lo_u32 v8, v5, v0 @@ -2005,74 +2005,73 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_hi_u32 v0, v5, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v2, vcc_lo ; GFX10-NEXT: v_mul_lo_u32 v12, v6, v7 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s7, s3, v4, 0 -; GFX10-NEXT: v_mul_lo_u32 v9, s6, v4 -; GFX10-NEXT: v_mul_lo_u32 v11, s3, v3 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s20, s5, v4, 0 +; GFX10-NEXT: v_mul_lo_u32 v9, s22, v4 +; GFX10-NEXT: v_mul_lo_u32 v11, s5, v3 ; GFX10-NEXT: v_mul_lo_u32 v13, v5, v7 ; GFX10-NEXT: v_mul_hi_u32 v14, v6, v7 ; GFX10-NEXT: v_mul_hi_u32 v7, v5, v7 -; GFX10-NEXT: v_add_co_u32 v8, s3, v8, v12 +; GFX10-NEXT: v_add_co_u32 v8, s5, v8, v12 ; GFX10-NEXT: v_mul_lo_u32 v15, v3, v1 ; GFX10-NEXT: v_mul_hi_u32 v16, v4, v1 ; GFX10-NEXT: v_add3_u32 v2, v2, v11, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s3 -; GFX10-NEXT: v_add_co_u32 v0, s3, v13, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s3 -; GFX10-NEXT: v_add_co_u32 v8, s3, v8, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s3 -; GFX10-NEXT: v_add_co_u32 v0, s3, v0, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v0, s5, v13, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v8, s5, v8, v10 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v0, s5, v0, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s5 ; GFX10-NEXT: v_mul_lo_u32 v12, v4, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v8, v9, v8 ; GFX10-NEXT: v_mul_hi_u32 v1, v3, v1 ; GFX10-NEXT: v_mul_lo_u32 v13, v3, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v10, v11, v10 ; GFX10-NEXT: v_mul_hi_u32 v9, v4, v2 -; GFX10-NEXT: v_add_co_u32 
v0, s3, v0, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s3 -; GFX10-NEXT: v_add_co_u32 v11, s3, v15, v12 +; GFX10-NEXT: v_add_co_u32 v0, s5, v0, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v11, s5, v15, v12 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v6, v0 ; GFX10-NEXT: v_add3_u32 v7, v10, v8, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s3 -; GFX10-NEXT: v_add_co_u32 v1, s3, v13, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v1, s5, v13, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s5 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo -; GFX10-NEXT: v_add_co_u32 v8, s3, v11, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s3 -; GFX10-NEXT: v_add_co_u32 v1, s3, v1, v9 +; GFX10-NEXT: v_add_co_u32 v8, s5, v11, v16 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v1, s5, v1, v9 ; GFX10-NEXT: v_mul_lo_u32 v7, s1, v0 ; GFX10-NEXT: v_mul_lo_u32 v9, s0, v5 ; GFX10-NEXT: v_mul_hi_u32 v10, s1, v0 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX10-NEXT: v_mul_lo_u32 v11, s1, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s5 ; GFX10-NEXT: v_add_nc_u32_e32 v8, v12, v8 ; GFX10-NEXT: v_mul_hi_u32 v12, s0, v5 ; GFX10-NEXT: v_mul_hi_u32 v5, s1, v5 -; GFX10-NEXT: v_add_co_u32 v7, s3, v7, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s3 -; GFX10-NEXT: v_add_co_u32 v10, s3, v11, v10 -; GFX10-NEXT: v_add_co_u32 v0, s6, v7, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s3 -; GFX10-NEXT: v_add_co_u32 v10, s3, v10, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s3 +; GFX10-NEXT: v_add_co_u32 v7, s5, v7, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v10, s5, v11, v10 +; GFX10-NEXT: v_add_co_u32 v0, s20, v7, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s20 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v10, s5, v10, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v9, v0 -; GFX10-NEXT: v_add_co_u32 v8, s3, v1, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s3 +; GFX10-NEXT: v_add_co_u32 v8, s5, v1, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5 ; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v11 -; GFX10-NEXT: v_add_co_u32 v9, s3, v10, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3 +; GFX10-NEXT: v_add_co_u32 v9, s5, v10, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s5 ; GFX10-NEXT: v_mul_hi_u32 v2, v3, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v6, v13, v6 ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v8 ; GFX10-NEXT: v_add3_u32 v5, v7, v0, v5 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v8, s14, v4 ; GFX10-NEXT: v_add3_u32 v2, v6, v1, v2 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s3, s8, v9, 0 -; GFX10-NEXT: v_mul_lo_u32 v6, s9, v9 -; GFX10-NEXT: v_mul_lo_u32 v7, s8, v5 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s6, v9, 0 +; GFX10-NEXT: v_mul_lo_u32 v6, s7, v9 +; GFX10-NEXT: v_mul_lo_u32 v7, s6, v5 ; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v3, v2, vcc_lo ; GFX10-NEXT: v_mul_lo_u32 v3, s15, v4 ; GFX10-NEXT: v_mul_hi_u32 v4, s15, v4 @@ -2084,23 +2083,23 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_sub_nc_u32_e32 v12, s1, v1 ; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, s0, v0 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v14, s0, s1, v1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s9, v12, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 
vcc_lo, s8, v13 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s7, v12, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, v13, s8 +; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, v13, s6 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v15, s0, 0, v0, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v14 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s9, v0, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v14 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s7, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v12 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v15 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v15 ; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, -1, s0 ; GFX10-NEXT: v_add_co_u32 v19, s0, v6, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v20, s0, 0, v7, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v14 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v14 ; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v1, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v15 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v15 ; GFX10-NEXT: v_cndmask_b32_e64 v17, v18, v17, s0 ; GFX10-NEXT: v_add_co_u32 v1, s0, v3, v10 ; GFX10-NEXT: v_mul_hi_u32 v10, s14, v2 @@ -2117,14 +2116,14 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v8, v10 ; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX10-NEXT: v_sub_co_u32 v8, s0, v12, s8 +; GFX10-NEXT: v_sub_co_u32 v8, s0, v12, s6 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v10, s0, 0, v0, s0 ; GFX10-NEXT: v_add3_u32 v2, v3, v1, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v19, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v6, v7, v20, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s10, v4, 0 -; GFX10-NEXT: v_mul_lo_u32 v7, s10, v2 -; GFX10-NEXT: v_mul_lo_u32 v11, s11, v4 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v4, 0 +; GFX10-NEXT: v_mul_lo_u32 v7, s2, v2 +; GFX10-NEXT: v_mul_lo_u32 v11, s3, v4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v17 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 ; GFX10-NEXT: v_mov_b32_e32 v16, 0 @@ -2139,33 +2138,33 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s15, v1 ; GFX10-NEXT: v_xor_b32_e32 v0, s18, v3 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s11, v9 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v9 ; GFX10-NEXT: v_xor_b32_e32 v3, s19, v5 -; GFX10-NEXT: v_xor_b32_e32 v6, s2, v6 +; GFX10-NEXT: v_xor_b32_e32 v6, s4, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v10, vcc_lo, s11, v1, s0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s10, v8 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v10, vcc_lo, s3, v1, s0 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, v8, s10 +; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, v8, s2 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v13, s0, 0, v10, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v0, s0, v0, s18 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s19, v3, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s11, v9 -; GFX10-NEXT: v_xor_b32_e32 v3, s2, v7 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, s11, v10, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v9 +; GFX10-NEXT: v_xor_b32_e32 v3, s4, v7 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 
s3, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v11, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v13 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v12 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 ; GFX10-NEXT: v_add_co_u32 v14, s0, v4, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s0, 0, v2, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s11, v13 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s0 ; GFX10-NEXT: v_add_co_u32 v11, s0, v14, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v17, s0, 0, v15, s0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v7 -; GFX10-NEXT: v_sub_co_u32 v7, s0, v12, s10 +; GFX10-NEXT: v_sub_co_u32 v7, s0, v12, s2 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v10, s0, 0, v10, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v14, v11, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v5 @@ -2177,9 +2176,9 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v5, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v9, v7, s0 ; GFX10-NEXT: s_xor_b64 s[0:1], s[12:13], s[16:17] -; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v3, s2 +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v3, s4 ; GFX10-NEXT: v_xor_b32_e32 v3, s0, v10 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s2, v6, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s4, v6, vcc_lo ; GFX10-NEXT: v_xor_b32_e32 v6, s1, v2 ; GFX10-NEXT: v_xor_b32_e32 v8, s12, v8 ; GFX10-NEXT: v_xor_b32_e32 v7, s12, v7 @@ -2187,9 +2186,8 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v6, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v8, s12 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s12, v7, vcc_lo -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5] -; GFX10-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] +; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[8:9] +; GFX10-NEXT: global_store_dwordx4 v16, v[4:7], s[10:11] ; GFX10-NEXT: s_endpgm %div = sdiv <2 x i64> %x, %y store <2 x i64> %div, ptr addrspace(1) %out0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index 4111f0402db9..56943531ba8a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -985,8 +985,8 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) { ; GFX8-LABEL: udivrem_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s13 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s12 @@ -1255,7 +1255,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s13 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s12 @@ -1264,7 +1264,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_f32_e32 v0, 
0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 @@ -1325,6 +1325,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0 @@ -1510,14 +1511,13 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v13, v20, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v8, v11, v9, s[0:1] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[4:5] ; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_v2i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s13 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15 @@ -1616,11 +1616,11 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_lo_u32 v10, v5, v1 ; GFX10-NEXT: v_mul_lo_u32 v11, v4, v1 ; GFX10-NEXT: v_mul_hi_u32 v14, v5, v1 -; GFX10-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX10-NEXT: v_mul_lo_u32 v15, v8, v3 ; GFX10-NEXT: v_mul_lo_u32 v16, v6, v3 ; GFX10-NEXT: v_mul_hi_u32 v17, v8, v3 -; GFX10-NEXT: v_mul_hi_u32 v3, v6, v3 +; GFX10-NEXT: v_mul_hi_u32 v1, v4, v1 ; GFX10-NEXT: v_add_co_u32 v10, s0, v12, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v0, s0, v11, v0 @@ -1642,65 +1642,66 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_add_nc_u32_e32 v11, v11, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 ; GFX10-NEXT: v_add_nc_u32_e32 v7, v15, v7 +; GFX10-NEXT: v_mul_hi_u32 v3, v6, v3 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v5, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v10, v16, v10 ; GFX10-NEXT: v_add3_u32 v1, v11, v9, v1 ; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v7 +; GFX10-NEXT: v_add_nc_u32_e32 v10, v16, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 -; GFX10-NEXT: v_mul_hi_u32 v5, s8, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo +; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v4, s9, v0 +; GFX10-NEXT: v_mul_hi_u32 v5, s8, v0 ; GFX10-NEXT: v_add3_u32 v3, v10, v7, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s9, v0 ; GFX10-NEXT: v_mul_lo_u32 v7, s8, v1 -; GFX10-NEXT: v_mul_lo_u32 v10, s9, v1 +; GFX10-NEXT: v_mul_hi_u32 v0, s9, v0 +; GFX10-NEXT: v_mul_lo_u32 v9, s9, v1 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v6, v3, vcc_lo ; GFX10-NEXT: v_mul_hi_u32 v6, s8, v1 -; GFX10-NEXT: v_mul_hi_u32 v1, s9, v1 ; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v0, s0, v10, v0 +; GFX10-NEXT: v_add_co_u32 v0, s0, v9, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v0, s0, 
v0, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 -; GFX10-NEXT: v_mul_lo_u32 v6, s11, v2 +; GFX10-NEXT: v_mul_hi_u32 v1, s9, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, v7, v4 +; GFX10-NEXT: v_mul_lo_u32 v6, s11, v2 ; GFX10-NEXT: v_mul_lo_u32 v7, s10, v3 -; GFX10-NEXT: v_mul_lo_u32 v10, s11, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v5, v8, v5 ; GFX10-NEXT: v_mul_hi_u32 v8, s10, v2 ; GFX10-NEXT: v_add_co_u32 v4, s0, v0, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX10-NEXT: v_mul_hi_u32 v2, s11, v2 -; GFX10-NEXT: v_mul_hi_u32 v11, s10, v3 +; GFX10-NEXT: v_mul_lo_u32 v9, s11, v3 +; GFX10-NEXT: v_mul_hi_u32 v10, s10, v3 ; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v7 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s12, v4, 0 -; GFX10-NEXT: v_mul_lo_u32 v12, s13, v4 -; GFX10-NEXT: v_mul_lo_u32 v13, s12, v5 -; GFX10-NEXT: v_add_co_u32 v2, s0, v10, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0 +; GFX10-NEXT: v_mul_lo_u32 v11, s13, v4 +; GFX10-NEXT: v_mul_lo_u32 v12, s12, v5 +; GFX10-NEXT: v_add_co_u32 v2, s0, v9, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v11 +; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 -; GFX10-NEXT: v_add3_u32 v1, v1, v13, v12 +; GFX10-NEXT: v_add3_u32 v1, v1, v12, v11 ; GFX10-NEXT: v_add_nc_u32_e32 v6, v7, v6 ; GFX10-NEXT: v_mul_hi_u32 v3, s11, v3 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v10, v8 +; GFX10-NEXT: v_mov_b32_e32 v10, 0 +; GFX10-NEXT: v_add_nc_u32_e32 v7, v9, v8 ; GFX10-NEXT: v_sub_nc_u32_e32 v8, s9, v1 -; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, s8, v0 +; GFX10-NEXT: v_sub_co_u32 v9, vcc_lo, s8, v0 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v11, s0, s9, v1, vcc_lo ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s13, v8, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s12, v10 -; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s12, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v10, s12 +; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v9, s12 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v12, s0, 0, v0, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s13, v11 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s13, v0, vcc_lo @@ -1747,34 +1748,33 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_sub_co_u32 v14, s0, v7, s14 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v15, s2, 0, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v8, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc_lo ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, s0, s15, v2, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s1, s15, v15 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, s1 ; GFX10-NEXT: v_cmp_le_u32_e64 s1, s14, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s1 ; GFX10-NEXT: v_add_co_u32 v16, s1, v6, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v17, s1, 0, v3, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s15, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v10, s1 -; GFX10-NEXT: v_add_co_u32 v10, s1, v16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v9, s1 +; GFX10-NEXT: v_add_co_u32 v9, s1, v16, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v18, s1, 0, v17, s1 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8 ; GFX10-NEXT: v_sub_co_u32 v8, s1, v14, s14 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 
v2, s1, 0, v2, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v16, v10, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v16, v9, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v16, v17, v18, s0 ; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v14, v8, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v14, v15, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v10, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v9, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v16, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, v8, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v13, v14, s1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_dwordx4 v9, v[0:3], s[4:5] -; GFX10-NEXT: global_store_dwordx4 v9, v[4:7], s[6:7] +; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7] ; GFX10-NEXT: s_endpgm %div = udiv <2 x i64> %x, %y store <2 x i64> %div, ptr addrspace(1) %out0 diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 6a9ad2a0f6da..9f5389120fdb 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -13,16 +13,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $sgpr17, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec - ; GFX90A-NEXT: renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg3.kernarg.offset.align.down, align 8, addrspace 4) + ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4) + ; GFX90A-NEXT: renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4) + ; GFX90A-NEXT: renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4) ; GFX90A-NEXT: renamable $sgpr24_sgpr25_sgpr26_sgpr27 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4) - ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4) - ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr20, 0, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_CSELECT_B64 -1, 0, implicit $scc + ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr33, 0, implicit-def $scc + ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_CSELECT_B64 -1, 0, implicit killed $scc ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_XOR_B64 renamable $sgpr12_sgpr13, -1, implicit-def dead $scc ; GFX90A-NEXT: S_BITCMP1_B32 renamable 
$sgpr33, 8, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_CSELECT_B64 -1, 0, implicit $scc + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_CSELECT_B64 -1, 0, implicit killed $scc ; GFX90A-NEXT: renamable $sgpr30_sgpr31 = S_XOR_B64 killed renamable $sgpr18_sgpr19, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = DS_READ_B32_gfx9 renamable $vgpr3, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) null`, align 8, addrspace 3) @@ -32,7 +33,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.1.bb103: ; GFX90A-NEXT: successors: %bb.58(0x40000000), %bb.2(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FC, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc @@ -45,10 +46,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr24, $sgpr33, $vgpr31, $agpr0, $vgpr26, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr58, $sgpr59, $sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr2, $vgpr3, $vgpr20, $vgpr22 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr24, $sgpr33, $vgpr31, $agpr0, $vgpr26, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr58, $sgpr59, $sgpr22, $sgpr20_sgpr21, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr2, $vgpr3, $vgpr20, $vgpr22 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $sgpr20 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $sgpr23 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr23 = IMPLICIT_DEF @@ -58,7 +59,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3.Flow17: ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.57(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr20, $sgpr33, $vgpr31, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, 
$sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr23, $sgpr33, $vgpr31, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr4 = V_AND_B32_e32 1023, $vgpr31, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc @@ -66,7 +67,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4.bb15: ; GFX90A-NEXT: successors: %bb.35(0x40000000), %bb.5(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr2_vgpr3, implicit $exec ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr25, implicit $exec @@ -200,7 +201,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr8 = S_ADD_U32 renamable $sgpr8, 48, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr9 = S_ADDC_U32 killed renamable $sgpr9, 0, implicit-def dead $scc, implicit $scc + ; GFX90A-NEXT: renamable $sgpr9 = S_ADDC_U32 killed renamable $sgpr9, 0, implicit-def dead $scc, implicit killed $scc ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @f2 
+ 4, target-flags(amdgpu-gotprel32-hi) @f2 + 12, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_LOAD_DWORDX2_IMM killed renamable $sgpr12_sgpr13, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) ; GFX90A-NEXT: $sgpr12 = COPY killed renamable $sgpr14 @@ -365,7 +366,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.35.bb20: ; GFX90A-NEXT: successors: %bb.37(0x40000000), %bb.36(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec :: (load (s8) from %ir.i21, addrspace 1) ; GFX90A-NEXT: renamable $vgpr42 = V_ADD_CO_U32_e32 1024, $vgpr40, implicit-def $vcc, implicit $exec @@ -412,7 +413,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.37.bb27: ; GFX90A-NEXT: successors: %bb.39(0x40000000), %bb.38(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr42_sgpr43 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr42_sgpr43 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable 
$vgpr40_vgpr41, 2048, 0, implicit $exec :: (load (s8) from %ir.i28, addrspace 1) ; GFX90A-NEXT: renamable $vgpr44 = V_ADD_CO_U32_e32 2048, $vgpr40, implicit-def $vcc, implicit $exec @@ -463,7 +464,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.39.bb34: ; GFX90A-NEXT: successors: %bb.41(0x40000000), %bb.40(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 3072, 0, implicit $exec :: (load (s8) from %ir.i35, addrspace 1) ; GFX90A-NEXT: renamable $vgpr56 = V_ADD_CO_U32_e32 3072, $vgpr40, implicit-def $vcc, implicit $exec @@ -512,7 +513,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.41.bb41: ; GFX90A-NEXT: successors: %bb.46(0x40000000), %bb.42(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, 
$sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc @@ -564,10 +565,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.43.bb55: ; GFX90A-NEXT: successors: %bb.48(0x40000000), %bb.44(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr46_sgpr47 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr46_sgpr47 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr33, 16, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit $scc + ; GFX90A-NEXT: S_BITCMP1_B32 killed renamable $sgpr33, 16, implicit-def $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_XOR_B64 renamable $sgpr64_sgpr65, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec @@ -614,7 +615,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.46.bb48: ; GFX90A-NEXT: successors: %bb.43(0x40000000), %bb.47(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, 
$sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr44_sgpr45, $sgpr52_sgpr53 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr44_sgpr45, $sgpr52_sgpr53 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc @@ -665,7 +666,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.48.bb63: ; GFX90A-NEXT: successors: %bb.50(0x40000000), %bb.49(0x40000000) - ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr46_sgpr47 + ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, 
$vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr46_sgpr47 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.50, implicit $vcc @@ -679,7 +680,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.50.bb68: ; GFX90A-NEXT: successors: %bb.54(0x40000000), %bb.51(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 3, $vgpr4_vgpr5, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr48_sgpr49, implicit-def dead $scc @@ -707,13 +708,13 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.52.bb80: ; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.53(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, 
$vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr17 = S_BFE_U32 killed renamable $sgpr33, 65560, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc ; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc ; GFX90A-NEXT: renamable $vgpr8 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr9, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr1, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.59, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.59, implicit killed $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.53: ; GFX90A-NEXT: successors: %bb.61(0x80000000) @@ -736,7 +737,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.54.bb73: ; GFX90A-NEXT: successors: %bb.52(0x40000000), %bb.55(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, 
$sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr5 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1) ; GFX90A-NEXT: renamable $vgpr6 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec @@ -774,9 +775,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr5 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr21, implicit $exec - ; GFX90A-NEXT: renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from %ir.3, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr22, implicit $exec - ; GFX90A-NEXT: renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from %ir.4, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr58, implicit $exec ; GFX90A-NEXT: renamable $vgpr13 = V_ALIGNBIT_B32_e64 killed $sgpr59, killed $vgpr5, 1, implicit $exec ; GFX90A-NEXT: renamable $vgpr30 = V_ALIGNBIT_B32_e64 $vgpr19, $vgpr18, 1, implicit $exec @@ -788,9 +789,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.57: ; GFX90A-NEXT: successors: %bb.7(0x80000000) - ; GFX90A-NEXT: liveins: $exec:0x000000000000000F, $sgpr14, $sgpr15, $sgpr16, $sgpr17:0x0000000000000003, $sgpr20:0x0000000000000003, $vgpr31, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $exec:0x000000000000000F, $sgpr14, $sgpr15, $sgpr16, $sgpr17:0x0000000000000003, $sgpr23:0x0000000000000003, $vgpr31, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, 
$vgpr26_vgpr27:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr17 = COPY killed renamable $sgpr20, implicit $exec + ; GFX90A-NEXT: renamable $vgpr17 = COPY killed renamable $sgpr23, implicit $exec ; GFX90A-NEXT: renamable $vgpr19 = COPY killed renamable $sgpr17, implicit $exec ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 @@ -825,21 +826,20 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.58.bb105: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FC, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg3.kernarg.offset.align.down + 16, align 8, addrspace 4) ; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr23, implicit $exec ; GFX90A-NEXT: renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.434, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr21, implicit $exec - ; GFX90A-NEXT: renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.3, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr17, implicit $exec ; GFX90A-NEXT: renamable $agpr0_agpr1 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.435, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr22, implicit $exec - ; GFX90A-NEXT: renamable $vgpr26_vgpr27 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.4, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr26_vgpr27 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr20 = S_MOV_B32 0 + ; GFX90A-NEXT: renamable $sgpr23 = S_MOV_B32 0 ; GFX90A-NEXT: renamable $sgpr17 = S_MOV_B32 0 ; GFX90A-NEXT: S_BRANCH %bb.3 ; GFX90A-NEXT: {{ $}} @@ -986,13 +986,13 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr35 = COPY renamable $vgpr29, implicit $exec ; GFX90A-NEXT: 
DS_WRITE_B64_gfx9 renamable $vgpr29, renamable $vgpr28_vgpr29, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr21, implicit $exec - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into %ir.3, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) ; GFX90A-NEXT: renamable $vgpr12 = COPY killed renamable $sgpr22, implicit $exec - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr12, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into %ir.4, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr12, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into %ir.8, addrspace 3) ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr29, killed renamable $vgpr50_vgpr51, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.3, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr29, killed renamable $vgpr32_vgpr33, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr5, killed renamable $vgpr52_vgpr53, 0, 0, implicit $exec :: (store (s64) into %ir.3, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr5, killed renamable $vgpr52_vgpr53, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr29, killed renamable $vgpr34_vgpr35, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll index 50842668dc05..48acc17a1683 100644 --- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll +++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll @@ -6,24 +6,26 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_mov_b64 s[26:27], s[2:3] ; CHECK-NEXT: s_mov_b64 s[24:25], s[0:1] +; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x4 ; CHECK-NEXT: s_add_u32 s24, s24, s7 ; CHECK-NEXT: s_addc_u32 s25, s25, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_bitcmp1_b32 s0, 0 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: s_bitcmp1_b32 s0, 8 +; CHECK-NEXT: s_bitcmp1_b32 s2, 0 +; CHECK-NEXT: s_cselect_b64 s[16:17], -1, 0 +; CHECK-NEXT: s_bitcmp1_b32 s2, 8 ; CHECK-NEXT: s_cselect_b64 s[10:11], -1, 0 -; CHECK-NEXT: s_bitcmp1_b32 s0, 16 -; 
CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: s_bitcmp1_b32 s2, 16 ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 ; CHECK-NEXT: s_bitcmp1_b32 s0, 24 ; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 ; CHECK-NEXT: s_xor_b64 s[4:5], s[8:9], -1 ; CHECK-NEXT: s_bitcmp1_b32 s1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; CHECK-NEXT: s_cselect_b64 s[12:13], -1, 0 -; CHECK-NEXT: s_bitcmp1_b32 s1, 8 +; CHECK-NEXT: s_bitcmp1_b32 s6, 8 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[16:17] ; CHECK-NEXT: s_cselect_b64 s[14:15], -1, 0 ; CHECK-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v0 ; CHECK-NEXT: s_and_b64 s[4:5], exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll index 6fb204351a99..ee377bacfb16 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -25,79 +25,70 @@ define i32 @private_load_2xi16_align2(ptr addrspace(5) %p) #0 { ; GFX7-UNALIGNED-LABEL: private_load_2xi16_align2: ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-UNALIGNED-NEXT: v_add_i32_e32 v1, vcc, 2, v0 -; GFX7-UNALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; GFX7-UNALIGNED-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen -; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX7-UNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-UNALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-UNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: private_load_2xi16_align2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_ushort v1, v0, s[0:3], 0 offen -; GFX9-NEXT: buffer_load_ushort v2, v0, s[0:3], 0 offen offset:2 +; GFX9-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-FLASTSCR-LABEL: private_load_2xi16_align2: ; GFX9-FLASTSCR: ; %bb.0: ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-FLASTSCR-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX9-FLASTSCR-NEXT: scratch_load_ushort v2, v0, off -; GFX9-FLASTSCR-NEXT: scratch_load_ushort v3, v1, off +; GFX9-FLASTSCR-NEXT: scratch_load_dword v0, v0, off +; GFX9-FLASTSCR-NEXT: s_mov_b32 s0, 0xffff ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLASTSCR-NEXT: v_lshl_or_b32 v0, v3, 16, v2 +; GFX9-FLASTSCR-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s0, v1 ; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: private_load_2xi16_align2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_ushort v1, v0, s[0:3], 0 offen -; GFX10-NEXT: buffer_load_ushort v2, v0, s[0:3], 0 offen offset:2 +; GFX10-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-FLASTSCR-LABEL: private_load_2xi16_align2: ; GFX10-FLASTSCR: ; %bb.0: ; GFX10-FLASTSCR-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FLASTSCR-NEXT: v_add_nc_u32_e32 v1, 2, v0 -; GFX10-FLASTSCR-NEXT: s_clause 0x1 -; GFX10-FLASTSCR-NEXT: scratch_load_ushort v2, v0, off -; GFX10-FLASTSCR-NEXT: scratch_load_ushort v3, v1, off +; GFX10-FLASTSCR-NEXT: scratch_load_dword v0, v0, off ; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLASTSCR-NEXT: v_lshl_or_b32 v0, v3, 16, v2 +; GFX10-FLASTSCR-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX10-FLASTSCR-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: private_load_2xi16_align2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 2, v0 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_u16 v0, v0, off -; GFX11-NEXT: scratch_load_u16 v1, v1, off +; GFX11-NEXT: scratch_load_b32 v0, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FLASTSCR-LABEL: private_load_2xi16_align2: ; GFX11-FLASTSCR: ; %bb.0: ; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FLASTSCR-NEXT: v_add_nc_u32_e32 v1, 2, v0 -; GFX11-FLASTSCR-NEXT: s_clause 0x1 -; GFX11-FLASTSCR-NEXT: scratch_load_u16 v0, v0, off -; GFX11-FLASTSCR-NEXT: scratch_load_u16 v1, v1, off +; GFX11-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off ; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLASTSCR-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-FLASTSCR-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX11-FLASTSCR-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLASTSCR-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1 %p.0 = load i16, ptr addrspace(5) %p, align 2 @@ -125,32 +116,24 @@ define void @private_store_2xi16_align2(ptr addrspace(5) %p, ptr addrspace(5) %r ; GFX7-UNALIGNED-LABEL: private_store_2xi16_align2: ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v3, 1 -; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, 2 -; GFX7-UNALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v1 -; GFX7-UNALIGNED-NEXT: buffer_store_short v3, v1, s[0:3], 0 offen -; GFX7-UNALIGNED-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen +; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX7-UNALIGNED-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: private_store_2xi16_align2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen -; GFX9-NEXT: v_mov_b32_e32 v0, 2 -; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen offset:2 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-FLASTSCR-LABEL: private_store_2xi16_align2: ; GFX9-FLASTSCR: ; %bb.0: ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v2, 1 -; GFX9-FLASTSCR-NEXT: v_add_u32_e32 v0, 2, v1 -; GFX9-FLASTSCR-NEXT: scratch_store_short v1, v2, off -; GFX9-FLASTSCR-NEXT: 
v_mov_b32_e32 v1, 2 -; GFX9-FLASTSCR-NEXT: scratch_store_short v0, v1, off +; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX9-FLASTSCR-NEXT: scratch_store_dword v1, v0, off ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] ; @@ -158,10 +141,8 @@ define void @private_store_2xi16_align2(ptr addrspace(5) %p, ptr addrspace(5) %r ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 1 -; GFX10-NEXT: v_mov_b32_e32 v2, 2 -; GFX10-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen -; GFX10-NEXT: buffer_store_short v2, v1, s[0:3], 0 offen offset:2 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -169,11 +150,8 @@ define void @private_store_2xi16_align2(ptr addrspace(5) %p, ptr addrspace(5) %r ; GFX10-FLASTSCR: ; %bb.0: ; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FLASTSCR-NEXT: v_mov_b32_e32 v0, 1 -; GFX10-FLASTSCR-NEXT: v_add_nc_u32_e32 v2, 2, v1 -; GFX10-FLASTSCR-NEXT: v_mov_b32_e32 v3, 2 -; GFX10-FLASTSCR-NEXT: scratch_store_short v1, v0, off -; GFX10-FLASTSCR-NEXT: scratch_store_short v2, v3, off +; GFX10-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX10-FLASTSCR-NEXT: scratch_store_dword v1, v0, off ; GFX10-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31] ; @@ -181,11 +159,8 @@ define void @private_store_2xi16_align2(ptr addrspace(5) %p, ptr addrspace(5) %r ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v3, 2 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 2, v1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b16 v1, v0, off -; GFX11-NEXT: scratch_store_b16 v2, v3, off +; GFX11-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX11-NEXT: scratch_store_b32 v1, v0, off ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -193,11 +168,8 @@ define void @private_store_2xi16_align2(ptr addrspace(5) %p, ptr addrspace(5) %r ; GFX11-FLASTSCR: ; %bb.0: ; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FLASTSCR-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v3, 2 -; GFX11-FLASTSCR-NEXT: v_add_nc_u32_e32 v2, 2, v1 -; GFX11-FLASTSCR-NEXT: s_clause 0x1 -; GFX11-FLASTSCR-NEXT: scratch_store_b16 v1, v0, off -; GFX11-FLASTSCR-NEXT: scratch_store_b16 v2, v3, off +; GFX11-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX11-FLASTSCR-NEXT: scratch_store_b32 v1, v0, off ; GFX11-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.r = getelementptr i16, ptr addrspace(5) %r, i64 1 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index 5667d330fd25..59a09a8d77a2 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -3220,19 +3220,18 @@ define <2 x half> @v_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %a define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: v_bfrev_b32_e32 v0, 1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitcmp1_b32 s4, 0 -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v2, -v1, v0, s[4:5] -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5] -; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] -; SI-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[4:5] +; SI-NEXT: s_bitcmp1_b32 s6, 0 +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v2, -v1, v0, s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[2:3] ; SI-NEXT: v_mov_b32_e32 v3, s1 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -3240,19 +3239,18 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %a ; ; VI-LABEL: s_fneg_select_infloop_regression_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_bfrev_b32_e32 v0, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitcmp1_b32 s4, 0 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v2, -v1, v0, s[4:5] -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5] -; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] -; VI-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[4:5] +; VI-NEXT: s_bitcmp1_b32 s6, 0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: v_cndmask_b32_e64 v2, -v1, v0, s[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[2:3] +; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[2:3] +; VI-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[2:3] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index 5a002364ab23..ae470efc92fe 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -110,13 +110,13 @@ define amdgpu_kernel void @float8_inselt(ptr addrspace(1) %out, <8 x float> %vec ; GCN-LABEL: float8_inselt: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dword s1, s[0:1], 0x64 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x64 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_add_u32 s0, s2, 16 -; GCN-NEXT: s_mov_b32 m0, s1 -; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: s_add_u32 s2, s0, 16 +; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, s7 @@ -124,13 +124,13 @@ define amdgpu_kernel void @float8_inselt(ptr addrspace(1) %out, <8 x float> %vec ; GCN-NEXT: v_mov_b32_e32 v5, s9 ; GCN-NEXT: v_mov_b32_e32 v6, s10 ; GCN-NEXT: v_mov_b32_e32 v7, s11 -; GCN-NEXT: v_mov_b32_e32 v9, s1 +; GCN-NEXT: v_mov_b32_e32 v9, s3 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 -; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_mov_b32_e32 v8, s2 ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; 
GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 924593a1d099..c58dbd6bd120 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -497,42 +497,38 @@ define <12 x float> @insertelement_to_v12f32_undef() nounwind { define amdgpu_kernel void @dynamic_insertelement_v2f32(ptr addrspace(1) %out, <2 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_load_dword s8, s[4:5], 0x4 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: s_cmp_lg_u32 s8, 1 +; SI-NEXT: s_cmp_lg_u32 s2, 1 +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s8, 0 +; SI-NEXT: s_cmp_lg_u32 s2, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s8, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_cmp_lg_u32 s8, 1 +; VI-NEXT: s_cmp_lg_u32 s2, 1 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s8, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -658,8 +654,8 @@ define amdgpu_kernel void @dynamic_insertelement_v4f32(ptr addrspace(1) %out, <4 define amdgpu_kernel void @dynamic_insertelement_v8f32(ptr addrspace(1) %out, <8 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_load_dword s4, s[4:5], 0x10 ; SI-NEXT: v_mov_b32_e32 v8, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 @@ -681,8 +677,8 @@ define amdgpu_kernel void @dynamic_insertelement_v8f32(ptr addrspace(1) %out, <8 ; ; VI-LABEL: dynamic_insertelement_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x40 ; VI-NEXT: v_mov_b32_e32 v8, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 @@ -1022,37 +1018,33 @@ define amdgpu_kernel void 
@dynamic_insertelement_v16f32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v2i32(ptr addrspace(1) %out, <2 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_load_dword s8, s[4:5], 0x4 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_cmp_lg_u32 s8, 1 -; SI-NEXT: s_cselect_b32 s0, s3, 5 -; SI-NEXT: s_cmp_lg_u32 s8, 0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_cselect_b32 s1, s2, 5 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_cmp_lg_u32 s2, 1 +; SI-NEXT: s_cselect_b32 s1, s1, 5 +; SI-NEXT: s_cmp_lg_u32 s2, 0 +; SI-NEXT: s_cselect_b32 s0, s0, 5 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s8, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_cmp_lg_u32 s8, 1 -; VI-NEXT: s_cselect_b32 s0, s3, 5 -; VI-NEXT: s_cmp_lg_u32 s8, 0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_cselect_b32 s1, s2, 5 -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_cmp_lg_u32 s2, 1 +; VI-NEXT: s_cselect_b32 s1, s1, 5 +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cselect_b32 s0, s0, 5 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x i32> %a, i32 5, i32 %b @@ -1162,8 +1154,8 @@ define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8 ; SI-LABEL: dynamic_insertelement_v8i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 -; SI-NEXT: s_load_dword s6, s[4:5], 0x10 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dword s4, s[4:5], 0x10 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1175,7 +1167,7 @@ define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8 ; SI-NEXT: v_mov_b32_e32 v5, s13 ; SI-NEXT: v_mov_b32_e32 v6, s14 ; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: s_mov_b32 m0, s6 +; SI-NEXT: s_mov_b32 m0, s4 ; SI-NEXT: v_movreld_b32_e32 v0, 5 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -1184,8 +1176,8 @@ define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8 ; VI-LABEL: dynamic_insertelement_v8i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; VI-NEXT: s_load_dword s6, s[4:5], 0x40 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x40 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1197,7 +1189,7 @@ define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8 ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 ; VI-NEXT: v_mov_b32_e32 v7, s15 -; VI-NEXT: s_mov_b32 m0, s6 +; VI-NEXT: s_mov_b32 m0, s4 ; VI-NEXT: v_movreld_b32_e32 v0, 5 ; VI-NEXT: 
buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll index 17fc7f6c1c95..a94ff78ac18c 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll @@ -279,9 +279,10 @@ define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i8() { ; Make sure we don't think the alignment will increase if the base address isn't an alloca define void @private_store_2xi16_align2_not_alloca(ptr addrspace(5) %p, ptr addrspace(5) %r) #0 { ; CHECK-LABEL: @private_store_2xi16_align2_not_alloca( -; CHECK-NEXT: [[GEP_R:%.*]] = getelementptr i16, ptr addrspace(5) [[R:%.*]], i32 1 -; CHECK-NEXT: store i16 1, ptr addrspace(5) [[R]], align 2 -; CHECK-NEXT: store i16 2, ptr addrspace(5) [[GEP_R]], align 2 +; ALIGNED-NEXT: [[GEP_R:%.*]] = getelementptr i16, ptr addrspace(5) [[R:%.*]], i32 1 +; ALIGNED-NEXT: store i16 1, ptr addrspace(5) [[R]], align 2 +; ALIGNED-NEXT: store i16 2, ptr addrspace(5) [[GEP_R]], align 2 +; UNALIGNED-NEXT:store <2 x i16> ; CHECK-NEXT: ret void ; %gep.r = getelementptr i16, ptr addrspace(5) %r, i32 1 @@ -309,11 +310,12 @@ define void @private_store_2xi16_align1_not_alloca(ptr addrspace(5) %p, ptr addr define i32 @private_load_2xi16_align2_not_alloca(ptr addrspace(5) %p) #0 { ; CHECK-LABEL: @private_load_2xi16_align2_not_alloca( -; CHECK-NEXT: [[GEP_P:%.*]] = getelementptr i16, ptr addrspace(5) [[P:%.*]], i64 1 -; CHECK-NEXT: [[P_0:%.*]] = load i16, ptr addrspace(5) [[P]], align 2 -; CHECK-NEXT: [[P_1:%.*]] = load i16, ptr addrspace(5) [[GEP_P]], align 2 -; CHECK-NEXT: [[ZEXT_0:%.*]] = zext i16 [[P_0]] to i32 -; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i16 [[P_1]] to i32 +; ALIGNED-NEXT: [[GEP_P:%.*]] = getelementptr i16, ptr addrspace(5) [[P:%.*]], i64 1 +; ALIGNED-NEXT: [[P_0:%.*]] = load i16, ptr addrspace(5) [[P]], align 2 +; ALIGNED-NEXT: [[P_1:%.*]] = load i16, ptr addrspace(5) [[GEP_P]], align 2 +; UNALIGNED-NEXT:load <2 x i16> +; CHECK: [[ZEXT_0:%.*]] = zext i16 +; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i16 ; CHECK-NEXT: [[SHL_1:%.*]] = shl i32 [[ZEXT_1]], 16 ; CHECK-NEXT: [[OR:%.*]] = or i32 [[ZEXT_0]], [[SHL_1]] ; CHECK-NEXT: ret i32 [[OR]] diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll index 8629869fc2be..f13c2dfb0859 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll @@ -85,21 +85,14 @@ define float @insert_store_point_alias(ptr addrspace(1) nocapture %a, i64 %idx) ret float %x } -; Here we have four stores, with an aliasing load before the last one. We -; could vectorize two of the stores before the load (although we currently -; don't), but the important thing is that we *don't* sink the store to -; a[idx + 1] below the load. +; Here we have four stores, with an aliasing load before the last one. We can +; vectorize three of the stores before the load, but the important thing is that +; we *don't* sink the store to a[idx + 1] below the load. 
; ; CHECK-LABEL: @insert_store_point_alias_ooo -; CHECK: store float -; CHECK-SAME: %a.idx.3 -; CHECK: store float -; CHECK-SAME: %a.idx.1 -; CHECK: store float -; CHECK-SAME: %a.idx.2 +; CHECK: store <3 x float>{{.*}} %a.idx.1 ; CHECK: load float, ptr addrspace(1) %a.idx.2 -; CHECK: store float -; CHECK-SAME: %a.idx +; CHECK: store float{{.*}} %a.idx define float @insert_store_point_alias_ooo(ptr addrspace(1) nocapture %a, i64 %idx) { %a.idx = getelementptr inbounds float, ptr addrspace(1) %a, i64 %idx %a.idx.1 = getelementptr inbounds float, ptr addrspace(1) %a.idx, i64 1 diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll index cd4717f5152e..64560e3ca285 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll @@ -57,10 +57,17 @@ define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align1( } ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align2( -; ALL: store i32 -; ALL: store i32 -; ALL: store i32 -; ALL: store i32 +; ALIGNED: store i32 +; ALIGNED: store i32 +; ALIGNED: store i32 +; ALIGNED: store i32 +; ELT4-UNALIGNED: store i32 +; ELT4-UNALIGNED: store i32 +; ELT4-UNALIGNED: store i32 +; ELT4-UNALIGNED: store i32 +; ELT8-UNALIGNED: store <2 x i32> +; ELT8-UNALIGNED: store <2 x i32> +; ELT16-UNALIGNED: store <4 x i32> define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align2(ptr addrspace(5) %out) #0 { %out.gep.1 = getelementptr i32, ptr addrspace(5) %out, i32 1 %out.gep.2 = getelementptr i32, ptr addrspace(5) %out, i32 2 @@ -117,8 +124,9 @@ define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16(ptr add } ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align2( -; ALL: store i16 -; ALL: store i16 +; ALIGNED: store i16 +; ALIGNED: store i16 +; UNALIGNED: store <2 x i16> define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align2(ptr addrspace(5) %out) #0 { %out.gep.1 = getelementptr i16, ptr addrspace(5) %out, i32 1 diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll index dc7ae189626c..20e4ba2bd72d 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll @@ -1,5 +1,5 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=GCN,GFX7 %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=GCN %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=GCN %s target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" @@ -26,25 +26,17 @@ define amdgpu_kernel void @no_crash(i32 %arg) { ret void } -; Check adjiacent memory locations are properly matched and the +; Check adjacent memory locations are properly matched and the ; longest chain vectorized ; GCN-LABEL: @interleave_get_longest -; GFX7: load <2 x i32> 
-; GFX7: load i32 -; GFX7: store <2 x i32> zeroinitializer -; GFX7: load i32 -; GFX7: load <2 x i32> -; GFX7: load i32 -; GFX7: load i32 - -; GFX9: load <4 x i32> -; GFX9: load i32 -; GFX9: store <2 x i32> zeroinitializer -; GFX9: load i32 -; GFX9: load i32 -; GFX9: load i32 +; GCN: load <2 x i32>{{.*}} %tmp1 +; GCN: store <2 x i32> zeroinitializer{{.*}} %tmp1 +; GCN: load <2 x i32>{{.*}} %tmp2 +; GCN: load <2 x i32>{{.*}} %tmp4 +; GCN: load i32{{.*}} %tmp5 +; GCN: load i32{{.*}} %tmp5 define amdgpu_kernel void @interleave_get_longest(i32 %arg) { %a1 = add i32 %arg, 1 diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll index 8e5a20ec8ae1..13bc515eb068 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll @@ -42,6 +42,54 @@ entry: ret void } +; CHECK-LABEL: @merge_ptr_i32( +; CHECK: load <4 x i32> +; CHECK: store <4 x i32> +define amdgpu_kernel void @merge_ptr_i32(ptr addrspace(3) nocapture %a, ptr addrspace(3) nocapture readonly %b) #0 { +entry: + %a.0 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %a, i64 0 + %a.1 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %a, i64 1 + %a.2 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %a, i64 2 + + %b.0 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %b, i64 0 + %b.1 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %b, i64 1 + %b.2 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %b, i64 2 + + %ld.0 = load i32, ptr addrspace(3) %b.0, align 16 + %ld.1 = load ptr addrspace(3), ptr addrspace(3) %b.1, align 4 + %ld.2 = load <2 x i32>, ptr addrspace(3) %b.2, align 8 + + store i32 0, ptr addrspace(3) %a.0, align 16 + store ptr addrspace(3) null, ptr addrspace(3) %a.1, align 4 + store <2 x i32> <i32 0, i32 0>, ptr addrspace(3) %a.2, align 8 + + ret void +} + +; CHECK-LABEL: @merge_ptr_i32_vec_first( +; CHECK: load <4 x i32> +; CHECK: store <4 x i32> +define amdgpu_kernel void @merge_ptr_i32_vec_first(ptr addrspace(3) nocapture %a, ptr addrspace(3) nocapture readonly %b) #0 { +entry: + %a.0 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %a, i64 0 + %a.1 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %a, i64 2 + %a.2 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %a, i64 3 + + %b.0 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %b, i64 0 + %b.1 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %b, i64 2 + %b.2 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %b, i64 3 + + %ld.0 = load <2 x i32>, ptr addrspace(3) %b.0, align 16 + %ld.1 = load ptr addrspace(3), ptr addrspace(3) %b.1, align 8 + %ld.2 = load i32, ptr addrspace(3) %b.2, align 4 + + store <2 x i32> <i32 0, i32 0>, ptr addrspace(3) %a.0, align 16 + store ptr addrspace(3) null, ptr addrspace(3) %a.1, align 8 + store i32 0, ptr addrspace(3) %a.2, align 4 + + ret void +} + ; CHECK-LABEL: @merge_load_i64_ptr64( ; CHECK: load <2 x i64> ; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1 diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll index 8a4e4f00ab02..f3575e5edd76 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll @@ -82,7 +82,7
@@ entry: %a.ascast = addrspacecast ptr addrspace(5) %p to ptr %b.ascast = addrspacecast ptr addrspace(5) %gep2 to ptr %tmp1 = load i8, ptr %a.ascast, align 1 - %tmp2 = load i8, ptr %b.ascast, align 1 + %tmp2 = load i8, ptr %b.ascast, align 2 unreachable } diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/4x2xhalf.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/4x2xhalf.ll index 92efbb238fe9..3c2b9933c59a 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/4x2xhalf.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/4x2xhalf.ll @@ -1,10 +1,10 @@ ; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s define void @ldg_f16(ptr nocapture align 16 %rd0) { - %load1 = load <2 x half>, ptr %rd0, align 4 + %load1 = load <2 x half>, ptr %rd0, align 16 %p1 = fcmp ogt <2 x half> %load1, zeroinitializer %s1 = select <2 x i1> %p1, <2 x half> %load1, <2 x half> zeroinitializer - store <2 x half> %s1, ptr %rd0, align 4 + store <2 x half> %s1, ptr %rd0, align 16 %in2 = getelementptr half, ptr %rd0, i64 2 %load2 = load <2 x half>, ptr %in2, align 4 %p2 = fcmp ogt <2 x half> %load2, zeroinitializer diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/many_loads_stores.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/many_loads_stores.ll new file mode 100644 index 000000000000..11063dfeca54 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/many_loads_stores.ll @@ -0,0 +1,1136 @@ +; This is an end-to-end test that checks that LSV succeeds at vectorizing a +; large program with many loads. +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s > %t +; RUN: grep 'load i8' < %t | count 18 +; RUN: grep 'load <2 x i8>' < %t | count 9 +; RUN: grep 'load <4 x i8>' < %t | count 27 + +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +declare noundef i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0 +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 +declare float @llvm.ceil.f32(float) #0 +declare i32 @llvm.smax.i32(i32, i32) #0 +declare i32 @llvm.umin.i32(i32, i32) #0 + +define void @many_loads(ptr noalias readonly align 128 dereferenceable(5111808) %arg0, ptr noalias nocapture readonly align 128 dereferenceable(29952) %arg1, ptr noalias nocapture readonly align 128 dereferenceable(2664) %arg2, ptr noalias nocapture readonly align 128 dereferenceable(888) %arg3, ptr noalias nocapture writeonly align 128 dereferenceable(17731584) %arg4) local_unnamed_addr #1 { +entry: + %arg41104 = addrspacecast ptr %arg4 to ptr addrspace(1) + %arg31102 = addrspacecast ptr %arg3 to ptr addrspace(1) + %arg21100 = addrspacecast ptr %arg2 to ptr addrspace(1) + %arg11098 = addrspacecast ptr %arg1 to ptr addrspace(1) + %arg01096 = addrspacecast ptr %arg0 to ptr addrspace(1) + %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !140 + %1 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !141 + %2 = shl nuw nsw i32 %0, 6 + %linear_index = or i32 %2, %1 + %linear_index_base = shl nuw nsw i32 %linear_index, 4 + %.urem = add nsw i32 %linear_index, -554112 + %.cmp = icmp ult i32 %linear_index, 554112 + %3 = select i1 %.cmp, i32 %linear_index, i32 %.urem + %4 = urem i32 %linear_index, 2496 + %.lhs.trunc = trunc i32 %0 to i16 + %5 = udiv i16 %.lhs.trunc, 39 + %.zext = zext i16 %5 to i32 + %6 = mul nuw nsw i32 %.zext, 2496 + %7 = add nuw nsw i32 %6, %4 + %8 = udiv i32 %7, 222 + %9 = mul i32 %8, 222 + %.decomposed = sub i32 %7, %9 + %10 = mul nuw nsw i32 %8, 3 + 
%11 = mul nuw nsw i32 %.decomposed, 3 + %12 = uitofp i32 %8 to float + %add.26 = fadd float %12, -1.000000e+00 + %13 = tail call float @llvm.ceil.f32(float %add.26) + %14 = fcmp ole float %13, 0.000000e+00 + %15 = select i1 %14, float 0.000000e+00, float %13 + %16 = fcmp oge float %15, 2.493000e+03 + %17 = select i1 %16, float 2.493000e+03, float %15 + %.inv = fcmp ole float %17, 0xC1E0000000000000 + %18 = select i1 %.inv, float 0xC1E0000000000000, float %17 + %19 = fptosi float %18 to i32 + %20 = fcmp oge float %17, 0x41E0000000000000 + %21 = tail call i32 @llvm.smax.i32(i32 %19, i32 0) + %22 = tail call i32 @llvm.umin.i32(i32 %21, i32 2493) + %23 = select i1 %20, i32 2493, i32 %22 + %24 = uitofp i32 %.decomposed to float + %add.3613 = fadd float %24, 5.000000e-01 + %multiply.3915 = fmul float %add.3613, 0x3FE27350C0000000 + %add.4217 = fadd float %multiply.3915, -1.500000e+00 + %25 = tail call float @llvm.ceil.f32(float %add.4217) + %26 = fcmp ole float %25, 0.000000e+00 + %27 = select i1 %26, float 0.000000e+00, float %25 + %28 = fcmp oge float %27, 1.250000e+02 + %29 = select i1 %28, float 1.250000e+02, float %27 + %.inv821 = fcmp ole float %29, 0xC1E0000000000000 + %30 = select i1 %.inv821, float 0xC1E0000000000000, float %29 + %31 = fptosi float %30 to i32 + %32 = fcmp oge float %29, 0x41E0000000000000 + %33 = tail call i32 @llvm.smax.i32(i32 %31, i32 0) + %34 = fcmp uno float %29, 0.000000e+00 + %35 = tail call i32 @llvm.umin.i32(i32 %33, i32 125) + %36 = select i1 %32, i32 125, i32 %35 + %37 = select i1 %34, i32 0, i32 %36 + %.lhs.trunc1053 = trunc i32 %11 to i16 + %38 = udiv i16 %.lhs.trunc1053, 3 + %39 = mul i16 %38, 3 + %.decomposed1089 = sub i16 %.lhs.trunc1053, %39 + %40 = zext i16 %38 to i64 + %41 = zext i16 %.decomposed1089 to i64 + %42 = getelementptr inbounds [222 x [3 x float]], ptr addrspace(1) %arg21100, i64 0, i64 %40, i64 %41 + %43 = load float, ptr addrspace(1) %42, align 4, !invariant.load !142 + %44 = getelementptr inbounds [222 x float], ptr addrspace(1) %arg31102, i64 0, i64 %40 + %45 = load float, ptr addrspace(1) %44, align 4, !invariant.load !142 + %divide.6 = fdiv float %43, %45 + %46 = zext i32 %10 to i64 + %47 = getelementptr inbounds [7488 x float], ptr addrspace(1) %arg11098, i64 0, i64 %46 + %48 = load float, ptr addrspace(1) %47, align 4, !invariant.load !142 + %multiply.10 = fmul float %divide.6, %48 + %49 = zext i32 %23 to i64 + %50 = zext i32 %37 to i64 + %51 = getelementptr inbounds [1 x [4 x [2496 x [128 x [4 x i8]]]]], ptr addrspace(1) %arg01096, i64 0, i64 0, i64 0, i64 %49, i64 %50, i64 0 + %52 = load i8, ptr addrspace(1) %51, align 4, !invariant.load !142 + %53 = sitofp i8 %52 to float + %multiply.18 = fmul float %53, 0x3FC3BF2820000000 + %multiply.53 = fmul float %multiply.10, %multiply.18 + %add.57.i = fadd float %multiply.53, 0.000000e+00 + %.lhs.trunc1053.1 = add nuw nsw i16 %.lhs.trunc1053, 1 + %54 = udiv i16 %.lhs.trunc1053.1, 3 + %55 = mul i16 %54, 3 + %.decomposed1090 = sub i16 %.lhs.trunc1053.1, %55 + %56 = zext i16 %54 to i64 + %57 = zext i16 %.decomposed1090 to i64 + %58 = getelementptr inbounds [222 x [3 x float]], ptr addrspace(1) %arg21100, i64 0, i64 %56, i64 %57 + %59 = load float, ptr addrspace(1) %58, align 4, !invariant.load !142 + %60 = getelementptr inbounds [222 x float], ptr addrspace(1) %arg31102, i64 0, i64 %56 + %61 = load float, ptr addrspace(1) %60, align 4, !invariant.load !142 + %divide.6.1 = fdiv float %59, %61 + %multiply.10.1 = fmul float %divide.6.1, %48 + %62 = getelementptr inbounds i8, ptr addrspace(1) %51, 
i64 4 + %63 = load i8, ptr addrspace(1) %62, align 4, !invariant.load !142 + %64 = sitofp i8 %63 to float + %multiply.18.1 = fmul float %64, 0x3FC3BF2820000000 + %multiply.53.1 = fmul float %multiply.10.1, %multiply.18.1 + %add.57.i.1 = fadd float %add.57.i, %multiply.53.1 + %.lhs.trunc1053.2 = add nuw nsw i16 %.lhs.trunc1053, 2 + %65 = udiv i16 %.lhs.trunc1053.2, 3 + %66 = mul i16 %65, 3 + %.decomposed1091 = sub i16 %.lhs.trunc1053.2, %66 + %67 = zext i16 %65 to i64 + %68 = zext i16 %.decomposed1091 to i64 + %69 = getelementptr inbounds [222 x [3 x float]], ptr addrspace(1) %arg21100, i64 0, i64 %67, i64 %68 + %70 = load float, ptr addrspace(1) %69, align 4, !invariant.load !142 + %71 = getelementptr inbounds [222 x float], ptr addrspace(1) %arg31102, i64 0, i64 %67 + %72 = load float, ptr addrspace(1) %71, align 4, !invariant.load !142 + %divide.6.2 = fdiv float %70, %72 + %multiply.10.2 = fmul float %divide.6.2, %48 + %73 = getelementptr inbounds i8, ptr addrspace(1) %51, i64 8 + %74 = load i8, ptr addrspace(1) %73, align 4, !invariant.load !142 + %75 = sitofp i8 %74 to float + %multiply.18.2 = fmul float %75, 0x3FC3BF2820000000 + %multiply.53.2 = fmul float %multiply.10.2, %multiply.18.2 + %add.57.i.2 = fadd float %add.57.i.1, %multiply.53.2 + %76 = getelementptr inbounds float, ptr addrspace(1) %47, i64 1 + %77 = load float, ptr addrspace(1) %76, align 4, !invariant.load !142 + %multiply.10.3 = fmul float %divide.6, %77 + %78 = getelementptr inbounds i8, ptr addrspace(1) %51, i64 512 + %79 = load i8, ptr addrspace(1) %78, align 4, !invariant.load !142 + %80 = sitofp i8 %79 to float + %multiply.18.3 = fmul float %80, 0x3FC3BF2820000000 + %multiply.53.3 = fmul float %multiply.10.3, %multiply.18.3 + %add.57.i.3 = fadd float %add.57.i.2, %multiply.53.3 + %multiply.10.4 = fmul float %divide.6.1, %77 + %81 = getelementptr inbounds i8, ptr addrspace(1) %51, i64 516 + %82 = load i8, ptr addrspace(1) %81, align 4, !invariant.load !142 + %83 = sitofp i8 %82 to float + %multiply.18.4 = fmul float %83, 0x3FC3BF2820000000 + %multiply.53.4 = fmul float %multiply.10.4, %multiply.18.4 + %add.57.i.4 = fadd float %add.57.i.3, %multiply.53.4 + %multiply.10.5 = fmul float %divide.6.2, %77 + %84 = getelementptr inbounds i8, ptr addrspace(1) %51, i64 520 + %85 = load i8, ptr addrspace(1) %84, align 4, !invariant.load !142 + %86 = sitofp i8 %85 to float + %multiply.18.5 = fmul float %86, 0x3FC3BF2820000000 + %multiply.53.5 = fmul float %multiply.10.5, %multiply.18.5 + %add.57.i.5 = fadd float %add.57.i.4, %multiply.53.5 + %87 = getelementptr inbounds float, ptr addrspace(1) %47, i64 2 + %88 = load float, ptr addrspace(1) %87, align 4, !invariant.load !142 + %multiply.10.6 = fmul float %divide.6, %88 + %89 = getelementptr inbounds i8, ptr addrspace(1) %51, i64 1024 + %90 = load i8, ptr addrspace(1) %89, align 4, !invariant.load !142 + %91 = sitofp i8 %90 to float + %multiply.18.6 = fmul float %91, 0x3FC3BF2820000000 + %multiply.53.6 = fmul float %multiply.10.6, %multiply.18.6 + %add.57.i.6 = fadd float %add.57.i.5, %multiply.53.6 + %multiply.10.7 = fmul float %divide.6.1, %88 + %92 = getelementptr inbounds i8, ptr addrspace(1) %51, i64 1028 + %93 = load i8, ptr addrspace(1) %92, align 4, !invariant.load !142 + %94 = sitofp i8 %93 to float + %multiply.18.7 = fmul float %94, 0x3FC3BF2820000000 + %multiply.53.7 = fmul float %multiply.10.7, %multiply.18.7 + %add.57.i.7 = fadd float %add.57.i.6, %multiply.53.7 + %multiply.10.8 = fmul float %divide.6.2, %88 + %95 = getelementptr inbounds i8, ptr addrspace(1) %51, 
i64 1032 + %96 = load i8, ptr addrspace(1) %95, align 4, !invariant.load !142 + %97 = sitofp i8 %96 to float + %multiply.18.8 = fmul float %97, 0x3FC3BF2820000000 + %multiply.53.8 = fmul float %multiply.10.8, %multiply.18.8 + %add.57.i.8 = fadd float %add.57.i.7, %multiply.53.8 + %98 = fptrunc float %add.57.i.8 to half + %99 = zext i32 %linear_index_base to i64 + %100 = getelementptr half, ptr addrspace(1) %arg41104, i64 %99 + store half %98, ptr addrspace(1) %100, align 32 + %101 = udiv i32 %3, 222 + %102 = mul i32 %101, 222 + %.decomposed1092 = sub i32 %3, %102 + %103 = mul nuw nsw i32 %101, 3 + %104 = mul nuw nsw i32 %.decomposed1092, 3 + %105 = uitofp i32 %101 to float + %add.2637 = fadd float %105, -1.000000e+00 + %106 = tail call float @llvm.ceil.f32(float %add.2637) + %107 = fcmp ole float %106, 0.000000e+00 + %108 = select i1 %107, float 0.000000e+00, float %106 + %109 = fcmp oge float %108, 2.493000e+03 + %110 = select i1 %109, float 2.493000e+03, float %108 + %.inv824 = fcmp ole float %110, 0xC1E0000000000000 + %111 = select i1 %.inv824, float 0xC1E0000000000000, float %110 + %112 = fptosi float %111 to i32 + %113 = fcmp oge float %110, 0x41E0000000000000 + %114 = tail call i32 @llvm.smax.i32(i32 %112, i32 0) + %115 = tail call i32 @llvm.umin.i32(i32 %114, i32 2493) + %116 = select i1 %113, i32 2493, i32 %115 + %117 = uitofp i32 %.decomposed1092 to float + %add.3660 = fadd float %117, 5.000000e-01 + %multiply.3962 = fmul float %add.3660, 0x3FE27350C0000000 + %add.4264 = fadd float %multiply.3962, -1.500000e+00 + %118 = tail call float @llvm.ceil.f32(float %add.4264) + %119 = fcmp ole float %118, 0.000000e+00 + %120 = select i1 %119, float 0.000000e+00, float %118 + %121 = fcmp oge float %120, 1.250000e+02 + %122 = select i1 %121, float 1.250000e+02, float %120 + %.inv827 = fcmp ole float %122, 0xC1E0000000000000 + %123 = select i1 %.inv827, float 0xC1E0000000000000, float %122 + %124 = fptosi float %123 to i32 + %125 = fcmp oge float %122, 0x41E0000000000000 + %126 = tail call i32 @llvm.smax.i32(i32 %124, i32 0) + %127 = fcmp uno float %122, 0.000000e+00 + %128 = tail call i32 @llvm.umin.i32(i32 %126, i32 125) + %129 = select i1 %125, i32 125, i32 %128 + %130 = select i1 %127, i32 0, i32 %129 + %.lhs.trunc1045 = trunc i32 %104 to i16 + %131 = udiv i16 %.lhs.trunc1045, 3 + %132 = mul i16 %131, 3 + %.decomposed1093 = sub i16 %.lhs.trunc1045, %132 + %133 = zext i16 %131 to i64 + %134 = zext i16 %.decomposed1093 to i64 + %135 = getelementptr inbounds [222 x [3 x float]], ptr addrspace(1) %arg21100, i64 0, i64 %133, i64 %134 + %136 = load float, ptr addrspace(1) %135, align 4, !invariant.load !142 + %137 = getelementptr inbounds [222 x float], ptr addrspace(1) %arg31102, i64 0, i64 %133 + %138 = load float, ptr addrspace(1) %137, align 4, !invariant.load !142 + %divide.631 = fdiv float %136, %138 + %139 = zext i32 %103 to i64 + %140 = getelementptr inbounds [7488 x float], ptr addrspace(1) %arg11098, i64 0, i64 %139 + %141 = load float, ptr addrspace(1) %140, align 4, !invariant.load !142 + %multiply.1032 = fmul float %divide.631, %141 + %142 = zext i32 %116 to i64 + %143 = zext i32 %130 to i64 + %144 = getelementptr [1 x [4 x [2496 x [128 x [4 x i8]]]]], ptr addrspace(1) %arg01096, i64 0, i64 0, i64 0, i64 %142, i64 %143, i64 0 + %145 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1 + %146 = load i8, ptr addrspace(1) %145, align 1, !invariant.load !142 + %147 = sitofp i8 %146 to float + %multiply.1870 = fmul float %147, 0x3FC3BF2820000000 + %multiply.5371 = fmul float 
%multiply.1032, %multiply.1870 + %add.57.i914 = fadd float %multiply.5371, 0.000000e+00 + %.lhs.trunc1045.1 = add nuw nsw i16 %.lhs.trunc1045, 1 + %148 = udiv i16 %.lhs.trunc1045.1, 3 + %149 = mul i16 %148, 3 + %.decomposed1094 = sub i16 %.lhs.trunc1045.1, %149 + %150 = zext i16 %148 to i64 + %151 = zext i16 %.decomposed1094 to i64 + %152 = getelementptr inbounds [222 x [3 x float]], ptr addrspace(1) %arg21100, i64 0, i64 %150, i64 %151 + %153 = load float, ptr addrspace(1) %152, align 4, !invariant.load !142 + %154 = getelementptr inbounds [222 x float], ptr addrspace(1) %arg31102, i64 0, i64 %150 + %155 = load float, ptr addrspace(1) %154, align 4, !invariant.load !142 + %divide.631.1 = fdiv float %153, %155 + %multiply.1032.1 = fmul float %divide.631.1, %141 + %156 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 5 + %157 = load i8, ptr addrspace(1) %156, align 1, !invariant.load !142 + %158 = sitofp i8 %157 to float + %multiply.1870.1 = fmul float %158, 0x3FC3BF2820000000 + %multiply.5371.1 = fmul float %multiply.1032.1, %multiply.1870.1 + %add.57.i914.1 = fadd float %add.57.i914, %multiply.5371.1 + %.lhs.trunc1045.2 = add nuw nsw i16 %.lhs.trunc1045, 2 + %159 = udiv i16 %.lhs.trunc1045.2, 3 + %160 = mul i16 %159, 3 + %.decomposed1095 = sub i16 %.lhs.trunc1045.2, %160 + %161 = zext i16 %159 to i64 + %162 = zext i16 %.decomposed1095 to i64 + %163 = getelementptr inbounds [222 x [3 x float]], ptr addrspace(1) %arg21100, i64 0, i64 %161, i64 %162 + %164 = load float, ptr addrspace(1) %163, align 4, !invariant.load !142 + %165 = getelementptr inbounds [222 x float], ptr addrspace(1) %arg31102, i64 0, i64 %161 + %166 = load float, ptr addrspace(1) %165, align 4, !invariant.load !142 + %divide.631.2 = fdiv float %164, %166 + %multiply.1032.2 = fmul float %divide.631.2, %141 + %167 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 9 + %168 = load i8, ptr addrspace(1) %167, align 1, !invariant.load !142 + %169 = sitofp i8 %168 to float + %multiply.1870.2 = fmul float %169, 0x3FC3BF2820000000 + %multiply.5371.2 = fmul float %multiply.1032.2, %multiply.1870.2 + %add.57.i914.2 = fadd float %add.57.i914.1, %multiply.5371.2 + %170 = getelementptr inbounds float, ptr addrspace(1) %140, i64 1 + %171 = load float, ptr addrspace(1) %170, align 4, !invariant.load !142 + %multiply.1032.3 = fmul float %divide.631, %171 + %172 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 513 + %173 = load i8, ptr addrspace(1) %172, align 1, !invariant.load !142 + %174 = sitofp i8 %173 to float + %multiply.1870.3 = fmul float %174, 0x3FC3BF2820000000 + %multiply.5371.3 = fmul float %multiply.1032.3, %multiply.1870.3 + %add.57.i914.3 = fadd float %add.57.i914.2, %multiply.5371.3 + %multiply.1032.4 = fmul float %divide.631.1, %171 + %175 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 517 + %176 = load i8, ptr addrspace(1) %175, align 1, !invariant.load !142 + %177 = sitofp i8 %176 to float + %multiply.1870.4 = fmul float %177, 0x3FC3BF2820000000 + %multiply.5371.4 = fmul float %multiply.1032.4, %multiply.1870.4 + %add.57.i914.4 = fadd float %add.57.i914.3, %multiply.5371.4 + %multiply.1032.5 = fmul float %divide.631.2, %171 + %178 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 521 + %179 = load i8, ptr addrspace(1) %178, align 1, !invariant.load !142 + %180 = sitofp i8 %179 to float + %multiply.1870.5 = fmul float %180, 0x3FC3BF2820000000 + %multiply.5371.5 = fmul float %multiply.1032.5, %multiply.1870.5 + %add.57.i914.5 = fadd float %add.57.i914.4, %multiply.5371.5 + %181 = 
getelementptr inbounds float, ptr addrspace(1) %140, i64 2 + %182 = load float, ptr addrspace(1) %181, align 4, !invariant.load !142 + %multiply.1032.6 = fmul float %divide.631, %182 + %183 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1025 + %184 = load i8, ptr addrspace(1) %183, align 1, !invariant.load !142 + %185 = sitofp i8 %184 to float + %multiply.1870.6 = fmul float %185, 0x3FC3BF2820000000 + %multiply.5371.6 = fmul float %multiply.1032.6, %multiply.1870.6 + %add.57.i914.6 = fadd float %add.57.i914.5, %multiply.5371.6 + %multiply.1032.7 = fmul float %divide.631.1, %182 + %186 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1029 + %187 = load i8, ptr addrspace(1) %186, align 1, !invariant.load !142 + %188 = sitofp i8 %187 to float + %multiply.1870.7 = fmul float %188, 0x3FC3BF2820000000 + %multiply.5371.7 = fmul float %multiply.1032.7, %multiply.1870.7 + %add.57.i914.7 = fadd float %add.57.i914.6, %multiply.5371.7 + %multiply.1032.8 = fmul float %divide.631.2, %182 + %189 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1033 + %190 = load i8, ptr addrspace(1) %189, align 1, !invariant.load !142 + %191 = sitofp i8 %190 to float + %multiply.1870.8 = fmul float %191, 0x3FC3BF2820000000 + %multiply.5371.8 = fmul float %multiply.1032.8, %multiply.1870.8 + %add.57.i914.8 = fadd float %add.57.i914.7, %multiply.5371.8 + %192 = fptrunc float %add.57.i914.8 to half + %193 = getelementptr inbounds half, ptr addrspace(1) %100, i64 1 + store half %192, ptr addrspace(1) %193, align 2 + %194 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2 + %195 = load i8, ptr addrspace(1) %194, align 2, !invariant.load !142 + %196 = sitofp i8 %195 to float + %multiply.18122 = fmul float %196, 0x3FC3BF2820000000 + %multiply.53123 = fmul float %multiply.1032, %multiply.18122 + %add.57.i915 = fadd float %multiply.53123, 0.000000e+00 + %197 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 6 + %198 = load i8, ptr addrspace(1) %197, align 2, !invariant.load !142 + %199 = sitofp i8 %198 to float + %multiply.18122.1 = fmul float %199, 0x3FC3BF2820000000 + %multiply.53123.1 = fmul float %multiply.1032.1, %multiply.18122.1 + %add.57.i915.1 = fadd float %add.57.i915, %multiply.53123.1 + %200 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 10 + %201 = load i8, ptr addrspace(1) %200, align 2, !invariant.load !142 + %202 = sitofp i8 %201 to float + %multiply.18122.2 = fmul float %202, 0x3FC3BF2820000000 + %multiply.53123.2 = fmul float %multiply.1032.2, %multiply.18122.2 + %add.57.i915.2 = fadd float %add.57.i915.1, %multiply.53123.2 + %203 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 514 + %204 = load i8, ptr addrspace(1) %203, align 2, !invariant.load !142 + %205 = sitofp i8 %204 to float + %multiply.18122.3 = fmul float %205, 0x3FC3BF2820000000 + %multiply.53123.3 = fmul float %multiply.1032.3, %multiply.18122.3 + %add.57.i915.3 = fadd float %add.57.i915.2, %multiply.53123.3 + %206 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 518 + %207 = load i8, ptr addrspace(1) %206, align 2, !invariant.load !142 + %208 = sitofp i8 %207 to float + %multiply.18122.4 = fmul float %208, 0x3FC3BF2820000000 + %multiply.53123.4 = fmul float %multiply.1032.4, %multiply.18122.4 + %add.57.i915.4 = fadd float %add.57.i915.3, %multiply.53123.4 + %209 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 522 + %210 = load i8, ptr addrspace(1) %209, align 2, !invariant.load !142 + %211 = sitofp i8 %210 to float + %multiply.18122.5 = fmul float %211, 0x3FC3BF2820000000 + 
%multiply.53123.5 = fmul float %multiply.1032.5, %multiply.18122.5 + %add.57.i915.5 = fadd float %add.57.i915.4, %multiply.53123.5 + %212 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1026 + %213 = load i8, ptr addrspace(1) %212, align 2, !invariant.load !142 + %214 = sitofp i8 %213 to float + %multiply.18122.6 = fmul float %214, 0x3FC3BF2820000000 + %multiply.53123.6 = fmul float %multiply.1032.6, %multiply.18122.6 + %add.57.i915.6 = fadd float %add.57.i915.5, %multiply.53123.6 + %215 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1030 + %216 = load i8, ptr addrspace(1) %215, align 2, !invariant.load !142 + %217 = sitofp i8 %216 to float + %multiply.18122.7 = fmul float %217, 0x3FC3BF2820000000 + %multiply.53123.7 = fmul float %multiply.1032.7, %multiply.18122.7 + %add.57.i915.7 = fadd float %add.57.i915.6, %multiply.53123.7 + %218 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1034 + %219 = load i8, ptr addrspace(1) %218, align 2, !invariant.load !142 + %220 = sitofp i8 %219 to float + %multiply.18122.8 = fmul float %220, 0x3FC3BF2820000000 + %multiply.53123.8 = fmul float %multiply.1032.8, %multiply.18122.8 + %add.57.i915.8 = fadd float %add.57.i915.7, %multiply.53123.8 + %221 = fptrunc float %add.57.i915.8 to half + %222 = getelementptr inbounds half, ptr addrspace(1) %100, i64 2 + store half %221, ptr addrspace(1) %222, align 4 + %223 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3 + %224 = load i8, ptr addrspace(1) %223, align 1, !invariant.load !142 + %225 = sitofp i8 %224 to float + %multiply.18174 = fmul float %225, 0x3FC3BF2820000000 + %multiply.53175 = fmul float %multiply.1032, %multiply.18174 + %add.57.i916 = fadd float %multiply.53175, 0.000000e+00 + %226 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 7 + %227 = load i8, ptr addrspace(1) %226, align 1, !invariant.load !142 + %228 = sitofp i8 %227 to float + %multiply.18174.1 = fmul float %228, 0x3FC3BF2820000000 + %multiply.53175.1 = fmul float %multiply.1032.1, %multiply.18174.1 + %add.57.i916.1 = fadd float %add.57.i916, %multiply.53175.1 + %229 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 11 + %230 = load i8, ptr addrspace(1) %229, align 1, !invariant.load !142 + %231 = sitofp i8 %230 to float + %multiply.18174.2 = fmul float %231, 0x3FC3BF2820000000 + %multiply.53175.2 = fmul float %multiply.1032.2, %multiply.18174.2 + %add.57.i916.2 = fadd float %add.57.i916.1, %multiply.53175.2 + %232 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 515 + %233 = load i8, ptr addrspace(1) %232, align 1, !invariant.load !142 + %234 = sitofp i8 %233 to float + %multiply.18174.3 = fmul float %234, 0x3FC3BF2820000000 + %multiply.53175.3 = fmul float %multiply.1032.3, %multiply.18174.3 + %add.57.i916.3 = fadd float %add.57.i916.2, %multiply.53175.3 + %235 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 519 + %236 = load i8, ptr addrspace(1) %235, align 1, !invariant.load !142 + %237 = sitofp i8 %236 to float + %multiply.18174.4 = fmul float %237, 0x3FC3BF2820000000 + %multiply.53175.4 = fmul float %multiply.1032.4, %multiply.18174.4 + %add.57.i916.4 = fadd float %add.57.i916.3, %multiply.53175.4 + %238 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 523 + %239 = load i8, ptr addrspace(1) %238, align 1, !invariant.load !142 + %240 = sitofp i8 %239 to float + %multiply.18174.5 = fmul float %240, 0x3FC3BF2820000000 + %multiply.53175.5 = fmul float %multiply.1032.5, %multiply.18174.5 + %add.57.i916.5 = fadd float %add.57.i916.4, %multiply.53175.5 + %241 = 
getelementptr inbounds i8, ptr addrspace(1) %144, i64 1027 + %242 = load i8, ptr addrspace(1) %241, align 1, !invariant.load !142 + %243 = sitofp i8 %242 to float + %multiply.18174.6 = fmul float %243, 0x3FC3BF2820000000 + %multiply.53175.6 = fmul float %multiply.1032.6, %multiply.18174.6 + %add.57.i916.6 = fadd float %add.57.i916.5, %multiply.53175.6 + %244 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1031 + %245 = load i8, ptr addrspace(1) %244, align 1, !invariant.load !142 + %246 = sitofp i8 %245 to float + %multiply.18174.7 = fmul float %246, 0x3FC3BF2820000000 + %multiply.53175.7 = fmul float %multiply.1032.7, %multiply.18174.7 + %add.57.i916.7 = fadd float %add.57.i916.6, %multiply.53175.7 + %247 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1035 + %248 = load i8, ptr addrspace(1) %247, align 1, !invariant.load !142 + %249 = sitofp i8 %248 to float + %multiply.18174.8 = fmul float %249, 0x3FC3BF2820000000 + %multiply.53175.8 = fmul float %multiply.1032.8, %multiply.18174.8 + %add.57.i916.8 = fadd float %add.57.i916.7, %multiply.53175.8 + %250 = fptrunc float %add.57.i916.8 to half + %251 = getelementptr inbounds half, ptr addrspace(1) %100, i64 3 + store half %250, ptr addrspace(1) %251, align 2 + %252 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1277952 + %253 = load i8, ptr addrspace(1) %252, align 4, !invariant.load !142 + %254 = sitofp i8 %253 to float + %multiply.18226 = fmul float %254, 0x3FC3BF2820000000 + %multiply.53227 = fmul float %multiply.1032, %multiply.18226 + %add.57.i917 = fadd float %multiply.53227, 0.000000e+00 + %255 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1277956 + %256 = load i8, ptr addrspace(1) %255, align 4, !invariant.load !142 + %257 = sitofp i8 %256 to float + %multiply.18226.1 = fmul float %257, 0x3FC3BF2820000000 + %multiply.53227.1 = fmul float %multiply.1032.1, %multiply.18226.1 + %add.57.i917.1 = fadd float %add.57.i917, %multiply.53227.1 + %258 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1277960 + %259 = load i8, ptr addrspace(1) %258, align 4, !invariant.load !142 + %260 = sitofp i8 %259 to float + %multiply.18226.2 = fmul float %260, 0x3FC3BF2820000000 + %multiply.53227.2 = fmul float %multiply.1032.2, %multiply.18226.2 + %add.57.i917.2 = fadd float %add.57.i917.1, %multiply.53227.2 + %261 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278464 + %262 = load i8, ptr addrspace(1) %261, align 4, !invariant.load !142 + %263 = sitofp i8 %262 to float + %multiply.18226.3 = fmul float %263, 0x3FC3BF2820000000 + %multiply.53227.3 = fmul float %multiply.1032.3, %multiply.18226.3 + %add.57.i917.3 = fadd float %add.57.i917.2, %multiply.53227.3 + %264 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278468 + %265 = load i8, ptr addrspace(1) %264, align 4, !invariant.load !142 + %266 = sitofp i8 %265 to float + %multiply.18226.4 = fmul float %266, 0x3FC3BF2820000000 + %multiply.53227.4 = fmul float %multiply.1032.4, %multiply.18226.4 + %add.57.i917.4 = fadd float %add.57.i917.3, %multiply.53227.4 + %267 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278472 + %268 = load i8, ptr addrspace(1) %267, align 4, !invariant.load !142 + %269 = sitofp i8 %268 to float + %multiply.18226.5 = fmul float %269, 0x3FC3BF2820000000 + %multiply.53227.5 = fmul float %multiply.1032.5, %multiply.18226.5 + %add.57.i917.5 = fadd float %add.57.i917.4, %multiply.53227.5 + %270 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278976 + %271 = load i8, ptr addrspace(1) %270, align 4, 
!invariant.load !142 + %272 = sitofp i8 %271 to float + %multiply.18226.6 = fmul float %272, 0x3FC3BF2820000000 + %multiply.53227.6 = fmul float %multiply.1032.6, %multiply.18226.6 + %add.57.i917.6 = fadd float %add.57.i917.5, %multiply.53227.6 + %273 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278980 + %274 = load i8, ptr addrspace(1) %273, align 4, !invariant.load !142 + %275 = sitofp i8 %274 to float + %multiply.18226.7 = fmul float %275, 0x3FC3BF2820000000 + %multiply.53227.7 = fmul float %multiply.1032.7, %multiply.18226.7 + %add.57.i917.7 = fadd float %add.57.i917.6, %multiply.53227.7 + %276 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278984 + %277 = load i8, ptr addrspace(1) %276, align 4, !invariant.load !142 + %278 = sitofp i8 %277 to float + %multiply.18226.8 = fmul float %278, 0x3FC3BF2820000000 + %multiply.53227.8 = fmul float %multiply.1032.8, %multiply.18226.8 + %add.57.i917.8 = fadd float %add.57.i917.7, %multiply.53227.8 + %279 = fptrunc float %add.57.i917.8 to half + %280 = getelementptr inbounds half, ptr addrspace(1) %100, i64 4 + store half %279, ptr addrspace(1) %280, align 8 + %281 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1277953 + %282 = load i8, ptr addrspace(1) %281, align 1, !invariant.load !142 + %283 = sitofp i8 %282 to float + %multiply.18278 = fmul float %283, 0x3FC3BF2820000000 + %multiply.53279 = fmul float %multiply.1032, %multiply.18278 + %add.57.i918 = fadd float %multiply.53279, 0.000000e+00 + %284 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1277957 + %285 = load i8, ptr addrspace(1) %284, align 1, !invariant.load !142 + %286 = sitofp i8 %285 to float + %multiply.18278.1 = fmul float %286, 0x3FC3BF2820000000 + %multiply.53279.1 = fmul float %multiply.1032.1, %multiply.18278.1 + %add.57.i918.1 = fadd float %add.57.i918, %multiply.53279.1 + %287 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1277961 + %288 = load i8, ptr addrspace(1) %287, align 1, !invariant.load !142 + %289 = sitofp i8 %288 to float + %multiply.18278.2 = fmul float %289, 0x3FC3BF2820000000 + %multiply.53279.2 = fmul float %multiply.1032.2, %multiply.18278.2 + %add.57.i918.2 = fadd float %add.57.i918.1, %multiply.53279.2 + %290 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278465 + %291 = load i8, ptr addrspace(1) %290, align 1, !invariant.load !142 + %292 = sitofp i8 %291 to float + %multiply.18278.3 = fmul float %292, 0x3FC3BF2820000000 + %multiply.53279.3 = fmul float %multiply.1032.3, %multiply.18278.3 + %add.57.i918.3 = fadd float %add.57.i918.2, %multiply.53279.3 + %293 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278469 + %294 = load i8, ptr addrspace(1) %293, align 1, !invariant.load !142 + %295 = sitofp i8 %294 to float + %multiply.18278.4 = fmul float %295, 0x3FC3BF2820000000 + %multiply.53279.4 = fmul float %multiply.1032.4, %multiply.18278.4 + %add.57.i918.4 = fadd float %add.57.i918.3, %multiply.53279.4 + %296 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278473 + %297 = load i8, ptr addrspace(1) %296, align 1, !invariant.load !142 + %298 = sitofp i8 %297 to float + %multiply.18278.5 = fmul float %298, 0x3FC3BF2820000000 + %multiply.53279.5 = fmul float %multiply.1032.5, %multiply.18278.5 + %add.57.i918.5 = fadd float %add.57.i918.4, %multiply.53279.5 + %299 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278977 + %300 = load i8, ptr addrspace(1) %299, align 1, !invariant.load !142 + %301 = sitofp i8 %300 to float + %multiply.18278.6 = fmul float %301, 0x3FC3BF2820000000 + 
%multiply.53279.6 = fmul float %multiply.1032.6, %multiply.18278.6 + %add.57.i918.6 = fadd float %add.57.i918.5, %multiply.53279.6 + %302 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278981 + %303 = load i8, ptr addrspace(1) %302, align 1, !invariant.load !142 + %304 = sitofp i8 %303 to float + %multiply.18278.7 = fmul float %304, 0x3FC3BF2820000000 + %multiply.53279.7 = fmul float %multiply.1032.7, %multiply.18278.7 + %add.57.i918.7 = fadd float %add.57.i918.6, %multiply.53279.7 + %305 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278985 + %306 = load i8, ptr addrspace(1) %305, align 1, !invariant.load !142 + %307 = sitofp i8 %306 to float + %multiply.18278.8 = fmul float %307, 0x3FC3BF2820000000 + %multiply.53279.8 = fmul float %multiply.1032.8, %multiply.18278.8 + %add.57.i918.8 = fadd float %add.57.i918.7, %multiply.53279.8 + %308 = fptrunc float %add.57.i918.8 to half + %309 = getelementptr inbounds half, ptr addrspace(1) %100, i64 5 + store half %308, ptr addrspace(1) %309, align 2 + %310 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1277954 + %311 = load i8, ptr addrspace(1) %310, align 2, !invariant.load !142 + %312 = sitofp i8 %311 to float + %multiply.18330 = fmul float %312, 0x3FC3BF2820000000 + %multiply.53331 = fmul float %multiply.1032, %multiply.18330 + %add.57.i919 = fadd float %multiply.53331, 0.000000e+00 + %313 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1277958 + %314 = load i8, ptr addrspace(1) %313, align 2, !invariant.load !142 + %315 = sitofp i8 %314 to float + %multiply.18330.1 = fmul float %315, 0x3FC3BF2820000000 + %multiply.53331.1 = fmul float %multiply.1032.1, %multiply.18330.1 + %add.57.i919.1 = fadd float %add.57.i919, %multiply.53331.1 + %316 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1277962 + %317 = load i8, ptr addrspace(1) %316, align 2, !invariant.load !142 + %318 = sitofp i8 %317 to float + %multiply.18330.2 = fmul float %318, 0x3FC3BF2820000000 + %multiply.53331.2 = fmul float %multiply.1032.2, %multiply.18330.2 + %add.57.i919.2 = fadd float %add.57.i919.1, %multiply.53331.2 + %319 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278466 + %320 = load i8, ptr addrspace(1) %319, align 2, !invariant.load !142 + %321 = sitofp i8 %320 to float + %multiply.18330.3 = fmul float %321, 0x3FC3BF2820000000 + %multiply.53331.3 = fmul float %multiply.1032.3, %multiply.18330.3 + %add.57.i919.3 = fadd float %add.57.i919.2, %multiply.53331.3 + %322 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278470 + %323 = load i8, ptr addrspace(1) %322, align 2, !invariant.load !142 + %324 = sitofp i8 %323 to float + %multiply.18330.4 = fmul float %324, 0x3FC3BF2820000000 + %multiply.53331.4 = fmul float %multiply.1032.4, %multiply.18330.4 + %add.57.i919.4 = fadd float %add.57.i919.3, %multiply.53331.4 + %325 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278474 + %326 = load i8, ptr addrspace(1) %325, align 2, !invariant.load !142 + %327 = sitofp i8 %326 to float + %multiply.18330.5 = fmul float %327, 0x3FC3BF2820000000 + %multiply.53331.5 = fmul float %multiply.1032.5, %multiply.18330.5 + %add.57.i919.5 = fadd float %add.57.i919.4, %multiply.53331.5 + %328 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278978 + %329 = load i8, ptr addrspace(1) %328, align 2, !invariant.load !142 + %330 = sitofp i8 %329 to float + %multiply.18330.6 = fmul float %330, 0x3FC3BF2820000000 + %multiply.53331.6 = fmul float %multiply.1032.6, %multiply.18330.6 + %add.57.i919.6 = fadd float %add.57.i919.5, 
%multiply.53331.6 + %331 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278982 + %332 = load i8, ptr addrspace(1) %331, align 2, !invariant.load !142 + %333 = sitofp i8 %332 to float + %multiply.18330.7 = fmul float %333, 0x3FC3BF2820000000 + %multiply.53331.7 = fmul float %multiply.1032.7, %multiply.18330.7 + %add.57.i919.7 = fadd float %add.57.i919.6, %multiply.53331.7 + %334 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278986 + %335 = load i8, ptr addrspace(1) %334, align 2, !invariant.load !142 + %336 = sitofp i8 %335 to float + %multiply.18330.8 = fmul float %336, 0x3FC3BF2820000000 + %multiply.53331.8 = fmul float %multiply.1032.8, %multiply.18330.8 + %add.57.i919.8 = fadd float %add.57.i919.7, %multiply.53331.8 + %337 = fptrunc float %add.57.i919.8 to half + %338 = getelementptr inbounds half, ptr addrspace(1) %100, i64 6 + store half %337, ptr addrspace(1) %338, align 4 + %339 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1277955 + %340 = load i8, ptr addrspace(1) %339, align 1, !invariant.load !142 + %341 = sitofp i8 %340 to float + %multiply.18382 = fmul float %341, 0x3FC3BF2820000000 + %multiply.53383 = fmul float %multiply.1032, %multiply.18382 + %add.57.i920 = fadd float %multiply.53383, 0.000000e+00 + %342 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1277959 + %343 = load i8, ptr addrspace(1) %342, align 1, !invariant.load !142 + %344 = sitofp i8 %343 to float + %multiply.18382.1 = fmul float %344, 0x3FC3BF2820000000 + %multiply.53383.1 = fmul float %multiply.1032.1, %multiply.18382.1 + %add.57.i920.1 = fadd float %add.57.i920, %multiply.53383.1 + %345 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1277963 + %346 = load i8, ptr addrspace(1) %345, align 1, !invariant.load !142 + %347 = sitofp i8 %346 to float + %multiply.18382.2 = fmul float %347, 0x3FC3BF2820000000 + %multiply.53383.2 = fmul float %multiply.1032.2, %multiply.18382.2 + %add.57.i920.2 = fadd float %add.57.i920.1, %multiply.53383.2 + %348 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278467 + %349 = load i8, ptr addrspace(1) %348, align 1, !invariant.load !142 + %350 = sitofp i8 %349 to float + %multiply.18382.3 = fmul float %350, 0x3FC3BF2820000000 + %multiply.53383.3 = fmul float %multiply.1032.3, %multiply.18382.3 + %add.57.i920.3 = fadd float %add.57.i920.2, %multiply.53383.3 + %351 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278471 + %352 = load i8, ptr addrspace(1) %351, align 1, !invariant.load !142 + %353 = sitofp i8 %352 to float + %multiply.18382.4 = fmul float %353, 0x3FC3BF2820000000 + %multiply.53383.4 = fmul float %multiply.1032.4, %multiply.18382.4 + %add.57.i920.4 = fadd float %add.57.i920.3, %multiply.53383.4 + %354 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278475 + %355 = load i8, ptr addrspace(1) %354, align 1, !invariant.load !142 + %356 = sitofp i8 %355 to float + %multiply.18382.5 = fmul float %356, 0x3FC3BF2820000000 + %multiply.53383.5 = fmul float %multiply.1032.5, %multiply.18382.5 + %add.57.i920.5 = fadd float %add.57.i920.4, %multiply.53383.5 + %357 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278979 + %358 = load i8, ptr addrspace(1) %357, align 1, !invariant.load !142 + %359 = sitofp i8 %358 to float + %multiply.18382.6 = fmul float %359, 0x3FC3BF2820000000 + %multiply.53383.6 = fmul float %multiply.1032.6, %multiply.18382.6 + %add.57.i920.6 = fadd float %add.57.i920.5, %multiply.53383.6 + %360 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278983 + %361 = load i8, ptr 
addrspace(1) %360, align 1, !invariant.load !142 + %362 = sitofp i8 %361 to float + %multiply.18382.7 = fmul float %362, 0x3FC3BF2820000000 + %multiply.53383.7 = fmul float %multiply.1032.7, %multiply.18382.7 + %add.57.i920.7 = fadd float %add.57.i920.6, %multiply.53383.7 + %363 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278987 + %364 = load i8, ptr addrspace(1) %363, align 1, !invariant.load !142 + %365 = sitofp i8 %364 to float + %multiply.18382.8 = fmul float %365, 0x3FC3BF2820000000 + %multiply.53383.8 = fmul float %multiply.1032.8, %multiply.18382.8 + %add.57.i920.8 = fadd float %add.57.i920.7, %multiply.53383.8 + %366 = fptrunc float %add.57.i920.8 to half + %367 = getelementptr inbounds half, ptr addrspace(1) %100, i64 7 + store half %366, ptr addrspace(1) %367, align 2 + %368 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2555904 + %369 = load i8, ptr addrspace(1) %368, align 4, !invariant.load !142 + %370 = sitofp i8 %369 to float + %multiply.18434 = fmul float %370, 0x3FC3BF2820000000 + %multiply.53435 = fmul float %multiply.1032, %multiply.18434 + %add.57.i921 = fadd float %multiply.53435, 0.000000e+00 + %371 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2555908 + %372 = load i8, ptr addrspace(1) %371, align 4, !invariant.load !142 + %373 = sitofp i8 %372 to float + %multiply.18434.1 = fmul float %373, 0x3FC3BF2820000000 + %multiply.53435.1 = fmul float %multiply.1032.1, %multiply.18434.1 + %add.57.i921.1 = fadd float %add.57.i921, %multiply.53435.1 + %374 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2555912 + %375 = load i8, ptr addrspace(1) %374, align 4, !invariant.load !142 + %376 = sitofp i8 %375 to float + %multiply.18434.2 = fmul float %376, 0x3FC3BF2820000000 + %multiply.53435.2 = fmul float %multiply.1032.2, %multiply.18434.2 + %add.57.i921.2 = fadd float %add.57.i921.1, %multiply.53435.2 + %377 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556416 + %378 = load i8, ptr addrspace(1) %377, align 4, !invariant.load !142 + %379 = sitofp i8 %378 to float + %multiply.18434.3 = fmul float %379, 0x3FC3BF2820000000 + %multiply.53435.3 = fmul float %multiply.1032.3, %multiply.18434.3 + %add.57.i921.3 = fadd float %add.57.i921.2, %multiply.53435.3 + %380 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556420 + %381 = load i8, ptr addrspace(1) %380, align 4, !invariant.load !142 + %382 = sitofp i8 %381 to float + %multiply.18434.4 = fmul float %382, 0x3FC3BF2820000000 + %multiply.53435.4 = fmul float %multiply.1032.4, %multiply.18434.4 + %add.57.i921.4 = fadd float %add.57.i921.3, %multiply.53435.4 + %383 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556424 + %384 = load i8, ptr addrspace(1) %383, align 4, !invariant.load !142 + %385 = sitofp i8 %384 to float + %multiply.18434.5 = fmul float %385, 0x3FC3BF2820000000 + %multiply.53435.5 = fmul float %multiply.1032.5, %multiply.18434.5 + %add.57.i921.5 = fadd float %add.57.i921.4, %multiply.53435.5 + %386 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556928 + %387 = load i8, ptr addrspace(1) %386, align 4, !invariant.load !142 + %388 = sitofp i8 %387 to float + %multiply.18434.6 = fmul float %388, 0x3FC3BF2820000000 + %multiply.53435.6 = fmul float %multiply.1032.6, %multiply.18434.6 + %add.57.i921.6 = fadd float %add.57.i921.5, %multiply.53435.6 + %389 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556932 + %390 = load i8, ptr addrspace(1) %389, align 4, !invariant.load !142 + %391 = sitofp i8 %390 to float + %multiply.18434.7 = fmul 
float %391, 0x3FC3BF2820000000 + %multiply.53435.7 = fmul float %multiply.1032.7, %multiply.18434.7 + %add.57.i921.7 = fadd float %add.57.i921.6, %multiply.53435.7 + %392 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556936 + %393 = load i8, ptr addrspace(1) %392, align 4, !invariant.load !142 + %394 = sitofp i8 %393 to float + %multiply.18434.8 = fmul float %394, 0x3FC3BF2820000000 + %multiply.53435.8 = fmul float %multiply.1032.8, %multiply.18434.8 + %add.57.i921.8 = fadd float %add.57.i921.7, %multiply.53435.8 + %395 = fptrunc float %add.57.i921.8 to half + %396 = getelementptr inbounds half, ptr addrspace(1) %100, i64 8 + store half %395, ptr addrspace(1) %396, align 16 + %397 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2555905 + %398 = load i8, ptr addrspace(1) %397, align 1, !invariant.load !142 + %399 = sitofp i8 %398 to float + %multiply.18486 = fmul float %399, 0x3FC3BF2820000000 + %multiply.53487 = fmul float %multiply.1032, %multiply.18486 + %add.57.i922 = fadd float %multiply.53487, 0.000000e+00 + %400 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2555909 + %401 = load i8, ptr addrspace(1) %400, align 1, !invariant.load !142 + %402 = sitofp i8 %401 to float + %multiply.18486.1 = fmul float %402, 0x3FC3BF2820000000 + %multiply.53487.1 = fmul float %multiply.1032.1, %multiply.18486.1 + %add.57.i922.1 = fadd float %add.57.i922, %multiply.53487.1 + %403 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2555913 + %404 = load i8, ptr addrspace(1) %403, align 1, !invariant.load !142 + %405 = sitofp i8 %404 to float + %multiply.18486.2 = fmul float %405, 0x3FC3BF2820000000 + %multiply.53487.2 = fmul float %multiply.1032.2, %multiply.18486.2 + %add.57.i922.2 = fadd float %add.57.i922.1, %multiply.53487.2 + %406 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556417 + %407 = load i8, ptr addrspace(1) %406, align 1, !invariant.load !142 + %408 = sitofp i8 %407 to float + %multiply.18486.3 = fmul float %408, 0x3FC3BF2820000000 + %multiply.53487.3 = fmul float %multiply.1032.3, %multiply.18486.3 + %add.57.i922.3 = fadd float %add.57.i922.2, %multiply.53487.3 + %409 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556421 + %410 = load i8, ptr addrspace(1) %409, align 1, !invariant.load !142 + %411 = sitofp i8 %410 to float + %multiply.18486.4 = fmul float %411, 0x3FC3BF2820000000 + %multiply.53487.4 = fmul float %multiply.1032.4, %multiply.18486.4 + %add.57.i922.4 = fadd float %add.57.i922.3, %multiply.53487.4 + %412 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556425 + %413 = load i8, ptr addrspace(1) %412, align 1, !invariant.load !142 + %414 = sitofp i8 %413 to float + %multiply.18486.5 = fmul float %414, 0x3FC3BF2820000000 + %multiply.53487.5 = fmul float %multiply.1032.5, %multiply.18486.5 + %add.57.i922.5 = fadd float %add.57.i922.4, %multiply.53487.5 + %415 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556929 + %416 = load i8, ptr addrspace(1) %415, align 1, !invariant.load !142 + %417 = sitofp i8 %416 to float + %multiply.18486.6 = fmul float %417, 0x3FC3BF2820000000 + %multiply.53487.6 = fmul float %multiply.1032.6, %multiply.18486.6 + %add.57.i922.6 = fadd float %add.57.i922.5, %multiply.53487.6 + %418 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556933 + %419 = load i8, ptr addrspace(1) %418, align 1, !invariant.load !142 + %420 = sitofp i8 %419 to float + %multiply.18486.7 = fmul float %420, 0x3FC3BF2820000000 + %multiply.53487.7 = fmul float %multiply.1032.7, %multiply.18486.7 + 
%add.57.i922.7 = fadd float %add.57.i922.6, %multiply.53487.7 + %421 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556937 + %422 = load i8, ptr addrspace(1) %421, align 1, !invariant.load !142 + %423 = sitofp i8 %422 to float + %multiply.18486.8 = fmul float %423, 0x3FC3BF2820000000 + %multiply.53487.8 = fmul float %multiply.1032.8, %multiply.18486.8 + %add.57.i922.8 = fadd float %add.57.i922.7, %multiply.53487.8 + %424 = fptrunc float %add.57.i922.8 to half + %425 = getelementptr inbounds half, ptr addrspace(1) %100, i64 9 + store half %424, ptr addrspace(1) %425, align 2 + %426 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2555906 + %427 = load i8, ptr addrspace(1) %426, align 2, !invariant.load !142 + %428 = sitofp i8 %427 to float + %multiply.18538 = fmul float %428, 0x3FC3BF2820000000 + %multiply.53539 = fmul float %multiply.1032, %multiply.18538 + %add.57.i923 = fadd float %multiply.53539, 0.000000e+00 + %429 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2555910 + %430 = load i8, ptr addrspace(1) %429, align 2, !invariant.load !142 + %431 = sitofp i8 %430 to float + %multiply.18538.1 = fmul float %431, 0x3FC3BF2820000000 + %multiply.53539.1 = fmul float %multiply.1032.1, %multiply.18538.1 + %add.57.i923.1 = fadd float %add.57.i923, %multiply.53539.1 + %432 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2555914 + %433 = load i8, ptr addrspace(1) %432, align 2, !invariant.load !142 + %434 = sitofp i8 %433 to float + %multiply.18538.2 = fmul float %434, 0x3FC3BF2820000000 + %multiply.53539.2 = fmul float %multiply.1032.2, %multiply.18538.2 + %add.57.i923.2 = fadd float %add.57.i923.1, %multiply.53539.2 + %435 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556418 + %436 = load i8, ptr addrspace(1) %435, align 2, !invariant.load !142 + %437 = sitofp i8 %436 to float + %multiply.18538.3 = fmul float %437, 0x3FC3BF2820000000 + %multiply.53539.3 = fmul float %multiply.1032.3, %multiply.18538.3 + %add.57.i923.3 = fadd float %add.57.i923.2, %multiply.53539.3 + %438 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556422 + %439 = load i8, ptr addrspace(1) %438, align 2, !invariant.load !142 + %440 = sitofp i8 %439 to float + %multiply.18538.4 = fmul float %440, 0x3FC3BF2820000000 + %multiply.53539.4 = fmul float %multiply.1032.4, %multiply.18538.4 + %add.57.i923.4 = fadd float %add.57.i923.3, %multiply.53539.4 + %441 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556426 + %442 = load i8, ptr addrspace(1) %441, align 2, !invariant.load !142 + %443 = sitofp i8 %442 to float + %multiply.18538.5 = fmul float %443, 0x3FC3BF2820000000 + %multiply.53539.5 = fmul float %multiply.1032.5, %multiply.18538.5 + %add.57.i923.5 = fadd float %add.57.i923.4, %multiply.53539.5 + %444 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556930 + %445 = load i8, ptr addrspace(1) %444, align 2, !invariant.load !142 + %446 = sitofp i8 %445 to float + %multiply.18538.6 = fmul float %446, 0x3FC3BF2820000000 + %multiply.53539.6 = fmul float %multiply.1032.6, %multiply.18538.6 + %add.57.i923.6 = fadd float %add.57.i923.5, %multiply.53539.6 + %447 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556934 + %448 = load i8, ptr addrspace(1) %447, align 2, !invariant.load !142 + %449 = sitofp i8 %448 to float + %multiply.18538.7 = fmul float %449, 0x3FC3BF2820000000 + %multiply.53539.7 = fmul float %multiply.1032.7, %multiply.18538.7 + %add.57.i923.7 = fadd float %add.57.i923.6, %multiply.53539.7 + %450 = getelementptr inbounds i8, ptr 
addrspace(1) %144, i64 2556938 + %451 = load i8, ptr addrspace(1) %450, align 2, !invariant.load !142 + %452 = sitofp i8 %451 to float + %multiply.18538.8 = fmul float %452, 0x3FC3BF2820000000 + %multiply.53539.8 = fmul float %multiply.1032.8, %multiply.18538.8 + %add.57.i923.8 = fadd float %add.57.i923.7, %multiply.53539.8 + %453 = fptrunc float %add.57.i923.8 to half + %454 = getelementptr inbounds half, ptr addrspace(1) %100, i64 10 + store half %453, ptr addrspace(1) %454, align 4 + %455 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2555907 + %456 = load i8, ptr addrspace(1) %455, align 1, !invariant.load !142 + %457 = sitofp i8 %456 to float + %multiply.18590 = fmul float %457, 0x3FC3BF2820000000 + %multiply.53591 = fmul float %multiply.1032, %multiply.18590 + %add.57.i924 = fadd float %multiply.53591, 0.000000e+00 + %458 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2555911 + %459 = load i8, ptr addrspace(1) %458, align 1, !invariant.load !142 + %460 = sitofp i8 %459 to float + %multiply.18590.1 = fmul float %460, 0x3FC3BF2820000000 + %multiply.53591.1 = fmul float %multiply.1032.1, %multiply.18590.1 + %add.57.i924.1 = fadd float %add.57.i924, %multiply.53591.1 + %461 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2555915 + %462 = load i8, ptr addrspace(1) %461, align 1, !invariant.load !142 + %463 = sitofp i8 %462 to float + %multiply.18590.2 = fmul float %463, 0x3FC3BF2820000000 + %multiply.53591.2 = fmul float %multiply.1032.2, %multiply.18590.2 + %add.57.i924.2 = fadd float %add.57.i924.1, %multiply.53591.2 + %464 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556419 + %465 = load i8, ptr addrspace(1) %464, align 1, !invariant.load !142 + %466 = sitofp i8 %465 to float + %multiply.18590.3 = fmul float %466, 0x3FC3BF2820000000 + %multiply.53591.3 = fmul float %multiply.1032.3, %multiply.18590.3 + %add.57.i924.3 = fadd float %add.57.i924.2, %multiply.53591.3 + %467 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556423 + %468 = load i8, ptr addrspace(1) %467, align 1, !invariant.load !142 + %469 = sitofp i8 %468 to float + %multiply.18590.4 = fmul float %469, 0x3FC3BF2820000000 + %multiply.53591.4 = fmul float %multiply.1032.4, %multiply.18590.4 + %add.57.i924.4 = fadd float %add.57.i924.3, %multiply.53591.4 + %470 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556427 + %471 = load i8, ptr addrspace(1) %470, align 1, !invariant.load !142 + %472 = sitofp i8 %471 to float + %multiply.18590.5 = fmul float %472, 0x3FC3BF2820000000 + %multiply.53591.5 = fmul float %multiply.1032.5, %multiply.18590.5 + %add.57.i924.5 = fadd float %add.57.i924.4, %multiply.53591.5 + %473 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556931 + %474 = load i8, ptr addrspace(1) %473, align 1, !invariant.load !142 + %475 = sitofp i8 %474 to float + %multiply.18590.6 = fmul float %475, 0x3FC3BF2820000000 + %multiply.53591.6 = fmul float %multiply.1032.6, %multiply.18590.6 + %add.57.i924.6 = fadd float %add.57.i924.5, %multiply.53591.6 + %476 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556935 + %477 = load i8, ptr addrspace(1) %476, align 1, !invariant.load !142 + %478 = sitofp i8 %477 to float + %multiply.18590.7 = fmul float %478, 0x3FC3BF2820000000 + %multiply.53591.7 = fmul float %multiply.1032.7, %multiply.18590.7 + %add.57.i924.7 = fadd float %add.57.i924.6, %multiply.53591.7 + %479 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556939 + %480 = load i8, ptr addrspace(1) %479, align 1, !invariant.load !142 + %481 = 
sitofp i8 %480 to float + %multiply.18590.8 = fmul float %481, 0x3FC3BF2820000000 + %multiply.53591.8 = fmul float %multiply.1032.8, %multiply.18590.8 + %add.57.i924.8 = fadd float %add.57.i924.7, %multiply.53591.8 + %482 = fptrunc float %add.57.i924.8 to half + %483 = getelementptr inbounds half, ptr addrspace(1) %100, i64 11 + store half %482, ptr addrspace(1) %483, align 2 + %484 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3833856 + %485 = load i8, ptr addrspace(1) %484, align 4, !invariant.load !142 + %486 = sitofp i8 %485 to float + %multiply.18642 = fmul float %486, 0x3FC3BF2820000000 + %multiply.53643 = fmul float %multiply.1032, %multiply.18642 + %add.57.i925 = fadd float %multiply.53643, 0.000000e+00 + %487 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3833860 + %488 = load i8, ptr addrspace(1) %487, align 4, !invariant.load !142 + %489 = sitofp i8 %488 to float + %multiply.18642.1 = fmul float %489, 0x3FC3BF2820000000 + %multiply.53643.1 = fmul float %multiply.1032.1, %multiply.18642.1 + %add.57.i925.1 = fadd float %add.57.i925, %multiply.53643.1 + %490 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3833864 + %491 = load i8, ptr addrspace(1) %490, align 4, !invariant.load !142 + %492 = sitofp i8 %491 to float + %multiply.18642.2 = fmul float %492, 0x3FC3BF2820000000 + %multiply.53643.2 = fmul float %multiply.1032.2, %multiply.18642.2 + %add.57.i925.2 = fadd float %add.57.i925.1, %multiply.53643.2 + %493 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834368 + %494 = load i8, ptr addrspace(1) %493, align 4, !invariant.load !142 + %495 = sitofp i8 %494 to float + %multiply.18642.3 = fmul float %495, 0x3FC3BF2820000000 + %multiply.53643.3 = fmul float %multiply.1032.3, %multiply.18642.3 + %add.57.i925.3 = fadd float %add.57.i925.2, %multiply.53643.3 + %496 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834372 + %497 = load i8, ptr addrspace(1) %496, align 4, !invariant.load !142 + %498 = sitofp i8 %497 to float + %multiply.18642.4 = fmul float %498, 0x3FC3BF2820000000 + %multiply.53643.4 = fmul float %multiply.1032.4, %multiply.18642.4 + %add.57.i925.4 = fadd float %add.57.i925.3, %multiply.53643.4 + %499 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834376 + %500 = load i8, ptr addrspace(1) %499, align 4, !invariant.load !142 + %501 = sitofp i8 %500 to float + %multiply.18642.5 = fmul float %501, 0x3FC3BF2820000000 + %multiply.53643.5 = fmul float %multiply.1032.5, %multiply.18642.5 + %add.57.i925.5 = fadd float %add.57.i925.4, %multiply.53643.5 + %502 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834880 + %503 = load i8, ptr addrspace(1) %502, align 4, !invariant.load !142 + %504 = sitofp i8 %503 to float + %multiply.18642.6 = fmul float %504, 0x3FC3BF2820000000 + %multiply.53643.6 = fmul float %multiply.1032.6, %multiply.18642.6 + %add.57.i925.6 = fadd float %add.57.i925.5, %multiply.53643.6 + %505 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834884 + %506 = load i8, ptr addrspace(1) %505, align 4, !invariant.load !142 + %507 = sitofp i8 %506 to float + %multiply.18642.7 = fmul float %507, 0x3FC3BF2820000000 + %multiply.53643.7 = fmul float %multiply.1032.7, %multiply.18642.7 + %add.57.i925.7 = fadd float %add.57.i925.6, %multiply.53643.7 + %508 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834888 + %509 = load i8, ptr addrspace(1) %508, align 4, !invariant.load !142 + %510 = sitofp i8 %509 to float + %multiply.18642.8 = fmul float %510, 0x3FC3BF2820000000 + %multiply.53643.8 = fmul 
float %multiply.1032.8, %multiply.18642.8 + %add.57.i925.8 = fadd float %add.57.i925.7, %multiply.53643.8 + %511 = fptrunc float %add.57.i925.8 to half + %512 = getelementptr inbounds half, ptr addrspace(1) %100, i64 12 + store half %511, ptr addrspace(1) %512, align 8 + %513 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3833857 + %514 = load i8, ptr addrspace(1) %513, align 1, !invariant.load !142 + %515 = sitofp i8 %514 to float + %multiply.18694 = fmul float %515, 0x3FC3BF2820000000 + %multiply.53695 = fmul float %multiply.1032, %multiply.18694 + %add.57.i926 = fadd float %multiply.53695, 0.000000e+00 + %516 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3833861 + %517 = load i8, ptr addrspace(1) %516, align 1, !invariant.load !142 + %518 = sitofp i8 %517 to float + %multiply.18694.1 = fmul float %518, 0x3FC3BF2820000000 + %multiply.53695.1 = fmul float %multiply.1032.1, %multiply.18694.1 + %add.57.i926.1 = fadd float %add.57.i926, %multiply.53695.1 + %519 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3833865 + %520 = load i8, ptr addrspace(1) %519, align 1, !invariant.load !142 + %521 = sitofp i8 %520 to float + %multiply.18694.2 = fmul float %521, 0x3FC3BF2820000000 + %multiply.53695.2 = fmul float %multiply.1032.2, %multiply.18694.2 + %add.57.i926.2 = fadd float %add.57.i926.1, %multiply.53695.2 + %522 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834369 + %523 = load i8, ptr addrspace(1) %522, align 1, !invariant.load !142 + %524 = sitofp i8 %523 to float + %multiply.18694.3 = fmul float %524, 0x3FC3BF2820000000 + %multiply.53695.3 = fmul float %multiply.1032.3, %multiply.18694.3 + %add.57.i926.3 = fadd float %add.57.i926.2, %multiply.53695.3 + %525 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834373 + %526 = load i8, ptr addrspace(1) %525, align 1, !invariant.load !142 + %527 = sitofp i8 %526 to float + %multiply.18694.4 = fmul float %527, 0x3FC3BF2820000000 + %multiply.53695.4 = fmul float %multiply.1032.4, %multiply.18694.4 + %add.57.i926.4 = fadd float %add.57.i926.3, %multiply.53695.4 + %528 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834377 + %529 = load i8, ptr addrspace(1) %528, align 1, !invariant.load !142 + %530 = sitofp i8 %529 to float + %multiply.18694.5 = fmul float %530, 0x3FC3BF2820000000 + %multiply.53695.5 = fmul float %multiply.1032.5, %multiply.18694.5 + %add.57.i926.5 = fadd float %add.57.i926.4, %multiply.53695.5 + %531 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834881 + %532 = load i8, ptr addrspace(1) %531, align 1, !invariant.load !142 + %533 = sitofp i8 %532 to float + %multiply.18694.6 = fmul float %533, 0x3FC3BF2820000000 + %multiply.53695.6 = fmul float %multiply.1032.6, %multiply.18694.6 + %add.57.i926.6 = fadd float %add.57.i926.5, %multiply.53695.6 + %534 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834885 + %535 = load i8, ptr addrspace(1) %534, align 1, !invariant.load !142 + %536 = sitofp i8 %535 to float + %multiply.18694.7 = fmul float %536, 0x3FC3BF2820000000 + %multiply.53695.7 = fmul float %multiply.1032.7, %multiply.18694.7 + %add.57.i926.7 = fadd float %add.57.i926.6, %multiply.53695.7 + %537 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834889 + %538 = load i8, ptr addrspace(1) %537, align 1, !invariant.load !142 + %539 = sitofp i8 %538 to float + %multiply.18694.8 = fmul float %539, 0x3FC3BF2820000000 + %multiply.53695.8 = fmul float %multiply.1032.8, %multiply.18694.8 + %add.57.i926.8 = fadd float %add.57.i926.7, %multiply.53695.8 + %540 
= fptrunc float %add.57.i926.8 to half + %541 = getelementptr inbounds half, ptr addrspace(1) %100, i64 13 + store half %540, ptr addrspace(1) %541, align 2 + %542 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3833858 + %543 = load i8, ptr addrspace(1) %542, align 2, !invariant.load !142 + %544 = sitofp i8 %543 to float + %multiply.18746 = fmul float %544, 0x3FC3BF2820000000 + %multiply.53747 = fmul float %multiply.1032, %multiply.18746 + %add.57.i927 = fadd float %multiply.53747, 0.000000e+00 + %545 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3833862 + %546 = load i8, ptr addrspace(1) %545, align 2, !invariant.load !142 + %547 = sitofp i8 %546 to float + %multiply.18746.1 = fmul float %547, 0x3FC3BF2820000000 + %multiply.53747.1 = fmul float %multiply.1032.1, %multiply.18746.1 + %add.57.i927.1 = fadd float %add.57.i927, %multiply.53747.1 + %548 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3833866 + %549 = load i8, ptr addrspace(1) %548, align 2, !invariant.load !142 + %550 = sitofp i8 %549 to float + %multiply.18746.2 = fmul float %550, 0x3FC3BF2820000000 + %multiply.53747.2 = fmul float %multiply.1032.2, %multiply.18746.2 + %add.57.i927.2 = fadd float %add.57.i927.1, %multiply.53747.2 + %551 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834370 + %552 = load i8, ptr addrspace(1) %551, align 2, !invariant.load !142 + %553 = sitofp i8 %552 to float + %multiply.18746.3 = fmul float %553, 0x3FC3BF2820000000 + %multiply.53747.3 = fmul float %multiply.1032.3, %multiply.18746.3 + %add.57.i927.3 = fadd float %add.57.i927.2, %multiply.53747.3 + %554 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834374 + %555 = load i8, ptr addrspace(1) %554, align 2, !invariant.load !142 + %556 = sitofp i8 %555 to float + %multiply.18746.4 = fmul float %556, 0x3FC3BF2820000000 + %multiply.53747.4 = fmul float %multiply.1032.4, %multiply.18746.4 + %add.57.i927.4 = fadd float %add.57.i927.3, %multiply.53747.4 + %557 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834378 + %558 = load i8, ptr addrspace(1) %557, align 2, !invariant.load !142 + %559 = sitofp i8 %558 to float + %multiply.18746.5 = fmul float %559, 0x3FC3BF2820000000 + %multiply.53747.5 = fmul float %multiply.1032.5, %multiply.18746.5 + %add.57.i927.5 = fadd float %add.57.i927.4, %multiply.53747.5 + %560 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834882 + %561 = load i8, ptr addrspace(1) %560, align 2, !invariant.load !142 + %562 = sitofp i8 %561 to float + %multiply.18746.6 = fmul float %562, 0x3FC3BF2820000000 + %multiply.53747.6 = fmul float %multiply.1032.6, %multiply.18746.6 + %add.57.i927.6 = fadd float %add.57.i927.5, %multiply.53747.6 + %563 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834886 + %564 = load i8, ptr addrspace(1) %563, align 2, !invariant.load !142 + %565 = sitofp i8 %564 to float + %multiply.18746.7 = fmul float %565, 0x3FC3BF2820000000 + %multiply.53747.7 = fmul float %multiply.1032.7, %multiply.18746.7 + %add.57.i927.7 = fadd float %add.57.i927.6, %multiply.53747.7 + %566 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834890 + %567 = load i8, ptr addrspace(1) %566, align 2, !invariant.load !142 + %568 = sitofp i8 %567 to float + %multiply.18746.8 = fmul float %568, 0x3FC3BF2820000000 + %multiply.53747.8 = fmul float %multiply.1032.8, %multiply.18746.8 + %add.57.i927.8 = fadd float %add.57.i927.7, %multiply.53747.8 + %569 = fptrunc float %add.57.i927.8 to half + %570 = getelementptr inbounds half, ptr addrspace(1) %100, i64 14 + 
store half %569, ptr addrspace(1) %570, align 4 + %571 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3833859 + %572 = load i8, ptr addrspace(1) %571, align 1, !invariant.load !142 + %573 = sitofp i8 %572 to float + %multiply.18798 = fmul float %573, 0x3FC3BF2820000000 + %multiply.53799 = fmul float %multiply.1032, %multiply.18798 + %add.57.i928 = fadd float %multiply.53799, 0.000000e+00 + %574 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3833863 + %575 = load i8, ptr addrspace(1) %574, align 1, !invariant.load !142 + %576 = sitofp i8 %575 to float + %multiply.18798.1 = fmul float %576, 0x3FC3BF2820000000 + %multiply.53799.1 = fmul float %multiply.1032.1, %multiply.18798.1 + %add.57.i928.1 = fadd float %add.57.i928, %multiply.53799.1 + %577 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3833867 + %578 = load i8, ptr addrspace(1) %577, align 1, !invariant.load !142 + %579 = sitofp i8 %578 to float + %multiply.18798.2 = fmul float %579, 0x3FC3BF2820000000 + %multiply.53799.2 = fmul float %multiply.1032.2, %multiply.18798.2 + %add.57.i928.2 = fadd float %add.57.i928.1, %multiply.53799.2 + %580 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834371 + %581 = load i8, ptr addrspace(1) %580, align 1, !invariant.load !142 + %582 = sitofp i8 %581 to float + %multiply.18798.3 = fmul float %582, 0x3FC3BF2820000000 + %multiply.53799.3 = fmul float %multiply.1032.3, %multiply.18798.3 + %add.57.i928.3 = fadd float %add.57.i928.2, %multiply.53799.3 + %583 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834375 + %584 = load i8, ptr addrspace(1) %583, align 1, !invariant.load !142 + %585 = sitofp i8 %584 to float + %multiply.18798.4 = fmul float %585, 0x3FC3BF2820000000 + %multiply.53799.4 = fmul float %multiply.1032.4, %multiply.18798.4 + %add.57.i928.4 = fadd float %add.57.i928.3, %multiply.53799.4 + %586 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834379 + %587 = load i8, ptr addrspace(1) %586, align 1, !invariant.load !142 + %588 = sitofp i8 %587 to float + %multiply.18798.5 = fmul float %588, 0x3FC3BF2820000000 + %multiply.53799.5 = fmul float %multiply.1032.5, %multiply.18798.5 + %add.57.i928.5 = fadd float %add.57.i928.4, %multiply.53799.5 + %589 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834883 + %590 = load i8, ptr addrspace(1) %589, align 1, !invariant.load !142 + %591 = sitofp i8 %590 to float + %multiply.18798.6 = fmul float %591, 0x3FC3BF2820000000 + %multiply.53799.6 = fmul float %multiply.1032.6, %multiply.18798.6 + %add.57.i928.6 = fadd float %add.57.i928.5, %multiply.53799.6 + %592 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834887 + %593 = load i8, ptr addrspace(1) %592, align 1, !invariant.load !142 + %594 = sitofp i8 %593 to float + %multiply.18798.7 = fmul float %594, 0x3FC3BF2820000000 + %multiply.53799.7 = fmul float %multiply.1032.7, %multiply.18798.7 + %add.57.i928.7 = fadd float %add.57.i928.6, %multiply.53799.7 + %595 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834891 + %596 = load i8, ptr addrspace(1) %595, align 1, !invariant.load !142 + %597 = sitofp i8 %596 to float + %multiply.18798.8 = fmul float %597, 0x3FC3BF2820000000 + %multiply.53799.8 = fmul float %multiply.1032.8, %multiply.18798.8 + %add.57.i928.8 = fadd float %add.57.i928.7, %multiply.53799.8 + %598 = fptrunc float %add.57.i928.8 to half + %599 = getelementptr inbounds half, ptr addrspace(1) %100, i64 15 + store half %598, ptr addrspace(1) %599, align 2 + ret void +} + +attributes #0 = { nocallback nofree nosync 
nounwind speculatable willreturn memory(none) } +attributes #1 = { mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite) } + +!140 = !{i32 0, i32 8658} +!141 = !{i32 0, i32 64} +!142 = !{} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/overlapping_chains.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/overlapping_chains.ll new file mode 100644 index 000000000000..e38abec4a686 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/overlapping_chains.ll @@ -0,0 +1,17 @@ +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s + +; CHECK-LABEL: @overlapping_stores +; CHECK: store i16 +; CHECK: store i16 +; CHECK: store i16 +define void @overlapping_stores(ptr nocapture align 2 %ptr) { + %ptr0 = getelementptr i16, ptr %ptr, i64 0 + %ptr1 = getelementptr i8, ptr %ptr, i64 1 + %ptr2 = getelementptr i16, ptr %ptr, i64 1 + + store i16 0, ptr %ptr0, align 2 + store i16 0, ptr %ptr1, align 1 + store i16 0, ptr %ptr2, align 2 + + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i1.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i1.ll new file mode 100644 index 000000000000..6d2a462e2532 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i1.ll @@ -0,0 +1,33 @@ +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s + +define void @i1x8(ptr nocapture align 4 %ptr) { + %ptr0 = getelementptr i8, ptr %ptr, i64 0 + %ptr1 = getelementptr i8, ptr %ptr, i64 1 + %ptr2 = getelementptr i8, ptr %ptr, i64 2 + %ptr3 = getelementptr i8, ptr %ptr, i64 3 + + %l0 = load <8 x i1>, ptr %ptr0, align 4 + %l1 = load <8 x i1>, ptr %ptr1, align 1 + %l2 = load <8 x i1>, ptr %ptr2, align 2 + %l3 = load <8 x i1>, ptr %ptr3, align 1 + + ret void + +; CHECK-LABEL: @i1x8 +; CHECK-DAG: load <32 x i1> +} + +define void @i1x8x16x8(ptr nocapture align 4 %ptr) { + %ptr0 = getelementptr i8, ptr %ptr, i64 0 + %ptr1 = getelementptr i8, ptr %ptr, i64 1 + %ptr2 = getelementptr i8, ptr %ptr, i64 3 + + %l0 = load <8 x i1>, ptr %ptr0, align 4 + %l2 = load <16 x i1>, ptr %ptr1, align 1 + %l3 = load <8 x i1>, ptr %ptr2, align 1 + + ret void + +; CHECK-LABEL: @i1x8x16x8 +; CHECK-DAG: load <32 x i1> +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i16.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i16.ll new file mode 100644 index 000000000000..8b7537680f89 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i16.ll @@ -0,0 +1,17 @@ +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s + +; CHECK-LABEL: @int16x2 +; CHECK: load <2 x i16> +; CHECK: store <2 x i16> +define void @int16x2(ptr nocapture align 4 %ptr) { + %ptr0 = getelementptr i16, ptr %ptr, i64 0 + %ptr1 = getelementptr i16, ptr %ptr, i64 1 + + %l0 = load i16, ptr %ptr0, align 4 + %l1 = load i16, ptr %ptr1, align 2 + + store i16 %l1, ptr %ptr0, align 4 + store i16 %l0, ptr %ptr1, align 2 + + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i24.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i24.ll new file mode 100644 index 000000000000..c90301d86896 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i24.ll @@ -0,0 +1,21 @@ +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s + +; We don't need to vectorize this. Just make sure it doesn't crash. 
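+; (Presumably a pair of i24 accesses cannot be combined into any legal vector width here, so the scalar loads and stores below are expected to remain.)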
+ +; CHECK-LABEL: @int24x2 +; CHECK: load i24 +; CHECK: load i24 +; CHECK: store i24 +; CHECK: store i24 +define void @int24x2(ptr nocapture align 4 %ptr) { + %ptr0 = getelementptr i24, ptr %ptr, i64 0 + %ptr1 = getelementptr i24, ptr %ptr, i64 1 + + %l0 = load i24, ptr %ptr0, align 4 + %l1 = load i24, ptr %ptr1, align 1 + + store i24 %l1, ptr %ptr0, align 4 + store i24 %l0, ptr %ptr1, align 1 + + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll index 387d6789a4c4..2d3c289c2a12 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll @@ -1,4 +1,3 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s ; Vectorize and emit valid code (Issue #54896). @@ -41,8 +40,10 @@ define void @int8x3a4(ptr nocapture align 4 %ptr) { ret void ; CHECK-LABEL: @int8x3a4 -; CHECK: load <3 x i8> -; CHECK: store <3 x i8> +; CHECK: load <2 x i8> +; CHECK: load i8 +; CHECK: store <2 x i8> +; CHECK: store i8 } define void @int8x12a4(ptr nocapture align 4 %ptr) { diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_vectors.ll new file mode 100644 index 000000000000..91242f6e50b4 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_vectors.ll @@ -0,0 +1,17 @@ +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s + +; CHECK-LABEL: @int8x3Plus1 +; CHECK: load <4 x i8> +; CHECK: store <4 x i8> +define void @int8x3Plus1(ptr nocapture align 4 %ptr) { + %ptr0 = getelementptr i8, ptr %ptr, i64 0 + %ptr3 = getelementptr i8, ptr %ptr, i64 3 + + %l0 = load <3 x i8>, ptr %ptr0, align 4 + %l1 = load i8, ptr %ptr3, align 1 + + store <3 x i8> <i8 0, i8 0, i8 0>, ptr %ptr0, align 4 + store i8 0, ptr %ptr3, align 1 + + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll index 3dea7f3520c0..0fc2ac01e931 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll @@ -7,12 +7,13 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" define void @correct_order(ptr noalias %ptr) { ; CHECK-LABEL: @correct_order( ; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[NEXT_GEP1]], align 4 -; CHECK-NEXT: [[L11:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 -; CHECK-NEXT: [[L42:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 -; CHECK-NEXT: [[L2:%.*]] = load i32, ptr [[PTR]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[PTR]], align 4 +; CHECK-NEXT: [[L21:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[L12:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 ; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[PTR]], align 4 -; CHECK-NEXT: [[L3:%.*]] = load i32, ptr [[NEXT_GEP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[NEXT_GEP1]], align 4 +; CHECK-NEXT: [[L33:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; CHECK-NEXT: [[L44:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 ; CHECK-NEXT: ret void ; %next.gep1 = getelementptr i32, ptr %ptr, i64 1 diff --git
a/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll index e4085747956a..bc1f8d3880fd 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll @@ -8,9 +8,8 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" ; CHECK-LABEL: @interleave_2L_2S( ; CHECK: load <2 x i32> -; CHECK: load i32 ; CHECK: store <2 x i32> -; CHECK: load i32 +; CHECK: load <2 x i32> define void @interleave_2L_2S(ptr noalias %ptr) { %next.gep1 = getelementptr i32, ptr %ptr, i64 1 %next.gep2 = getelementptr i32, ptr %ptr, i64 2 @@ -26,9 +25,9 @@ define void @interleave_2L_2S(ptr noalias %ptr) { } ; CHECK-LABEL: @interleave_3L_2S_1L( -; CHECK: load <3 x i32> +; CHECK: load <2 x i32> ; CHECK: store <2 x i32> -; CHECK: load i32 +; CHECK: load <2 x i32> define void @interleave_3L_2S_1L(ptr noalias %ptr) { %next.gep1 = getelementptr i32, ptr %ptr, i64 1 @@ -82,15 +81,10 @@ define void @chain_prefix_suffix(ptr noalias %ptr) { ret void } -; FIXME: If the chain is too long and TLI says misaligned is not fast, -; then LSV fails to vectorize anything in that chain. -; To reproduce below, add a tmp5 (ptr+4) and load tmp5 into l6 and l7. - ; CHECK-LABEL: @interleave_get_longest -; CHECK: load <3 x i32> -; CHECK: load i32 +; CHECK: load <2 x i32> ; CHECK: store <2 x i32> zeroinitializer -; CHECK: load i32 +; CHECK: load <3 x i32> ; CHECK: load i32 ; CHECK: load i32 @@ -98,6 +92,7 @@ define void @interleave_get_longest(ptr noalias %ptr) { %tmp2 = getelementptr i32, ptr %ptr, i64 1 %tmp3 = getelementptr i32, ptr %ptr, i64 2 %tmp4 = getelementptr i32, ptr %ptr, i64 3 + %tmp5 = getelementptr i32, ptr %ptr, i64 4 %l1 = load i32, ptr %tmp2, align 4 %l2 = load i32, ptr %ptr, align 4 @@ -106,8 +101,32 @@ define void @interleave_get_longest(ptr noalias %ptr) { %l3 = load i32, ptr %tmp2, align 4 %l4 = load i32, ptr %tmp3, align 4 %l5 = load i32, ptr %tmp4, align 4 - %l6 = load i32, ptr %tmp4, align 4 - %l7 = load i32, ptr %tmp4, align 4 + %l6 = load i32, ptr %tmp5, align 4 + %l7 = load i32, ptr %tmp5, align 4 ret void } + +; CHECK-LABEL: @interleave_get_longest_aligned +; CHECK: load <2 x i32> +; CHECK: store <2 x i32> zeroinitializer +; CHECK: load <4 x i32> + +define void @interleave_get_longest_aligned(ptr noalias %ptr) { + %tmp2 = getelementptr i32, ptr %ptr, i64 1 + %tmp3 = getelementptr i32, ptr %ptr, i64 2 + %tmp4 = getelementptr i32, ptr %ptr, i64 3 + %tmp5 = getelementptr i32, ptr %ptr, i64 4 + + %l1 = load i32, ptr %tmp2, align 4 + %l2 = load i32, ptr %ptr, align 4 + store i32 0, ptr %tmp2, align 4 + store i32 0, ptr %ptr, align 4 + %l3 = load i32, ptr %tmp2, align 16 + %l4 = load i32, ptr %tmp3, align 4 + %l5 = load i32, ptr %tmp4, align 8 + %l6 = load i32, ptr %tmp5, align 4 + %l7 = load i32, ptr %tmp5, align 4 + + ret void +} \ No newline at end of file diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll index d1a1fa45c741..019aeea8f0a5 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll @@ -4,8 +4,7 @@ ; Check that the LoadStoreVectorizer does not crash due to not differentiating <1 x T> and T. 
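+; After this change the adjacent 'store double' and 'store <1 x double>' are expected to be merged into a single <2 x double> store.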
; CHECK-LABEL: @vector_scalar( -; CHECK: store double -; CHECK: store <1 x double> +; CHECK: store <2 x double> define void @vector_scalar(ptr %ptr, double %a, <1 x double> %b) { %1 = getelementptr <1 x double>, ptr %ptr, i32 1 store double %a, ptr %ptr, align 8 diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add-inseltpoison.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add-inseltpoison.ll index 37e9ec7f0f46..6b9229e8dfd6 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add-inseltpoison.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add-inseltpoison.ll @@ -55,53 +55,6 @@ bb: ret void } -define void @ld_v4i8_add_nuw(i32 %v0, i32 %v1, ptr %src, ptr %dst) { -; CHECK-LABEL: @ld_v4i8_add_nuw( -; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP:%.*]] = add nuw i32 [[V0:%.*]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[V1:%.*]], [[TMP]] -; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2 -; CHECK-NEXT: [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i8> poison, i8 [[TMP41]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3 -; CHECK-NEXT: store <4 x i8> [[TMP22]], ptr [[DST:%.*]], align 4 -; CHECK-NEXT: ret void -; -bb: - %tmp = add nuw i32 %v0, -1 - %tmp1 = add nuw i32 %v1, %tmp - %tmp2 = zext i32 %tmp1 to i64 - %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2 - %tmp4 = load i8, ptr %tmp3, align 1 - %tmp5 = add nuw i32 %v1, %v0 - %tmp6 = zext i32 %tmp5 to i64 - %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6 - %tmp8 = load i8, ptr %tmp7, align 1 - %tmp9 = add nuw i32 %v0, 1 - %tmp10 = add nuw i32 %v1, %tmp9 - %tmp11 = zext i32 %tmp10 to i64 - %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11 - %tmp13 = load i8, ptr %tmp12, align 1 - %tmp14 = add nuw i32 %v0, 2 - %tmp15 = add nuw i32 %v1, %tmp14 - %tmp16 = zext i32 %tmp15 to i64 - %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16 - %tmp18 = load i8, ptr %tmp17, align 1 - %tmp19 = insertelement <4 x i8> poison, i8 %tmp4, i32 0 - %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1 - %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2 - %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3 - store <4 x i8> %tmp22, ptr %dst - ret void -} - ; Make sure we don't vectorize the loads below because the source of ; sext instructions doesn't have the nsw flag. 
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add.ll index c931a6b181ac..cf575cef2d1b 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add.ll @@ -55,53 +55,6 @@ bb: ret void } -define void @ld_v4i8_add_nuw(i32 %v0, i32 %v1, ptr %src, ptr %dst) { -; CHECK-LABEL: @ld_v4i8_add_nuw( -; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP:%.*]] = add nuw i32 [[V0:%.*]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[V1:%.*]], [[TMP]] -; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2 -; CHECK-NEXT: [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3 -; CHECK-NEXT: store <4 x i8> [[TMP22]], ptr [[DST:%.*]] -; CHECK-NEXT: ret void -; -bb: - %tmp = add nuw i32 %v0, -1 - %tmp1 = add nuw i32 %v1, %tmp - %tmp2 = zext i32 %tmp1 to i64 - %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2 - %tmp4 = load i8, ptr %tmp3, align 1 - %tmp5 = add nuw i32 %v1, %v0 - %tmp6 = zext i32 %tmp5 to i64 - %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6 - %tmp8 = load i8, ptr %tmp7, align 1 - %tmp9 = add nuw i32 %v0, 1 - %tmp10 = add nuw i32 %v1, %tmp9 - %tmp11 = zext i32 %tmp10 to i64 - %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11 - %tmp13 = load i8, ptr %tmp12, align 1 - %tmp14 = add nuw i32 %v0, 2 - %tmp15 = add nuw i32 %v1, %tmp14 - %tmp16 = zext i32 %tmp15 to i64 - %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16 - %tmp18 = load i8, ptr %tmp17, align 1 - %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0 - %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1 - %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2 - %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3 - store <4 x i8> %tmp22, ptr %dst - ret void -} - ; Apply different operand orders for the nested add sequences define void @ld_v4i8_add_nsw_operand_orders(i32 %v0, i32 %v1, ptr %src, ptr %dst) { ; CHECK-LABEL: @ld_v4i8_add_nsw_operand_orders( @@ -150,54 +103,6 @@ bb: ret void } -; Apply different operand orders for the nested add sequences -define void @ld_v4i8_add_nuw_operand_orders(i32 %v0, i32 %v1, ptr %src, ptr %dst) { -; CHECK-LABEL: @ld_v4i8_add_nuw_operand_orders( -; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP:%.*]] = add nuw i32 [[V0:%.*]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[V1:%.*]], [[TMP]] -; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP133:%.*]] = extractelement <4 x i8> 
[[TMP1]], i32 2 -; CHECK-NEXT: [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3 -; CHECK-NEXT: store <4 x i8> [[TMP22]], ptr [[DST:%.*]] -; CHECK-NEXT: ret void -; -bb: - %tmp = add nuw i32 %v0, -1 - %tmp1 = add nuw i32 %v1, %tmp - %tmp2 = zext i32 %tmp1 to i64 - %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2 - %tmp4 = load i8, ptr %tmp3, align 1 - %tmp5 = add nuw i32 %v0, %v1 - %tmp6 = zext i32 %tmp5 to i64 - %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6 - %tmp8 = load i8, ptr %tmp7, align 1 - %tmp9 = add nuw i32 %v0, 1 - %tmp10 = add nuw i32 %tmp9, %v1 - %tmp11 = zext i32 %tmp10 to i64 - %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11 - %tmp13 = load i8, ptr %tmp12, align 1 - %tmp14 = add nuw i32 %v0, 2 - %tmp15 = add nuw i32 %v1, %tmp14 - %tmp16 = zext i32 %tmp15 to i64 - %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16 - %tmp18 = load i8, ptr %tmp17, align 1 - %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0 - %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1 - %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2 - %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3 - store <4 x i8> %tmp22, ptr %dst - ret void -} - define void @ld_v4i8_add_known_bits(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) { ; CHECK-LABEL: @ld_v4i8_add_known_bits( ; CHECK-NEXT: bb: diff --git a/llvm/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll b/llvm/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll index 8479ce8327c7..b20d3ca4f602 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll @@ -78,9 +78,9 @@ define void @test_inaccessiblememonly_not_willreturn(ptr %p) { ; CHECK-NEXT: [[P2:%.*]] = getelementptr float, ptr [[P]], i64 2 ; CHECK-NEXT: [[P3:%.*]] = getelementptr float, ptr [[P]], i64 3 ; CHECK-NEXT: [[L0:%.*]] = load float, ptr [[P]], align 16 +; CHECK-NEXT: call void @foo() #[[ATTR2:[0-9]+]] ; CHECK-NEXT: [[L1:%.*]] = load float, ptr [[P1]], align 4 ; CHECK-NEXT: [[L2:%.*]] = load float, ptr [[P2]], align 4 -; CHECK-NEXT: call void @foo() #[[ATTR2:[0-9]+]] ; CHECK-NEXT: [[L3:%.*]] = load float, ptr [[P3]], align 4 ; CHECK-NEXT: store float [[L0]], ptr [[P]], align 16 ; CHECK-NEXT: call void @foo() #[[ATTR2]] @@ -93,9 +93,9 @@ define void @test_inaccessiblememonly_not_willreturn(ptr %p) { %p2 = getelementptr float, ptr %p, i64 2 %p3 = getelementptr float, ptr %p, i64 3 %l0 = load float, ptr %p, align 16 + call void @foo() inaccessiblememonly nounwind %l1 = load float, ptr %p1 %l2 = load float, ptr %p2 - call void @foo() inaccessiblememonly nounwind %l3 = load float, ptr %p3 store float %l0, ptr %p, align 16 call void @foo() inaccessiblememonly nounwind @@ -111,9 +111,9 @@ define void @test_inaccessiblememonly_not_nounwind(ptr %p) { ; CHECK-NEXT: [[P2:%.*]] = getelementptr float, ptr [[P]], i64 2 ; CHECK-NEXT: [[P3:%.*]] = getelementptr float, ptr [[P]], i64 3 ; CHECK-NEXT: [[L0:%.*]] = load float, ptr [[P]], align 16 +; CHECK-NEXT: call void @foo() #[[ATTR3:[0-9]+]] ; CHECK-NEXT: [[L1:%.*]] = load float, ptr [[P1]], align 4 ; CHECK-NEXT: [[L2:%.*]] = load float, ptr [[P2]], align 4 -; CHECK-NEXT: call void @foo() 
#[[ATTR3:[0-9]+]] ; CHECK-NEXT: [[L3:%.*]] = load float, ptr [[P3]], align 4 ; CHECK-NEXT: store float [[L0]], ptr [[P]], align 16 ; CHECK-NEXT: call void @foo() #[[ATTR3]] @@ -126,9 +126,9 @@ define void @test_inaccessiblememonly_not_nounwind(ptr %p) { %p2 = getelementptr float, ptr %p, i64 2 %p3 = getelementptr float, ptr %p, i64 3 %l0 = load float, ptr %p, align 16 + call void @foo() inaccessiblememonly willreturn %l1 = load float, ptr %p1 %l2 = load float, ptr %p2 - call void @foo() inaccessiblememonly willreturn %l3 = load float, ptr %p3 store float %l0, ptr %p, align 16 call void @foo() inaccessiblememonly willreturn