From 22f9ebd2c1c13386fe196c00e60bb3ebafc1b361 Mon Sep 17 00:00:00 2001
From: Graham Hunter
Date: Thu, 25 Apr 2024 14:54:28 +0100
Subject: [PATCH] WIP histogram autovec

Mostly functioning all-in-one intrinsic autovec

---
 .../llvm/Analysis/LoopAccessAnalysis.h        |  26 +++-
 .../llvm/Analysis/TargetTransformInfo.h       |   8 +
 .../llvm/Analysis/TargetTransformInfoImpl.h   |   4 +
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |   4 +
 .../Vectorize/LoopVectorizationLegality.h     |  17 +++
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      | 142 +++++++++++++++++-
 llvm/lib/Analysis/TargetTransformInfo.cpp     |   4 +
 .../AArch64/AArch64TargetTransformInfo.cpp    |  30 ++++
 .../AArch64/AArch64TargetTransformInfo.h      |   2 +
 .../Vectorize/LoopVectorizationLegality.cpp   |   9 ++
 .../Transforms/Vectorize/LoopVectorize.cpp    |  51 ++++++-
 .../Transforms/Vectorize/VPRecipeBuilder.h    |   6 +
 llvm/lib/Transforms/Vectorize/VPlan.h         |  25 +++
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  32 ++++
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |   1 +
 .../LoopVectorize/AArch64/sve2-histcnt.ll     |  43 +++++-
 16 files changed, 386 insertions(+), 18 deletions(-)

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 6ebd0fb8477a0..f25cf106b297a 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -200,7 +200,8 @@ class MemoryDepChecker {
   bool areDepsSafe(DepCandidates &AccessSets, MemAccessInfoList &CheckDeps,
                    const DenseMap<Value *, const SCEV *> &Strides,
                    const DenseMap<Value *, SmallVector<const Value *, 16>>
-                       &UnderlyingObjects);
+                       &UnderlyingObjects,
+                   const SmallPtrSetImpl<const Value *> &HistogramPtrs);
 
   /// No memory dependence was encountered that would inhibit
   /// vectorization.
@@ -338,7 +339,8 @@ class MemoryDepChecker {
   isDependent(const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B,
               unsigned BIdx, const DenseMap<Value *, const SCEV *> &Strides,
               const DenseMap<Value *, SmallVector<const Value *, 16>>
-                  &UnderlyingObjects);
+                  &UnderlyingObjects,
+              const SmallPtrSetImpl<const Value *> &HistogramPtrs);
 
   /// Check whether the data dependence could prevent store-load
   /// forwarding.
@@ -402,6 +404,15 @@ struct PointerDiffInfo {
         NeedsFreeze(NeedsFreeze) {}
 };
 
+struct HistogramInfo {
+  Instruction *Load;
+  Instruction *Update;
+  Instruction *Store;
+
+  HistogramInfo(Instruction *Load, Instruction *Update, Instruction *Store)
+      : Load(Load), Update(Update), Store(Store) {}
+};
+
 /// Holds information about the memory runtime legality checks to verify
 /// that a group of pointers do not overlap.
 class RuntimePointerChecking {
@@ -621,6 +632,10 @@ class LoopAccessInfo {
   unsigned getNumStores() const { return NumStores; }
   unsigned getNumLoads() const { return NumLoads;}
 
+  const SmallVectorImpl<HistogramInfo> &getHistograms() const {
+    return Histograms;
+  }
+
   /// The diagnostics report generated for the analysis. E.g. why we
   /// couldn't analyze the loop.
   const OptimizationRemarkAnalysis *getReport() const { return Report.get(); }
@@ -733,6 +748,13 @@ class LoopAccessInfo {
   /// If an access has a symbolic strides, this maps the pointer value to
   /// the stride symbol.
   DenseMap<Value *, const SCEV *> SymbolicStrides;
+
+  /// Holds the load, update, and store instructions for all histogram-style
+  /// operations found in the loop.
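+  /// E.g. for a loop doing 'buckets[indices[i]] += 1' (a sketch based on the
+  /// @simple_histogram test at the end of this patch), the recorded triple
+  /// would be:
+  /// \code
+  ///   %val = load i32, ptr %gep       ; Load
+  ///   %inc = add nsw i32 %val, 1      ; Update
+  ///   store i32 %inc, ptr %gep        ; Store
+  /// \endcode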
+  SmallVector<HistogramInfo, 2> Histograms;
+
+  /// The pointers used to update the buckets of the detected histograms.
+  SmallPtrSet<const Value *, 2> HistogramPtrs;
 };
 
 /// Return the SCEV corresponding to a pointer with the symbolic stride
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 0c3a6b3742c73..ea07a4e2c39b7 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -985,6 +985,9 @@ class TargetTransformInfo {
   /// Return hardware support for population count.
   PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
 
+  /// Returns the cost of generating a vector histogram.
+  InstructionCost getHistogramCost(Type *Ty) const;
+
   /// Return true if the hardware has a fast square-root instruction.
   bool haveFastSqrt(Type *Ty) const;
 
@@ -1934,6 +1937,7 @@ class TargetTransformInfo::Concept {
                                               unsigned *Fast) = 0;
   virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
   virtual bool haveFastSqrt(Type *Ty) = 0;
+  virtual InstructionCost getHistogramCost(Type *Ty) = 0;
   virtual bool isExpensiveToSpeculativelyExecute(const Instruction *I) = 0;
   virtual bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) = 0;
   virtual InstructionCost getFPOpCost(Type *Ty) = 0;
@@ -2497,6 +2501,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   }
   bool haveFastSqrt(Type *Ty) override { return Impl.haveFastSqrt(Ty); }
 
+  InstructionCost getHistogramCost(Type *Ty) override {
+    return Impl.getHistogramCost(Ty);
+  }
+
   bool isExpensiveToSpeculativelyExecute(const Instruction* I) override {
     return Impl.isExpensiveToSpeculativelyExecute(I);
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 9a57331d281db..2cf5c7e30462d 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -418,6 +418,10 @@ class TargetTransformInfoImplBase {
 
   bool haveFastSqrt(Type *Ty) const { return false; }
 
+  InstructionCost getHistogramCost(Type *Ty) const {
+    return InstructionCost::getInvalid();
+  }
+
   bool isExpensiveToSpeculativelyExecute(const Instruction *I) { return true; }
 
   bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const { return true; }
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 2091432d4fe27..bcd30889d5799 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -540,6 +540,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
            TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
   }
 
+  InstructionCost getHistogramCost(Type *Ty) {
+    return InstructionCost::getInvalid();
+  }
+
   bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
     return true;
   }
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index a509ebf6a7e1b..3ac73232c0f7b 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -387,6 +387,23 @@ class LoopVectorizationLegality {
   unsigned getNumStores() const { return LAI->getNumStores(); }
   unsigned getNumLoads() const { return LAI->getNumLoads(); }
 
+  bool isHistogramLoadOrUpdate(Instruction *I) const {
+    for (const HistogramInfo &HGram : LAI->getHistograms())
+      if (HGram.Load == I || HGram.Update == I)
+        return true;
+
+    return false;
+  }
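+
+  /// Returns the HistogramInfo whose final store is \p SI, or std::nullopt
+  /// if \p SI does not complete a recognised histogram operation.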
+  std::optional<const HistogramInfo *>
+  getHistogramForStore(StoreInst *SI) const {
+    for (const HistogramInfo &HGram : LAI->getHistograms())
+      if (HGram.Store == SI)
+        return &HGram;
+
+    return std::nullopt;
+  }
+
   PredicatedScalarEvolution *getPredicatedScalarEvolution() const {
     return &PSE;
   }
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index d071e53324408..a4e563a3f6f26 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -21,6 +21,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AliasSetTracker.h"
 #include "llvm/Analysis/LoopAnalysisManager.h"
@@ -70,6 +71,8 @@ using namespace llvm::PatternMatch;
 
 #define DEBUG_TYPE "loop-accesses"
 
+STATISTIC(HistogramsDetected, "Number of Histograms detected");
+
 static cl::opt<unsigned, true>
     VectorizationFactor("force-vector-width", cl::Hidden,
                         cl::desc("Sets the SIMD width. Zero is autoselect."),
@@ -731,6 +734,23 @@ class AccessAnalysis {
     return UnderlyingObjects;
   }
 
+  /// Find Histogram counts that match high-level code in loops:
+  /// \code
+  /// buckets[indices[i]] += step;
+  /// \endcode
+  ///
+  /// It matches a pattern starting from \p HSt, which stores to the 'buckets'
+  /// array the computed histogram. It uses a BinOp to sum all counts, storing
+  /// them using a loop-variant index Load from the 'indices' input array.
+  ///
+  /// On successful matches it updates the STATISTIC 'HistogramsDetected',
+  /// regardless of hardware support. When there is support, it additionally
+  /// stores the BinOp/Load pairs in \p Histograms, as well as the pointers
+  /// used to update the histogram in \p HistogramPtrs.
+  void findHistograms(StoreInst *HSt,
+                      SmallVectorImpl<HistogramInfo> &Histograms,
+                      SmallPtrSetImpl<const Value *> &HistogramPtrs);
+
 private:
   typedef MapVector<MemAccessInfo, SmallVector<Instruction *, 8>> PtrAccessMap;
 
@@ -1948,7 +1968,8 @@ getDependenceDistanceStrideAndSize(
     const AccessAnalysis::MemAccessInfo &B, Instruction *BInst,
     const DenseMap<Value *, const SCEV *> &Strides,
     const DenseMap<Value *, SmallVector<const Value *, 16>> &UnderlyingObjects,
-    PredicatedScalarEvolution &PSE, const Loop *InnermostLoop) {
+    PredicatedScalarEvolution &PSE, const Loop *InnermostLoop,
+    const SmallPtrSetImpl<const Value *> &HistogramPtrs) {
   auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
   auto &SE = *PSE.getSE();
   auto [APtr, AIsWrite] = A;
@@ -1966,6 +1987,15 @@ getDependenceDistanceStrideAndSize(
       BPtr->getType()->getPointerAddressSpace())
     return MemoryDepChecker::Dependence::Unknown;
 
+  // Ignore Histogram count updates as they are handled by the Intrinsic. This
+  // happens when the same pointer is first used to read from and then is used
+  // to write to.
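+  // For instance, in 'buckets[indices[i]] += step' the bucket load and the
+  // bucket store share the same GEP result; this read-then-write pair is
+  // exactly what the histogram intrinsic makes safe, so it is not treated as
+  // a blocking dependence.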
Pointer: " + << *APtr); + return MemoryDepChecker::Dependence::NoDep; + } + int64_t StrideAPtr = getPtrStride(PSE, ATy, APtr, InnermostLoop, Strides, true).value_or(0); int64_t StrideBPtr = @@ -2022,15 +2052,15 @@ getDependenceDistanceStrideAndSize( MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent( const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B, unsigned BIdx, const DenseMap &Strides, - const DenseMap> - &UnderlyingObjects) { + const DenseMap> &UnderlyingObjects, + const SmallPtrSetImpl &HistogramPtrs) { assert(AIdx < BIdx && "Must pass arguments in program order"); // Get the dependence distance, stride, type size and what access writes for // the dependence between A and B. auto Res = getDependenceDistanceStrideAndSize( A, InstMap[AIdx], B, InstMap[BIdx], Strides, UnderlyingObjects, PSE, - InnermostLoop); + InnermostLoop, HistogramPtrs); if (std::holds_alternative(Res)) return std::get(Res); @@ -2266,8 +2296,8 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent( bool MemoryDepChecker::areDepsSafe( DepCandidates &AccessSets, MemAccessInfoList &CheckDeps, const DenseMap &Strides, - const DenseMap> - &UnderlyingObjects) { + const DenseMap> &UnderlyingObjects, + const SmallPtrSetImpl &HistogramPtrs) { MinDepDistBytes = -1; SmallPtrSet Visited; @@ -2312,7 +2342,7 @@ bool MemoryDepChecker::areDepsSafe( Dependence::DepType Type = isDependent(*A.first, A.second, *B.first, B.second, Strides, - UnderlyingObjects); + UnderlyingObjects, HistogramPtrs); mergeInStatus(Dependence::isSafeForVectorization(Type)); // Gather dependences unless we accumulated MaxDependences @@ -2648,6 +2678,9 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, // check. Accesses.buildDependenceSets(); + for (StoreInst *ST : Stores) + Accesses.findHistograms(ST, Histograms, HistogramPtrs); + // Find pointers with computable bounds. We are going to use this information // to place a runtime bound check. Value *UncomputablePtr = nullptr; @@ -2672,7 +2705,7 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n"); CanVecMem = DepChecker->areDepsSafe( DependentAccesses, Accesses.getDependenciesToCheck(), SymbolicStrides, - Accesses.getUnderlyingObjects()); + Accesses.getUnderlyingObjects(), HistogramPtrs); if (!CanVecMem && DepChecker->shouldRetryWithRuntimeCheck()) { LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n"); @@ -3127,6 +3160,99 @@ const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L) { return *I.first->second; } +void AccessAnalysis::findHistograms( + StoreInst *HSt, SmallVectorImpl &Histograms, + SmallPtrSetImpl &HistogramPtrs) { + LLVM_DEBUG(dbgs() << "LAA: Attempting to match histogram from " << *HSt + << "\n"); + // Store value must come from a Binary Operation. + Instruction *HPtrInstr = nullptr; + BinaryOperator *HBinOp = nullptr; + if (!match(HSt, m_Store(m_BinOp(HBinOp), m_Instruction(HPtrInstr)))) { + LLVM_DEBUG(dbgs() << "\tNo BinOp\n"); + return; + } + + // BinOp must be an Add or a Sub operating modifying the bucket value by a + // loop invariant amount. + // FIXME: We assume the loop invariant term is on the RHS. + // Fine for an immediate/constant, but maybe not a generic value? 
+  Value *HIncVal = nullptr;
+  if (!match(HBinOp, m_Add(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
+      !match(HBinOp, m_Sub(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal)))) {
+    LLVM_DEBUG(dbgs() << "\tNo matching load\n");
+    return;
+  }
+  Instruction *IndexedLoad = cast<Instruction>(HBinOp->getOperand(0));
+
+  // The address to store is calculated through a GEP Instruction.
+  // FIXME: Support GEPs with more operands.
+  GetElementPtrInst *HPtr = dyn_cast<GetElementPtrInst>(HPtrInstr);
+  if (!HPtr || HPtr->getNumOperands() > 2) {
+    LLVM_DEBUG(dbgs() << "\tToo many GEP operands\n");
+    return;
+  }
+
+  // Check that the index is calculated by loading from another array. Ignore
+  // any extensions.
+  // FIXME: Support indices from other sources than a linear load from memory?
+  Value *HIdx = HPtr->getOperand(1);
+  Instruction *IdxInst = nullptr;
+  // FIXME: Can this fail? Maybe if IdxInst isn't an instruction. Just need to
+  //        look through extensions, find another way?
+  if (!match(HIdx, m_ZExtOrSExtOrSelf(m_Instruction(IdxInst))))
+    return;
+
+  // Currently restricting this to linear addressing when loading indices.
+  LoadInst *VLoad = dyn_cast<LoadInst>(IdxInst);
+  Value *VPtrVal;
+  if (!VLoad || !match(VLoad, m_Load(m_Value(VPtrVal)))) {
+    LLVM_DEBUG(dbgs() << "\tBad Index Load\n");
+    return;
+  }
+
+  if (!isa<SCEVAddRecExpr>(PSE.getSCEV(VPtrVal))) {
+    LLVM_DEBUG(dbgs() << "\tCannot determine index load stride\n");
+    return;
+  }
+
+  // FIXME: Support smaller types of input arrays. Integers can be promoted
+  //        for codegen.
+  Type *VLoadTy = VLoad->getType();
+  if (!VLoadTy->isIntegerTy() || (VLoadTy->getScalarSizeInBits() != 32 &&
+                                  VLoadTy->getScalarSizeInBits() != 64)) {
+    LLVM_DEBUG(dbgs() << "\tUnsupported bucket type: " << *VLoadTy << "\n");
+    return;
+  }
+
+  // Ensure we'll have the same mask by checking that all parts of the
+  // histogram are in the same block.
+  // FIXME: Could use dominance checks instead?
+  if (IndexedLoad->getParent() != HBinOp->getParent() ||
+      IndexedLoad->getParent() != HSt->getParent()) {
+    LLVM_DEBUG(dbgs() << "\tDifferent parent blocks\n");
+    return;
+  }
+
+  // A histogram pointer may only alias to itself, and must only have two
+  // uses, the load and the store.
+  for (AliasSet &AS : AST)
+    if (AS.isMustAlias() || AS.isMayAlias())
+      if ((is_contained(AS.getPointers(), HPtr) && AS.size() > 1) ||
+          HPtr->getNumUses() != 2) {
+        LLVM_DEBUG(dbgs() << "\tAliasing problem\n");
+        return;
+      }
+
+  LLVM_DEBUG(dbgs() << "LAA: Found Histogram Operation: " << *HBinOp << "\n");
+  HistogramsDetected++;
+
+  // Store the operations that make up the histogram.
+  Histograms.emplace_back(IndexedLoad, HBinOp, HSt);
+  // Store pointers used to write those counts in the computed histogram.
+  HistogramPtrs.insert(HPtr);
+}
+
 bool LoopAccessInfoManager::invalidate(
     Function &F, const PreservedAnalyses &PA,
     FunctionAnalysisManager::Invalidator &Inv) {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index f6a458f7ded46..94dce50c0e5f4 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -658,6 +658,10 @@ bool TargetTransformInfo::haveFastSqrt(Type *Ty) const {
   return TTIImpl->haveFastSqrt(Ty);
 }
 
+InstructionCost TargetTransformInfo::getHistogramCost(Type *Ty) const {
+  return TTIImpl->getHistogramCost(Ty);
+}
+
 bool TargetTransformInfo::isExpensiveToSpeculativelyExecute(
     const Instruction *I) const {
   return TTIImpl->isExpensiveToSpeculativelyExecute(I);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index f49c73dc79519..6912f7bc4084e 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -58,6 +58,11 @@ static cl::opt<unsigned> InlineCallPenaltyChangeSM(
 static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
                                            cl::init(true), cl::Hidden);
 
+// A complete guess as to a reasonable cost.
+static cl::opt<unsigned>
+    BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
+                    cl::desc("The cost of a histcnt instruction"));
+
 namespace {
 class TailFoldingOption {
   // These bitfields will only ever be set to something non-zero in operator=,
@@ -505,6 +510,31 @@ static bool isUnpackedVectorVT(EVT VecVT) {
          VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
 }
 
+InstructionCost AArch64TTIImpl::getHistogramCost(Type *Ty) const {
+  if (!ST->hasSVE2orSME())
+    return InstructionCost::getInvalid();
+
+  Type *EltTy = Ty->getScalarType();
+
+  // Only allow (<=64b) integers or pointers for now...
+  if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) ||
+      EltTy->getScalarSizeInBits() > 64)
+    return InstructionCost::getInvalid();
+
+  // FIXME: Hacky check for legal vector types. We can promote smaller types
+  //        but we cannot legalize vectors via splitting for histcnt.
+  // FIXME: We should be able to generate histcnt for fixed-length vectors
+  //        using ptrue with a specific VL.
+  if (VectorType *VTy = dyn_cast<VectorType>(Ty))
+    if ((VTy->getElementCount().getKnownMinValue() != 2 &&
+         VTy->getElementCount().getKnownMinValue() != 4) ||
+        VTy->getPrimitiveSizeInBits().getKnownMinValue() > 128 ||
+        !VTy->isScalableTy())
+      return InstructionCost::getInvalid();
+
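+  // At this point Ty is either a supported scalar (integer or pointer) type,
+  // as queried by the legality check on the scalar update instruction, or a
+  // single-register scalable vector such as <vscale x 4 x i32>.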
+  return InstructionCost(BaseHistCntCost);
+}
+
 InstructionCost
 AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                       TTI::TargetCostKind CostKind) {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 2f44aaa3e26ab..303a3b41e0c29 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -118,6 +118,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
     return 31;
   }
 
+  InstructionCost getHistogramCost(Type *Ty) const;
+
   InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                         TTI::TargetCostKind CostKind);
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 9de49d1bcfeac..b1029ec60ad8b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1506,6 +1506,15 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
       return false;
   }
 
+  for (const HistogramInfo &HGram : LAI->getHistograms()) {
+    Type *UpdateTy = HGram.Update->getType();
+    if (!TTI->getHistogramCost(UpdateTy).isValid()) {
+      LLVM_DEBUG(dbgs() << "Invalid TTI Histogram Cost for type: " << *UpdateTy
+                        << "\n");
+      return false;
+    }
+  }
+
   LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
                     << (LAI->getRuntimePointerChecking()->Need
                             ? " (with a runtime bound check)"
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9353666e417c8..391f143a0983e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5317,6 +5317,11 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
   if (!Legal->isSafeForAnyVectorWidth())
     return 1;
 
+  if (!Legal->getLAI()->getHistograms().empty()) {
+    LLVM_DEBUG(dbgs() << "LV: Not interleaving histogram operations.\n");
+    return 1;
+  }
+
   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
   const bool HasReductions = !Legal->getReductionVars().empty();
 
@@ -6939,8 +6944,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
     // We've proven all lanes safe to speculate, fall through.
     [[fallthrough]];
   case Instruction::Add:
-  case Instruction::FAdd:
   case Instruction::Sub:
+    // FIXME: multiply cost too, if needed.
+    if (Legal->isHistogramLoadOrUpdate(I))
+      return TTI.getHistogramCost(VectorTy) +
+             TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy);
+    [[fallthrough]];
+  case Instruction::FAdd:
   case Instruction::FSub:
   case Instruction::Mul:
   case Instruction::FMul:
@@ -8406,6 +8416,34 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
   };
 }
 
+VPHistogramRecipe *
+VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
+                                     ArrayRef<VPValue *> Operands) {
+  // FIXME: Support other operations.
+  assert(HI->Update->getOpcode() == Instruction::Add &&
+         "Histogram update operation must be an Add");
+
+  SmallVector<VPValue *, 3> HGramOps;
+  // Bucket address.
+  HGramOps.push_back(Operands[1]);
+  // Increment value.
+  HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1), Plan));
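+  // Note: this takes the update's second operand, matching the assumption in
+  // findHistograms that the loop-invariant step is on the RHS of the add.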
+
+  // In case of predicated execution (due to tail-folding, or conditional
+  // execution, or both), pass the relevant mask. When there is no such mask,
+  // generate an all-true mask.
+  VPValue *Mask = nullptr;
+  if (Legal->isMaskRequired(HI->Store))
+    Mask = getBlockInMask(HI->Store->getParent());
+  else
+    Mask = Plan.getOrAddLiveIn(
+        ConstantInt::getTrue(IntegerType::getInt1Ty(HI->Load->getContext())));
+  HGramOps.push_back(Mask);
+
+  return new VPHistogramRecipe(make_range(HGramOps.begin(), HGramOps.end()),
+                               HI->Store->getDebugLoc());
+}
+
 void VPRecipeBuilder::fixHeaderPhis() {
   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
   for (VPHeaderPHIRecipe *R : PhisToFix) {
@@ -8523,6 +8561,12 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
   if (auto *CI = dyn_cast<CallInst>(Instr))
     return tryToWidenCall(CI, Operands, Range);
 
+  if (StoreInst *SI = dyn_cast<StoreInst>(Instr)) {
+    std::optional<const HistogramInfo *> HI = Legal->getHistogramForStore(SI);
+    if (HI)
+      return tryToWidenHistogram(*HI, Operands);
+  }
+
   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
     return tryToWidenMemory(Instr, Operands, Range);
 
@@ -8723,6 +8767,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
         Operands = {OpRange.begin(), OpRange.end()};
       }
 
+      // If this is a load instruction or a binop associated with a histogram,
+      // leave it until the store instruction to emit a combined intrinsic.
+      if (Legal->isHistogramLoadOrUpdate(Instr))
+        continue;
+
       // Invariant stores inside loop will be deleted and a single store
       // with the final reduction value will be added to the exit block
       StoreInst *SI;
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index b4c7ab02f928f..2e0139da4668a 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -102,6 +102,12 @@ class VPRecipeBuilder {
   VPWidenRecipe *tryToWiden(Instruction *I, ArrayRef<VPValue *> Operands,
                             VPBasicBlock *VPBB);
 
+  /// Makes histogram count operations safe for vectorization, by emitting a
+  /// Histogram LLVM Intrinsic in place of the Load/BinOp(Add or Sub)/Store
+  /// sequence that does the actual counting.
+  VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI,
+                                         ArrayRef<VPValue *> Operands);
+
 public:
   VPRecipeBuilder(VPlan &Plan, Loop *OrigLoop, const TargetLibraryInfo *TLI,
                   LoopVectorizationLegality *Legal,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 4b3cb15b5e1e6..3c0be6adb8826 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -871,6 +871,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     case VPRecipeBase::VPWidenLoadSC:
     case VPRecipeBase::VPWidenStoreEVLSC:
     case VPRecipeBase::VPWidenStoreSC:
+    case VPRecipeBase::VPHistogramSC:
       // TODO: Widened stores don't define a value, but widened loads do. Split
       // the recipes to be able to make widened loads VPSingleDefRecipes.
       return false;
@@ -1500,6 +1501,30 @@ class VPWidenCallRecipe : public VPSingleDefRecipe {
 #endif
 };
 
+class VPHistogramRecipe : public VPRecipeBase {
+public:
+  template <typename IterT>
+  VPHistogramRecipe(iterator_range<IterT> Operands, DebugLoc DL = {})
+      : VPRecipeBase(VPDef::VPHistogramSC, Operands, DL) {}
+
+  ~VPHistogramRecipe() override = default;
+
+  VPHistogramRecipe *clone() override {
+    llvm_unreachable("cloning not supported");
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPHistogramSC);
+
+  // Produce a histogram operation with widened ingredients.
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// A recipe for widening select instructions.
 struct VPWidenSelectRecipe : public VPSingleDefRecipe {
   template <typename IterT>
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index fa634e774b5cd..62e6810a205c1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -21,6 +21,7 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Support/Casting.h"
@@ -811,6 +812,37 @@ void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
       O << ")";
   }
 }
+#endif
+
+void VPHistogramRecipe::execute(VPTransformState &State) {
+  assert(State.UF == 1 && "Tried interleaving histogram operation");
+  State.setDebugLocFrom(getDebugLoc());
+  IRBuilderBase &Builder = State.Builder;
+  Value *Address = State.get(getOperand(0), 0);
+  Value *IncVec = State.get(getOperand(1), 0);
+  Value *Mask = State.get(getOperand(2), 0);
+
+  // Not sure how to make IncAmt stay scalar yet. For now just extract the
+  // first element and tidy up later.
+  // FIXME: Do we actually want this to be scalar? We just splat it in the
+  //        backend anyway...
+  Value *IncAmt = Builder.CreateExtractElement(IncVec, Builder.getInt64(0));
+
+  State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
+                                {Address->getType(), IncAmt->getType()},
+                                {Address, IncAmt, Mask});
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
+                              VPSlotTracker &SlotTracker) const {
+  O << Indent << "WIDEN-HISTOGRAM buckets: ";
+  getOperand(0)->printAsOperand(O, SlotTracker);
+  O << ", inc: ";
+  getOperand(1)->printAsOperand(O, SlotTracker);
+  O << ", mask: ";
+  getOperand(2)->printAsOperand(O, SlotTracker);
+}
 
 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 96d04271850f7..1d4fe1e28b45d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -363,6 +363,7 @@ class VPDef {
     VPWidenSC,
     VPWidenSelectSC,
     VPBlendSC,
+    VPHistogramSC,
     // START: Phi-like recipes. Need to be kept together.
     VPWidenPHISC,
     VPPredInstPHISC,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
index 02923cfbb8d00..bd37fdd7d9723 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -S | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -force-target-instruction-cost=1 -S | FileCheck %s
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -14,19 +14,48 @@ define void @simple_histogram(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 {
 ; CHECK-LABEL: define void @simple_histogram(
 ; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP9]]
+; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP10]], i32 1, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IDXPROM1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[IDXPROM1:%.*]] = zext i32 [[TMP12]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], i64 [[IDXPROM1]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP13]], 1
 ; CHECK-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       for.exit:
 ; CHECK-NEXT:    ret void
 ;
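+;
+; A rough source-level equivalent of @simple_histogram, for illustration only
+; (a reconstruction, not part of the original test):
+;
+;   for (uint64_t i = 0; i < N; i++)
+;     buckets[indices[i]]++;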