[LV] Autovectorization for the all-in-one histogram intrinsic #91458
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-llvm-ir

Author: Graham Hunter (huntergr-arm)

Changes

This patch implements autovectorization support for the 'all-in-one' histogram intrinsic, which seems to have more support than the 'standalone' intrinsic. See https://discourse.llvm.org/t/rfc-vectorization-support-for-histogram-count-operations/74788/ for an overview of the work and my notes on the tradeoffs between the two approaches.

This patch currently contains parts of #88106 in order to demonstrate it working; once that patch lands, this one will be rebased on top of it.

A proof-of-concept of autovectorization for the 'standalone' intrinsic, for comparing the LoopVectorize differences, can be found here: #89366

Patch is 51.62 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/91458.diff

27 Files Affected:
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index cf998e66ee486..e9b47640f6c26 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -198,7 +198,8 @@ class MemoryDepChecker {
bool areDepsSafe(DepCandidates &AccessSets, MemAccessInfoList &CheckDeps,
const DenseMap<Value *, const SCEV *> &Strides,
const DenseMap<Value *, SmallVector<const Value *, 16>>
- &UnderlyingObjects);
+ &UnderlyingObjects,
+ const SmallPtrSetImpl<const Value *> &HistogramPtrs);
/// No memory dependence was encountered that would inhibit
/// vectorization.
@@ -330,7 +331,8 @@ class MemoryDepChecker {
isDependent(const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B,
unsigned BIdx, const DenseMap<Value *, const SCEV *> &Strides,
const DenseMap<Value *, SmallVector<const Value *, 16>>
- &UnderlyingObjects);
+ &UnderlyingObjects,
+ const SmallPtrSetImpl<const Value *> &HistogramPtrs);
/// Check whether the data dependence could prevent store-load
/// forwarding.
@@ -394,6 +396,15 @@ struct PointerDiffInfo {
NeedsFreeze(NeedsFreeze) {}
};
+struct HistogramInfo {
+ Instruction *Load;
+ Instruction *Update;
+ Instruction *Store;
+
+ HistogramInfo(Instruction *Load, Instruction *Update, Instruction *Store)
+ : Load(Load), Update(Update), Store(Store) {}
+};
+
/// Holds information about the memory runtime legality checks to verify
/// that a group of pointers do not overlap.
class RuntimePointerChecking {
@@ -612,6 +623,10 @@ class LoopAccessInfo {
unsigned getNumStores() const { return NumStores; }
unsigned getNumLoads() const { return NumLoads;}
+ const SmallVectorImpl<HistogramInfo> &getHistograms() const {
+ return Histograms;
+ }
+
/// The diagnostics report generated for the analysis. E.g. why we
/// couldn't analyze the loop.
const OptimizationRemarkAnalysis *getReport() const { return Report.get(); }
@@ -724,6 +739,13 @@ class LoopAccessInfo {
/// If an access has a symbolic strides, this maps the pointer value to
/// the stride symbol.
DenseMap<Value *, const SCEV *> SymbolicStrides;
+
+ /// Holds the load, update, and store instructions for all histogram-style
+ /// operations found in the loop.
+ SmallVector<HistogramInfo, 2> Histograms;
+
+ /// The pointers used for histogram count updates; dependences involving
+ /// them are recognized and handled as histogram operations.
+ SmallPtrSet<const Value *, 2> HistogramPtrs;
};
/// Return the SCEV corresponding to a pointer with the symbolic stride
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 1c76821fe5e4a..d104bca64ce39 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -982,6 +982,9 @@ class TargetTransformInfo {
/// Return hardware support for population count.
PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
+ /// Returns the cost of generating a vector histogram.
+ InstructionCost getHistogramCost(Type *Ty) const;
+
/// Return true if the hardware has a fast square-root instruction.
bool haveFastSqrt(Type *Ty) const;
@@ -1930,6 +1933,7 @@ class TargetTransformInfo::Concept {
unsigned *Fast) = 0;
virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
virtual bool haveFastSqrt(Type *Ty) = 0;
+ virtual InstructionCost getHistogramCost(Type *Ty) = 0;
virtual bool isExpensiveToSpeculativelyExecute(const Instruction *I) = 0;
virtual bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) = 0;
virtual InstructionCost getFPOpCost(Type *Ty) = 0;
@@ -2490,6 +2494,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
}
bool haveFastSqrt(Type *Ty) override { return Impl.haveFastSqrt(Ty); }
+ InstructionCost getHistogramCost(Type *Ty) override {
+ return Impl.getHistogramCost(Ty);
+ }
+
bool isExpensiveToSpeculativelyExecute(const Instruction* I) override {
return Impl.isExpensiveToSpeculativelyExecute(I);
}
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 4d5cd963e0926..43fc2de02c85b 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -412,6 +412,10 @@ class TargetTransformInfoImplBase {
bool haveFastSqrt(Type *Ty) const { return false; }
+ InstructionCost getHistogramCost(Type *Ty) const {
+ return InstructionCost::getInvalid();
+ }
+
bool isExpensiveToSpeculativelyExecute(const Instruction *I) { return true; }
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const { return true; }
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index c6e90e57e46ed..52f80d58a5498 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -538,6 +538,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
}
+ InstructionCost getHistogramCost(Type *Ty) {
+ return InstructionCost::getInvalid();
+ }
+
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
return true;
}
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 6429947958ee9..c8aa9a81c6c0f 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1402,6 +1402,11 @@ enum NodeType {
// which is later translated to an implicit use in the MIR.
CONVERGENCECTRL_GLUE,
+ // Experimental vector histogram intrinsic
+ // Operands: Input Chain, Inc, Mask, Base, Index, Scale, ID
+ // Output: Output Chain
+ EXPERIMENTAL_HISTOGRAM,
+
/// BUILTIN_OP_END - This must be the last enum value in this list.
/// The target-specific pre-isel opcode values start here.
BUILTIN_OP_END
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 4b1b58d4af0bb..71bf76c93e478 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1526,6 +1526,9 @@ class SelectionDAG {
ArrayRef<SDValue> Ops, MachineMemOperand *MMO,
ISD::MemIndexType IndexType,
bool IsTruncating = false);
+ SDValue getMaskedHistogram(SDVTList VTs, EVT MemVT, const SDLoc &dl,
+ ArrayRef<SDValue> Ops, MachineMemOperand *MMO,
+ ISD::MemIndexType IndexType);
SDValue getGetFPEnv(SDValue Chain, const SDLoc &dl, SDValue Ptr, EVT MemVT,
MachineMemOperand *MMO);
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index e7c7104145455..ce1c012470dc4 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -542,6 +542,7 @@ BEGIN_TWO_BYTE_PACK()
friend class MaskedLoadStoreSDNode;
friend class MaskedGatherScatterSDNode;
friend class VPGatherScatterSDNode;
+ friend class MaskedHistogramSDNode;
uint16_t : NumMemSDNodeBits;
@@ -564,6 +565,7 @@ BEGIN_TWO_BYTE_PACK()
friend class MaskedLoadSDNode;
friend class MaskedGatherSDNode;
friend class VPGatherSDNode;
+ friend class MaskedHistogramSDNode;
uint16_t : NumLSBaseSDNodeBits;
@@ -1420,6 +1422,7 @@ class MemSDNode : public SDNode {
return getOperand(2);
case ISD::MGATHER:
case ISD::MSCATTER:
+ case ISD::EXPERIMENTAL_HISTOGRAM:
return getOperand(3);
default:
return getOperand(1);
@@ -1468,6 +1471,7 @@ class MemSDNode : public SDNode {
case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
case ISD::GET_FPENV_MEM:
case ISD::SET_FPENV_MEM:
+ case ISD::EXPERIMENTAL_HISTOGRAM:
return true;
default:
return N->isMemIntrinsic() || N->isTargetMemoryOpcode();
@@ -2953,6 +2957,33 @@ class MaskedScatterSDNode : public MaskedGatherScatterSDNode {
}
};
+class MaskedHistogramSDNode : public MemSDNode {
+public:
+ friend class SelectionDAG;
+
+ MaskedHistogramSDNode(unsigned Order, const DebugLoc &DL, SDVTList VTs,
+ EVT MemVT, MachineMemOperand *MMO,
+ ISD::MemIndexType IndexType)
+ : MemSDNode(ISD::EXPERIMENTAL_HISTOGRAM, Order, DL, VTs, MemVT, MMO) {
+ LSBaseSDNodeBits.AddressingMode = IndexType;
+ }
+
+ ISD::MemIndexType getIndexType() const {
+ return static_cast<ISD::MemIndexType>(LSBaseSDNodeBits.AddressingMode);
+ }
+
+ const SDValue &getBasePtr() const { return getOperand(3); }
+ const SDValue &getIndex() const { return getOperand(4); }
+ const SDValue &getMask() const { return getOperand(2); }
+ const SDValue &getScale() const { return getOperand(5); }
+ const SDValue &getInc() const { return getOperand(1); }
+ const SDValue &getIntID() const { return getOperand(6); }
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == ISD::EXPERIMENTAL_HISTOGRAM;
+ }
+};
+
class FPStateAccessSDNode : public MemSDNode {
public:
friend class SelectionDAG;
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 29143123193b9..138703bf3d98f 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1856,6 +1856,13 @@ def int_experimental_vp_strided_load : DefaultAttrsIntrinsic<[llvm_anyvector_ty
llvm_i32_ty],
[ NoCapture<ArgIndex<0>>, IntrNoSync, IntrReadMem, IntrWillReturn, IntrArgMemOnly ]>;
+// Experimental histogram
+def int_experimental_vector_histogram_add : DefaultAttrsIntrinsic<[],
+ [ llvm_anyvector_ty, // Vector of pointers
+ llvm_anyint_ty, // Increment
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
+ []>;
+
// Operators
let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in {
// Integer arithmetic
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index a509ebf6a7e1b..3ac73232c0f7b 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -387,6 +387,23 @@ class LoopVectorizationLegality {
unsigned getNumStores() const { return LAI->getNumStores(); }
unsigned getNumLoads() const { return LAI->getNumLoads(); }
+ bool isHistogramLoadOrUpdate(Instruction *I) const {
+ for (const HistogramInfo &HGram : LAI->getHistograms())
+ if (HGram.Load == I || HGram.Update == I)
+ return true;
+
+ return false;
+ }
+
+ std::optional<const HistogramInfo *>
+ getHistogramForStore(StoreInst *SI) const {
+ for (const HistogramInfo &HGram : LAI->getHistograms())
+ if (HGram.Store == SI)
+ return &HGram;
+
+ return std::nullopt;
+ }
+
PredicatedScalarEvolution *getPredicatedScalarEvolution() const {
return &PSE;
}
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index ae7f0373c4e84..2b3a44fdc7d23 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -21,6 +21,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
@@ -69,6 +70,8 @@ using namespace llvm::PatternMatch;
#define DEBUG_TYPE "loop-accesses"
+STATISTIC(HistogramsDetected, "Number of Histograms detected");
+
static cl::opt<unsigned, true>
VectorizationFactor("force-vector-width", cl::Hidden,
cl::desc("Sets the SIMD width. Zero is autoselect."),
@@ -730,6 +733,23 @@ class AccessAnalysis {
return UnderlyingObjects;
}
+ /// Find Histogram counts that match high-level code in loops:
+ /// \code
+ /// buckets[indices[i]]+=step;
+ /// \endcode
+ ///
+ /// It matches a pattern starting from \p HSt, which Stores to the 'buckets'
+ /// array the computed histogram. It uses a BinOp to sum all counts, storing
+ /// them using a loop-variant index Load from the 'indices' input array.
+ ///
+ /// On successful matches it updates the STATISTIC 'HistogramsDetected',
+ /// regardless of hardware support. When there is support, it additionally
+ /// stores the BinOp/Load pairs in \p Histograms, as well as the pointers
+ /// used to update the histogram in \p HistogramPtrs.
+ void findHistograms(StoreInst *HSt,
+ SmallVectorImpl<HistogramInfo> &Histograms,
+ SmallPtrSetImpl<const Value *> &HistogramPtrs);
+
private:
typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap;
@@ -1947,7 +1967,8 @@ getDependenceDistanceStrideAndSize(
const AccessAnalysis::MemAccessInfo &B, Instruction *BInst,
const DenseMap<Value *, const SCEV *> &Strides,
const DenseMap<Value *, SmallVector<const Value *, 16>> &UnderlyingObjects,
- PredicatedScalarEvolution &PSE, const Loop *InnermostLoop) {
+ PredicatedScalarEvolution &PSE, const Loop *InnermostLoop,
+ const SmallPtrSetImpl<const Value *> &HistogramPtrs) {
auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
auto &SE = *PSE.getSE();
auto [APtr, AIsWrite] = A;
@@ -1965,6 +1986,15 @@ getDependenceDistanceStrideAndSize(
BPtr->getType()->getPointerAddressSpace())
return MemoryDepChecker::Dependence::Unknown;
+ // Ignore Histogram count updates as they are handled by the Intrinsic. This
+ // happens when the same pointer is first used to read from and then is used
+ // to write to.
+ if (!AIsWrite && BIsWrite && APtr == BPtr && HistogramPtrs.contains(APtr)) {
+ LLVM_DEBUG(dbgs() << "LAA: Histogram: Update is safely ignored. Pointer: "
+ << *APtr);
+ return MemoryDepChecker::Dependence::NoDep;
+ }
+
int64_t StrideAPtr =
getPtrStride(PSE, ATy, APtr, InnermostLoop, Strides, true).value_or(0);
int64_t StrideBPtr =
@@ -2018,15 +2048,15 @@ getDependenceDistanceStrideAndSize(
MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B,
unsigned BIdx, const DenseMap<Value *, const SCEV *> &Strides,
- const DenseMap<Value *, SmallVector<const Value *, 16>>
- &UnderlyingObjects) {
+ const DenseMap<Value *, SmallVector<const Value *, 16>> &UnderlyingObjects,
+ const SmallPtrSetImpl<const Value *> &HistogramPtrs) {
assert(AIdx < BIdx && "Must pass arguments in program order");
// Get the dependence distance, stride, type size and what access writes for
// the dependence between A and B.
auto Res = getDependenceDistanceStrideAndSize(
A, InstMap[AIdx], B, InstMap[BIdx], Strides, UnderlyingObjects, PSE,
- InnermostLoop);
+ InnermostLoop, HistogramPtrs);
if (std::holds_alternative<Dependence::DepType>(Res))
return std::get<Dependence::DepType>(Res);
@@ -2240,8 +2270,8 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
bool MemoryDepChecker::areDepsSafe(
DepCandidates &AccessSets, MemAccessInfoList &CheckDeps,
const DenseMap<Value *, const SCEV *> &Strides,
- const DenseMap<Value *, SmallVector<const Value *, 16>>
- &UnderlyingObjects) {
+ const DenseMap<Value *, SmallVector<const Value *, 16>> &UnderlyingObjects,
+ const SmallPtrSetImpl<const Value *> &HistogramPtrs) {
MinDepDistBytes = -1;
SmallPtrSet<MemAccessInfo, 8> Visited;
@@ -2286,7 +2316,7 @@ bool MemoryDepChecker::areDepsSafe(
Dependence::DepType Type =
isDependent(*A.first, A.second, *B.first, B.second, Strides,
- UnderlyingObjects);
+ UnderlyingObjects, HistogramPtrs);
mergeInStatus(Dependence::isSafeForVectorization(Type));
// Gather dependences unless we accumulated MaxDependences
@@ -2622,6 +2652,9 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
// check.
Accesses.buildDependenceSets();
+ for (StoreInst *ST : Stores)
+ Accesses.findHistograms(ST, Histograms, HistogramPtrs);
+
// Find pointers with computable bounds. We are going to use this information
// to place a runtime bound check.
Value *UncomputablePtr = nullptr;
@@ -2646,7 +2679,7 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
CanVecMem = DepChecker->areDepsSafe(
DependentAccesses, Accesses.getDependenciesToCheck(), SymbolicStrides,
- Accesses.getUnderlyingObjects());
+ Accesses.getUnderlyingObjects(), HistogramPtrs);
if (!CanVecMem && DepChecker->shouldRetryWithRuntimeCheck()) {
LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
@@ -3084,6 +3117,99 @@ const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L) {
return *I.first->second;
}
+void AccessAnalysis::findHistograms(
+ StoreInst *HSt, SmallVectorImpl<HistogramInfo> &Histograms,
+ SmallPtrSetImpl<const Value *> &HistogramPtrs) {
+ LLVM_DEBUG(dbgs() << "LAA: Attempting to match histogram from " << *HSt
+ << "\n");
+ // Store value must come from a Binary Operation.
+ Instruction *HPtrInstr = nullptr;
+ BinaryOperator *HBinOp = nullptr;
+ if (!match(HSt, m_Store(m_BinOp(HBinOp), m_Instruction(HPtrInstr)))) {
+ LLVM_DEBUG(dbgs() << "\tNo BinOp\n");
+ return;
+ }
+
+ // BinOp must be an Add or a Sub modifying the bucket value by a
+ // loop-invariant amount.
+ // FIXME: We assume the loop invariant term is on the RHS.
+ // Fine for an immediate/constant, but maybe not a generic value?
+ Value *HIncVal = nullptr;
+ if (!match(HBinOp, m_Add(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
+ !match(HBinOp, m_Sub(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal)))) {
+ LLVM_DEBUG(dbgs() << "\tNo matching load\n");
+ return;
+ }
+ Instruction *IndexedLoad = cast<Instruction>(HBinOp->getOperand(0));
+
+ // The address to store is calculated through a GEP Instruction.
+ // FIXME: Support GEPs with more operands.
+ GetElementPtrInst *HPtr = dyn_cast<GetElementPtrInst>(HPtrInstr);
+ if (!HPtr || HPtr->getNumOperands() > 2) {
+ LLVM_DEBUG(dbgs() << "\tToo many GEP operands\n");
+ return;
+ }
+
+ // Check that the index is calculated by loading from another array. Ignore
+ // any extensions.
+ // FIXME: Support indices from other sources than a linear load from memory?
+ Value *HIdx = HPtr->getOperand(1);
+ Instruction *IdxInst = nullptr;
+ // FIXME: Can this fail? Maybe if IdxInst isn't an instruction. Just need to
+ // look through extensions, find another way?
+ if (!match(HIdx, m_ZExtOrSExtOrSelf(m_Instruction(IdxInst))))
+ return;
+
+ // Currently restricting this to linear addressing when loading indices.
+ LoadInst *VLoad = dyn_cast<LoadInst>(IdxInst);
+ Value *VPtrVal;
+ if (!VLoad || !match(VLoad, m_Load(m_Value(VPtrVal)))) {
+ LLVM_DEBUG(dbgs() << "\tBad Index Load\n");
+ return;
+ }
+
+ if (!isa<SCEVAddRecExpr>(PSE.getSCEV(VPtrVal))) {
+ LLVM_DEBUG(dbgs() << "\tCannot determine index load stride\n");
+ return;
+ }
+
+ // FIXME: support smaller types of input arrays. Integers can be promoted
+ // for codegen.
+ Type *VLoadTy = VLoad->getType();
+ if (!VLoadTy->isIntegerTy() || (VLoadTy->getScalarSizeInBits() != 32 &&
+ VLoadTy->getScalarSizeInBits() != 64)) {
+ LLVM_DEBUG(dbgs() << "\tUnsupported bucket type: " << *VLoadTy << "\n");
+ return;
+ }
+
+ // Ensure we'll have the same mask by checking that all parts of the histogram
+ // are in the same block.
+ // FIXME: Could use dominance checks instead?
+ if (IndexedLoad->getParent() != HBi...
[truncated]
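For context, here is a minimal C++ sketch of the loop shape the new findHistograms matcher targets (illustrative only; the function and parameter names are made up, and the index type is kept at 32 bits to match the i32/i64 restriction in the matcher above):

```cpp
#include <cstdint>

// The gather-modify-scatter sequence the matcher looks for: a load from
// buckets[indices[i]], an add (or sub) of a loop-invariant amount, and a
// store back to the same address. 'indices' is read with a linear stride.
void histogram(const uint32_t *indices, uint32_t *buckets, uint64_t n) {
  for (uint64_t i = 0; i < n; ++i)
    buckets[indices[i]] += 1;
}
```

Scalar code must perform each bucket update in order because two iterations may hit the same bucket; the intrinsic lets the vectorizer express that update safely for a whole vector of indices at once.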
Force-pushed from a10ce1a to 22f9ebd.
Rebased on top of the committed version of the intrinsic.
Some valid points raised with FIXME comments.
@@ -0,0 +1,82 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -force-target-instruction-cost=1 -S | FileCheck %s
Eventually we want this to be controlled using only -aarch64-base-histcnt-cost=1?
We can add a new flag for histcnt cost, but it's actually the gathers and scatters which prevent vectorization here. I've changed to use -sve-gather-overhead and -sve-scatter-overhead instead.
I do think we've made gathers and scatters too expensive by default for SVE, but we'll need to spend some time figuring out which loops get worse with it on and working out a better model.
Yep, sounds reasonable. Thanks.
Force-pushed from 22f9ebd to a965c25.
Added more tests, made histogram a separate dependency type, added support for transforming sub updates. Lots of cleanup.
Ping?
Thanks for the changes, look good. Added a couple of 'nits'.
Force-pushed from a965c25 to 3e7b720.
Rebased, improved comments.
Thanks for the changes.
@@ -1939,6 +1942,7 @@ class TargetTransformInfo::Concept {
                                     unsigned *Fast) = 0;
  virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
  virtual bool haveFastSqrt(Type *Ty) = 0;
  virtual InstructionCost getHistogramCost(Type *Ty) = 0;
Do we need a new hook for that, or would it be sufficient to implement this in getIntrinsicInstrCost? That way the cost changes could also be split off and tested separately.
Ah, I was looking for a method taking a Type and didn't see one. But the struct that method takes does include a Type, so I think it should be straightforward to use that. Will split off the cost part into a separate patch.
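For readers following the thread, a rough sketch of what folding this into the generic hook might look like (the SVE2 check and the cost value are illustrative placeholders, not the model that was eventually committed in #97397):

```cpp
InstructionCost
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {
  if (ICA.getID() == Intrinsic::experimental_vector_histogram_add) {
    // Without SVE2's HISTCNT there is no cheap lowering; report an
    // invalid cost so the vectorizer avoids the intrinsic.
    if (!ST->hasSVE2())
      return InstructionCost::getInvalid();
    // Placeholder cost: one HISTCNT plus the dependent update sequence.
    return InstructionCost(8);
  }
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
```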
Committed in #97397
We may need a separate method at some point to determine likely addressing modes, but AArch64 currently doesn't do that for gathers and scatters either. Something to do after the cost model refactoring to generate the cost from the plan.
BackwardVectorizableButPreventsForwarding
BackwardVectorizableButPreventsForwarding,
// Access is to a loop loaded value, but is part of a histogram operation.
Histogram
Hmm, IIUC this will change the meaning of LoopAccessInfo::canVectorizeMemory to "can vectorize memory, provided you guarantee to generate histogram intrinsics for all identified histograms"?
Yes. Would you prefer a second method to indicate that it's safe to vectorize with histograms? Or an argument or flag to allow it?
I've noticed that decision making has been migrating to using the cost model instead of outright rejecting loops in legality or LAA, so structured it that way. We can scalarize the intrinsic for targets without appropriate instructions (implemented in ScalarizeMaskedMemIntrin.cpp), so it seems safe enough for LoopVectorize.
I've added a separate interface (canVectorizeMemoryWithHistogram), along with a flag to enable vectorization of histograms. Currently off by default.
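Roughly, the split might look like the following (a sketch of the idea only; apart from canVectorizeMemoryWithHistogram itself, the names and members here are not taken from the patch):

```cpp
class LoopAccessInfo {
public:
  /// Memory accesses are vectorizable with no special handling at all.
  bool canVectorizeMemory() const { return CanVecMem && !HasHistogram; }

  /// Memory accesses are vectorizable provided the transform emits a
  /// histogram intrinsic for every detected histogram sequence.
  bool canVectorizeMemoryWithHistogram() const { return CanVecMem; }

private:
  bool CanVecMem = false;
  bool HasHistogram = false;
};
```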
Hmm, but the current version still has to analyze all stores for histograms even if the option is off? Just trying to see if there are alternatives to avoid pulling some very specific analysis logic into LAA.
IIUC, without the histogram analysis we would have an unknown dependence that LV could analyze separately if histograms are supported, as other passes such as LoopDistribute or LoopLoadElimination do?
Would be great if it were possible to support this without increasing the complexity of dependence analysis in LAA.
Adds a separate interface to distinguish between the existing semantics of canVectorizeMemory(), and the ability to do so if the transform pass can handle histograms. Also adds a flag to enable histogram support in LoopVecLegality, defaulting to off for now.
Force-pushed from 3e7b720 to 0d71291.
✅ With the latest revision this PR passed the C/C++ code formatter.
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/65/builds/1252
Adding some more comments inline; I think the current version is introducing crashes in a number of cases (when unrolling, and possibly with epilogue vectorization).
It would also be good to add additional test coverage, including missing negative tests for the memory analysis, SVE-independent tests, and a test to check VPlan printing.
As there are a number of potential issues, it might be good to revert while addressing those.
~VPHistogramRecipe() override = default;

VPHistogramRecipe *clone() override {
  llvm_unreachable("cloning not supported");
Can histograms never occur in epilogue-vectorized loops?
@@ -1517,6 +1518,35 @@ class VPWidenCallRecipe : public VPSingleDefRecipe {
#endif
};

class VPHistogramRecipe : public VPRecipeBase { |
doc comment missing?
@@ -888,6 +888,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
  case VPRecipeBase::VPWidenLoadSC:
  case VPRecipeBase::VPWidenStoreEVLSC:
  case VPRecipeBase::VPWidenStoreSC:
case VPRecipeBase::VPHistogramSC: |
marked as VPSingleDefRecipe here but not inheriting from VPSingleDefRecipe?
#endif

void VPHistogramRecipe::execute(VPTransformState &State) {
  assert(State.UF == 1 && "Tried interleaving histogram operation");
What guarantees the recipe is only created in plans that do not require interleaving? (I suspect the sve2-histcount.ll test will crash when an interleave count of 2 is forced, with SVE disabled.)
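One hypothetical way to uphold that guarantee (a sketch only, not code from the patch; the fallback helper is made up to keep the fragment self-contained):

```cpp
// Clamp the interleave count to 1 whenever the loop contains a histogram,
// so a plan with UF > 1 can never reach VPHistogramRecipe::execute().
unsigned
LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
                                                  InstructionCost LoopCost) {
  if (!Legal->getLAI()->getHistograms().empty())
    return 1;
  return computeInterleaveCountHeuristics(VF, LoopCost); // hypothetical
}
```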
@@ -6771,8 +6776,33 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
    // We've proven all lanes safe to speculate, fall through.
    [[fallthrough]];
  case Instruction::Add:
  case Instruction::Sub: {
    auto Info = Legal->getHistogramInfo(I);
AFAICT we are still adding the costs for the load/store replaced by the histogram?
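A possible shape for the fix, using the legality helper added in this patch (a fragment with a simplified context, meant to sit inside the per-instruction cost computation; not the actual resolution of this comment):

```cpp
// Give the bucket load and the add/sub a zero cost once they are known
// to be folded into the histogram intrinsic, so the intrinsic's own
// cost is not double-counted.
if (VF.isVector() && Legal->isHistogramLoadOrUpdate(I))
  return 0;
```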
///
/// It matches a pattern starting from \p HSt, which Stores to the 'buckets'
/// array the computed histogram. It uses a BinOp to sum all counts, storing
/// them using a loop-variant index Load from the 'indices' input array.
Would be good to also document the restrictions w.r.t. other accesses.
O << ", inc: ";
getOperand(1)->printAsOperand(O, SlotTracker);
O << ", mask: ";
getOperand(2)->printAsOperand(O, SlotTracker);
Test missing?
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/33/builds/594
…1458) This patch implements limited loop vectorization support for the 'all-in-one' histogram intrinsic. The feature is disabled by default, and when enabled will only vectorize if there are no other users of values in the gather-modify-scatter sequence.
llvm#98493) Reverts llvm#91458 to deal with post-commit reviewer requests.