Skip to content

Commit

Permalink
WIP histogram autovec
Browse files Browse the repository at this point in the history
Mostly functioning all-in-one intrinsic autovec
  • Loading branch information
huntergr-arm committed May 14, 2024
1 parent 8ad2f17 commit 22f9ebd
Show file tree
Hide file tree
Showing 16 changed files with 386 additions and 18 deletions.
26 changes: 24 additions & 2 deletions llvm/include/llvm/Analysis/LoopAccessAnalysis.h
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,8 @@ class MemoryDepChecker {
bool areDepsSafe(DepCandidates &AccessSets, MemAccessInfoList &CheckDeps,
const DenseMap<Value *, const SCEV *> &Strides,
const DenseMap<Value *, SmallVector<const Value *, 16>>
&UnderlyingObjects);
&UnderlyingObjects,
const SmallPtrSetImpl<const Value *> &HistogramPtrs);

/// No memory dependence was encountered that would inhibit
/// vectorization.
Expand Down Expand Up @@ -338,7 +339,8 @@ class MemoryDepChecker {
isDependent(const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B,
unsigned BIdx, const DenseMap<Value *, const SCEV *> &Strides,
const DenseMap<Value *, SmallVector<const Value *, 16>>
&UnderlyingObjects);
&UnderlyingObjects,
const SmallPtrSetImpl<const Value *> &HistogramPtrs);

/// Check whether the data dependence could prevent store-load
/// forwarding.
Expand Down Expand Up @@ -402,6 +404,15 @@ struct PointerDiffInfo {
NeedsFreeze(NeedsFreeze) {}
};

/// The three instructions that together form one histogram-style update
/// found in a loop.
struct HistogramInfo {
/// Load of the bucket value that is about to be updated.
Instruction *Load;
/// Operation that computes the new bucket value from the loaded one.
Instruction *Update;
/// Store that writes the updated value back.
Instruction *Store;

HistogramInfo(Instruction *Load, Instruction *Update, Instruction *Store)
: Load(Load), Update(Update), Store(Store) {}
};

/// Holds information about the memory runtime legality checks to verify
/// that a group of pointers do not overlap.
class RuntimePointerChecking {
Expand Down Expand Up @@ -621,6 +632,10 @@ class LoopAccessInfo {
unsigned getNumStores() const { return NumStores; }
unsigned getNumLoads() const { return NumLoads;}

/// Returns the histogram-style operations recorded for the analyzed loop.
const SmallVectorImpl<HistogramInfo> &getHistograms() const {
return Histograms;
}

/// The diagnostics report generated for the analysis. E.g. why we
/// couldn't analyze the loop.
const OptimizationRemarkAnalysis *getReport() const { return Report.get(); }
Expand Down Expand Up @@ -733,6 +748,13 @@ class LoopAccessInfo {
/// If an access has a symbolic strides, this maps the pointer value to
/// the stride symbol.
DenseMap<Value *, const SCEV *> SymbolicStrides;

/// Holds the load, update, and store instructions for all histogram-style
/// operations found in the loop.
SmallVector<HistogramInfo, 2> Histograms;

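/// Pointers through which histogram buckets are loaded and stored. The set is
/// handed to the dependence checker, which reports no dependence for a load
/// followed by a store through the same pointer when it is in this set.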
SmallPtrSet<const Value *, 2> HistogramPtrs;
};

/// Return the SCEV corresponding to a pointer with the symbolic stride
Expand Down
8 changes: 8 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -985,6 +985,9 @@ class TargetTransformInfo {
/// Return hardware support for population count.
PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;

/// Returns the cost of generating a vector histogram.
InstructionCost getHistogramCost(Type *Ty) const;

/// Return true if the hardware has a fast square-root instruction.
bool haveFastSqrt(Type *Ty) const;

Expand Down Expand Up @@ -1934,6 +1937,7 @@ class TargetTransformInfo::Concept {
unsigned *Fast) = 0;
virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
virtual bool haveFastSqrt(Type *Ty) = 0;
virtual InstructionCost getHistogramCost(Type *Ty) = 0;
virtual bool isExpensiveToSpeculativelyExecute(const Instruction *I) = 0;
virtual bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) = 0;
virtual InstructionCost getFPOpCost(Type *Ty) = 0;
Expand Down Expand Up @@ -2497,6 +2501,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
}
bool haveFastSqrt(Type *Ty) override { return Impl.haveFastSqrt(Ty); }

/// Forwards the histogram cost query for \p Ty to the wrapped implementation.
InstructionCost getHistogramCost(Type *Ty) override {
return Impl.getHistogramCost(Ty);
}

bool isExpensiveToSpeculativelyExecute(const Instruction* I) override {
return Impl.isExpensiveToSpeculativelyExecute(I);
}
Expand Down
4 changes: 4 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,10 @@ class TargetTransformInfoImplBase {

bool haveFastSqrt(Type *Ty) const { return false; }

/// Default histogram cost hook: always returns an invalid cost, regardless of
/// \p Ty.
InstructionCost getHistogramCost(Type *Ty) const {
return InstructionCost::getInvalid();
}

bool isExpensiveToSpeculativelyExecute(const Instruction *I) { return true; }

bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const { return true; }
Expand Down
4 changes: 4 additions & 0 deletions llvm/include/llvm/CodeGen/BasicTTIImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -540,6 +540,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
}

/// Histogram cost hook: always returns an invalid cost; \p Ty is not
/// inspected.
InstructionCost getHistogramCost(Type *Ty) {
return InstructionCost::getInvalid();
}

bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
return true;
}
Expand Down
17 changes: 17 additions & 0 deletions llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,23 @@ class LoopVectorizationLegality {
unsigned getNumStores() const { return LAI->getNumStores(); }
unsigned getNumLoads() const { return LAI->getNumLoads(); }

/// Returns true if \p I is the load or the update instruction of any
/// histogram recorded by the loop access analysis.
bool isHistogramLoadOrUpdate(Instruction *I) const {
  return llvm::any_of(LAI->getHistograms(), [I](const HistogramInfo &Entry) {
    return Entry.Load == I || Entry.Update == I;
  });
}

/// Looks up the histogram whose store instruction is \p SI. Returns a pointer
/// to the matching entry in the list returned by LAI->getHistograms(), or
/// std::nullopt when \p SI is not the store of any recorded histogram.
std::optional<const HistogramInfo *>
getHistogramForStore(StoreInst *SI) const {
  const SmallVectorImpl<HistogramInfo> &Known = LAI->getHistograms();
  auto Match = llvm::find_if(
      Known, [SI](const HistogramInfo &Entry) { return Entry.Store == SI; });
  if (Match == Known.end())
    return std::nullopt;
  return &*Match;
}

PredicatedScalarEvolution *getPredicatedScalarEvolution() const {
return &PSE;
}
Expand Down
142 changes: 134 additions & 8 deletions llvm/lib/Analysis/LoopAccessAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
Expand Down Expand Up @@ -70,6 +71,8 @@ using namespace llvm::PatternMatch;

#define DEBUG_TYPE "loop-accesses"

STATISTIC(HistogramsDetected, "Number of Histograms detected");

static cl::opt<unsigned, true>
VectorizationFactor("force-vector-width", cl::Hidden,
cl::desc("Sets the SIMD width. Zero is autoselect."),
Expand Down Expand Up @@ -731,6 +734,23 @@ class AccessAnalysis {
return UnderlyingObjects;
}

/// Try to match a histogram update that corresponds to high-level code of
/// the form:
/// \code
/// buckets[indices[i]] += step;
/// \endcode
///
/// Matching starts from \p HSt, the store that writes the updated value to
/// the 'buckets' array. The stored value must be an add or sub whose first
/// operand is a load from the same GEP that \p HSt stores to, and the GEP
/// index must be (an extension of) an integer load from the 'indices' array
/// whose address is an add-recurrence.
///
/// On a successful match the STATISTIC 'HistogramsDetected' is incremented,
/// the load/update/store triple is appended to \p Histograms, and the bucket
/// pointer is inserted into \p HistogramPtrs. On a mismatch nothing is
/// recorded. This function does not itself query target support.
void findHistograms(StoreInst *HSt,
SmallVectorImpl<HistogramInfo> &Histograms,
SmallPtrSetImpl<const Value *> &HistogramPtrs);

private:
typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap;

Expand Down Expand Up @@ -1948,7 +1968,8 @@ getDependenceDistanceStrideAndSize(
const AccessAnalysis::MemAccessInfo &B, Instruction *BInst,
const DenseMap<Value *, const SCEV *> &Strides,
const DenseMap<Value *, SmallVector<const Value *, 16>> &UnderlyingObjects,
PredicatedScalarEvolution &PSE, const Loop *InnermostLoop) {
PredicatedScalarEvolution &PSE, const Loop *InnermostLoop,
const SmallPtrSetImpl<const Value *> &HistogramPtrs) {
auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
auto &SE = *PSE.getSE();
auto [APtr, AIsWrite] = A;
Expand All @@ -1966,6 +1987,15 @@ getDependenceDistanceStrideAndSize(
BPtr->getType()->getPointerAddressSpace())
return MemoryDepChecker::Dependence::Unknown;

// Ignore histogram count updates, as they are handled by the intrinsic. This
// happens when the same pointer is first used to read from and is then used
// to write to.
if (!AIsWrite && BIsWrite && APtr == BPtr && HistogramPtrs.contains(APtr)) {
  // Terminate the line so the next debug message does not run into the
  // printed pointer.
  LLVM_DEBUG(dbgs() << "LAA: Histogram: Update is safely ignored. Pointer: "
                    << *APtr << "\n");
  return MemoryDepChecker::Dependence::NoDep;
}

int64_t StrideAPtr =
getPtrStride(PSE, ATy, APtr, InnermostLoop, Strides, true).value_or(0);
int64_t StrideBPtr =
Expand Down Expand Up @@ -2022,15 +2052,15 @@ getDependenceDistanceStrideAndSize(
MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B,
unsigned BIdx, const DenseMap<Value *, const SCEV *> &Strides,
const DenseMap<Value *, SmallVector<const Value *, 16>>
&UnderlyingObjects) {
const DenseMap<Value *, SmallVector<const Value *, 16>> &UnderlyingObjects,
const SmallPtrSetImpl<const Value *> &HistogramPtrs) {
assert(AIdx < BIdx && "Must pass arguments in program order");

// Get the dependence distance, stride, type size and what access writes for
// the dependence between A and B.
auto Res = getDependenceDistanceStrideAndSize(
A, InstMap[AIdx], B, InstMap[BIdx], Strides, UnderlyingObjects, PSE,
InnermostLoop);
InnermostLoop, HistogramPtrs);
if (std::holds_alternative<Dependence::DepType>(Res))
return std::get<Dependence::DepType>(Res);

Expand Down Expand Up @@ -2266,8 +2296,8 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
bool MemoryDepChecker::areDepsSafe(
DepCandidates &AccessSets, MemAccessInfoList &CheckDeps,
const DenseMap<Value *, const SCEV *> &Strides,
const DenseMap<Value *, SmallVector<const Value *, 16>>
&UnderlyingObjects) {
const DenseMap<Value *, SmallVector<const Value *, 16>> &UnderlyingObjects,
const SmallPtrSetImpl<const Value *> &HistogramPtrs) {

MinDepDistBytes = -1;
SmallPtrSet<MemAccessInfo, 8> Visited;
Expand Down Expand Up @@ -2312,7 +2342,7 @@ bool MemoryDepChecker::areDepsSafe(

Dependence::DepType Type =
isDependent(*A.first, A.second, *B.first, B.second, Strides,
UnderlyingObjects);
UnderlyingObjects, HistogramPtrs);
mergeInStatus(Dependence::isSafeForVectorization(Type));

// Gather dependences unless we accumulated MaxDependences
Expand Down Expand Up @@ -2648,6 +2678,9 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
// check.
Accesses.buildDependenceSets();

for (StoreInst *ST : Stores)
Accesses.findHistograms(ST, Histograms, HistogramPtrs);

// Find pointers with computable bounds. We are going to use this information
// to place a runtime bound check.
Value *UncomputablePtr = nullptr;
Expand All @@ -2672,7 +2705,7 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
CanVecMem = DepChecker->areDepsSafe(
DependentAccesses, Accesses.getDependenciesToCheck(), SymbolicStrides,
Accesses.getUnderlyingObjects());
Accesses.getUnderlyingObjects(), HistogramPtrs);

if (!CanVecMem && DepChecker->shouldRetryWithRuntimeCheck()) {
LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
Expand Down Expand Up @@ -3127,6 +3160,99 @@ const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L) {
return *I.first->second;
}

/// Try to match a histogram update rooted at the store \p HSt, i.e. code of
/// the form `buckets[indices[i]] += step` (or `-=`).
///
/// On success, increments the 'HistogramsDetected' statistic, appends the
/// load/update/store triple to \p Histograms and inserts the bucket pointer
/// into \p HistogramPtrs. On any mismatch nothing is recorded.
void AccessAnalysis::findHistograms(
    StoreInst *HSt, SmallVectorImpl<HistogramInfo> &Histograms,
    SmallPtrSetImpl<const Value *> &HistogramPtrs) {
  LLVM_DEBUG(dbgs() << "LAA: Attempting to match histogram from " << *HSt
                    << "\n");
  // Store value must come from a Binary Operation.
  Instruction *HPtrInstr = nullptr;
  BinaryOperator *HBinOp = nullptr;
  if (!match(HSt, m_Store(m_BinOp(HBinOp), m_Instruction(HPtrInstr)))) {
    LLVM_DEBUG(dbgs() << "\tNo BinOp\n");
    return;
  }

  // BinOp must be an Add or a Sub whose first operand is a load from the very
  // pointer that is stored to.
  // FIXME: We assume the increment is on the RHS, and nothing here verifies
  //        that it is loop invariant. Fine for an immediate/constant, but
  //        not for a generic value.
  if (!match(HBinOp, m_Add(m_Load(m_Specific(HPtrInstr)), m_Value())) &&
      !match(HBinOp, m_Sub(m_Load(m_Specific(HPtrInstr)), m_Value()))) {
    LLVM_DEBUG(dbgs() << "\tNo matching load\n");
    return;
  }
  Instruction *IndexedLoad = cast<Instruction>(HBinOp->getOperand(0));

  // The address to store to must be calculated through a GEP instruction.
  GetElementPtrInst *HPtr = dyn_cast<GetElementPtrInst>(HPtrInstr);
  if (!HPtr) {
    LLVM_DEBUG(dbgs() << "\tStore pointer is not a GEP\n");
    return;
  }

  // Require exactly one index (base pointer + index == 2 operands). A GEP
  // without indices must be rejected too, since operand 1 is read below.
  // FIXME: Support GEPs with more operands.
  if (HPtr->getNumOperands() != 2) {
    LLVM_DEBUG(dbgs() << "\tUnsupported number of GEP operands\n");
    return;
  }

  // Check that the index is calculated by loading from another array. Ignore
  // any extensions.
  // FIXME: Support indices from other sources than a linear load from memory?
  Value *HIdx = HPtr->getOperand(1);
  Instruction *IdxInst = nullptr;
  // This only fails when the index itself is not an instruction (e.g. an
  // argument or a constant); an extension of a non-instruction binds the
  // extension and is rejected by the load check below.
  if (!match(HIdx, m_ZExtOrSExtOrSelf(m_Instruction(IdxInst)))) {
    LLVM_DEBUG(dbgs() << "\tIndex is not an instruction\n");
    return;
  }

  // Currently restricting this to linear addressing when loading indices.
  LoadInst *VLoad = dyn_cast<LoadInst>(IdxInst);
  if (!VLoad) {
    LLVM_DEBUG(dbgs() << "\tBad Index Load\n");
    return;
  }

  Value *VPtrVal = VLoad->getPointerOperand();
  if (!isa<SCEVAddRecExpr>(PSE.getSCEV(VPtrVal))) {
    LLVM_DEBUG(dbgs() << "\tCannot determine index load stride\n");
    return;
  }

  // Only 32- and 64-bit integer indices are accepted.
  // FIXME: support smaller types of input arrays. Integers can be promoted
  //        for codegen.
  // NOTE(review): this inspects the type of the index load; the bucket
  // element type is not checked here.
  Type *VLoadTy = VLoad->getType();
  if (!VLoadTy->isIntegerTy(32) && !VLoadTy->isIntegerTy(64)) {
    LLVM_DEBUG(dbgs() << "\tUnsupported index type: " << *VLoadTy << "\n");
    return;
  }

  // Ensure we'll have the same mask by checking that all parts of the histogram
  // are in the same block.
  // FIXME: Could use dominance checks instead?
  if (IndexedLoad->getParent() != HBinOp->getParent() ||
      IndexedLoad->getParent() != HSt->getParent()) {
    LLVM_DEBUG(dbgs() << "\tDifferent parent blocks\n");
    return;
  }

  // A histogram pointer must only have two uses, the load and the store. This
  // does not depend on any alias set, so check it once up front.
  if (!HPtr->hasNUses(2)) {
    LLVM_DEBUG(dbgs() << "\tBucket pointer has uses other than load/store\n");
    return;
  }

  // A histogram pointer may only alias to itself.
  for (AliasSet &AS : AST)
    if ((AS.isMustAlias() || AS.isMayAlias()) &&
        is_contained(AS.getPointers(), HPtr) && AS.size() > 1) {
      LLVM_DEBUG(dbgs() << "\tAliasing problem\n");
      return;
    }

  LLVM_DEBUG(dbgs() << "LAA: Found Histogram Operation: " << *HBinOp << "\n");
  HistogramsDetected++;

  // Store the operations that make up the histogram.
  Histograms.emplace_back(IndexedLoad, HBinOp, HSt);
  // Store pointers used to write those counts in the computed histogram.
  HistogramPtrs.insert(HPtr);
}

bool LoopAccessInfoManager::invalidate(
Function &F, const PreservedAnalyses &PA,
FunctionAnalysisManager::Invalidator &Inv) {
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -658,6 +658,10 @@ bool TargetTransformInfo::haveFastSqrt(Type *Ty) const {
return TTIImpl->haveFastSqrt(Ty);
}

// Delegates the histogram cost query for Ty to the underlying TTI
// implementation.
InstructionCost TargetTransformInfo::getHistogramCost(Type *Ty) const {
return TTIImpl->getHistogramCost(Ty);
}

bool TargetTransformInfo::isExpensiveToSpeculativelyExecute(
const Instruction *I) const {
return TTIImpl->isExpensiveToSpeculativelyExecute(I);
Expand Down
Loading

0 comments on commit 22f9ebd

Please sign in to comment.