Commit: Explicitly stage strided loads (halide#7230)
* Add a pass to do explicit densification of strided loads (see the IR-level sketch after this change list)

* densify more types of strided load

* Reorder downsample in local laplacian for slightly better performance

* Move allocation padding into the IR. Still WIP.

* Simplify concat_bits handling

* Use evidence from parent scopes to densify

* Disallow padding allocations with custom new expressions

* Add test for parent scopes

* Remove debugging prints. Avoid nested ramps.

* Avoid parent scope loops

* Update cmakefiles

* Fix for large_buffers

* Pad stack allocations too

* Restore vld2/3/4 generation on non-Apple ARM chips

* Appease clang-format and clang-tidy

* Silence clang-tidy

* Better comments

* Comment improvements

* Nuke code that reads out of bounds

* Fix stage_strided_loads test

* Change strategy for loads from external buffers

Some backends don't like non-power-of-two vectors. Do two overlapping
half-sized loads and shuffle instead of one funny-sized load (a scalar sketch
of this strategy follows the change list below).

* Add explanatory comment to ARM backend

* Fix cpp backend shuffling

* Fix missing msan annotations

* Magnify heap cost effect in stack_vs_heap performance test

* Address review comments

* clang-tidy

* Fix for when same load node occurs in two different allocate nodes
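For readers unfamiliar with the new pass, here is a hypothetical, simplified illustration (not the actual StageStridedLoads implementation) of what "densification" means at the Halide IR level: a strided vector load becomes a slice shuffle of a wider dense load over the same footprint, so backends see a single contiguous access. The buffer name "f" and the lane counts are made up for the example.

#include "Halide.h"
#include <iostream>

using namespace Halide;
using namespace Halide::Internal;

int main() {
    // A strided load: f[ramp(x, 3, 8)] -- 8 lanes, stride 3.
    Expr x = Variable::make(Int(32), "x");
    Expr strided = Load::make(Float(32, 8), "f", Ramp::make(x, 3, 8),
                              Buffer<>(), Parameter(), const_true(8), ModulusRemainder());

    // Densified form: one dense 22-lane load over the same footprint
    // ((8 - 1) * 3 + 1 = 22 elements), sliced with begin 0 and stride 3.
    Expr dense = Load::make(Float(32, 22), "f", Ramp::make(x, 1, 22),
                            Buffer<>(), Parameter(), const_true(22), ModulusRemainder());
    Expr staged = Shuffle::make_slice(dense, 0, 3, 8);

    std::cout << strided << "\n  becomes\n" << staged << "\n";
    return 0;
}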
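And a minimal scalar model (not Halide code) of the external-buffer strategy mentioned above, assuming stride 2 and 8 result lanes: the 15-element footprint is covered by two overlapping 8-wide loads instead of one awkward 15-wide load, then the strided elements are shuffled out.

#include <array>
#include <cstdio>

int main() {
    float buf[32];
    for (int i = 0; i < 32; i++) buf[i] = float(i);

    constexpr int base = 3, stride = 2, lanes = 8;

    // Two overlapping half-sized dense "loads" covering [base, base + 15).
    std::array<float, 8> lo, hi;
    for (int i = 0; i < 8; i++) lo[i] = buf[base + i];       // footprint elements 0..7
    for (int i = 0; i < 8; i++) hi[i] = buf[base + 7 + i];   // footprint elements 7..14

    // Shuffle every stride-th element out of the two halves.
    std::array<float, lanes> result;
    for (int i = 0; i < lanes; i++) {
        int idx = i * stride;
        result[i] = idx < 8 ? lo[idx] : hi[idx - 7];
    }

    for (float f : result) printf("%g ", f);  // prints: 3 5 7 9 11 13 15 17
    printf("\n");
    return 0;
}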
abadams authored and ardier committed Mar 3, 2024
1 parent 056a1c5 commit 15de537
Showing 41 changed files with 1,004 additions and 210 deletions.
2 changes: 2 additions & 0 deletions Makefile
@@ -559,6 +559,7 @@ SOURCE_FILES = \
Solve.cpp \
SpirvIR.cpp \
SplitTuples.cpp \
StageStridedLoads.cpp \
StmtToHtml.cpp \
StorageFlattening.cpp \
StorageFolding.cpp \
@@ -725,6 +726,7 @@ HEADER_FILES = \
SlidingWindow.h \
Solve.h \
SplitTuples.h \
StageStridedLoads.h \
StmtToHtml.h \
StorageFlattening.h \
StorageFolding.h \
6 changes: 3 additions & 3 deletions apps/local_laplacian/local_laplacian_generator.cpp
@@ -227,9 +227,9 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> {
Func downsample(Func f) {
using Halide::_;
Func downx, downy;
downx(x, y, _) = (f(2 * x - 1, y, _) + 3.0f * (f(2 * x, y, _) + f(2 * x + 1, y, _)) + f(2 * x + 2, y, _)) / 8.0f;
downy(x, y, _) = (downx(x, 2 * y - 1, _) + 3.0f * (downx(x, 2 * y, _) + downx(x, 2 * y + 1, _)) + downx(x, 2 * y + 2, _)) / 8.0f;
return downy;
downy(x, y, _) = (f(x, 2 * y - 1, _) + 3.0f * (f(x, 2 * y, _) + f(x, 2 * y + 1, _)) + f(x, 2 * y + 2, _)) / 8.0f;
downx(x, y, _) = (downy(2 * x - 1, y, _) + 3.0f * (downy(2 * x, y, _) + downy(2 * x + 1, y, _)) + downy(2 * x + 2, y, _)) / 8.0f;
return downx;
}

// Upsample using bilinear interpolation
3 changes: 2 additions & 1 deletion src/AddAtomicMutex.cpp
@@ -338,7 +338,8 @@ class AddAtomicMutex : public IRMutator {
op->condition,
std::move(body),
op->new_expr,
op->free_function);
op->free_function,
op->padding);
}

Stmt visit(const ProducerConsumer *op) override {
2 changes: 1 addition & 1 deletion src/AsyncProducers.cpp
@@ -80,7 +80,7 @@ class NoOpCollapsingMutator : public IRMutator {
} else {
return Allocate::make(op->name, op->type, op->memory_type,
op->extents, op->condition, body,
op->new_expr, op->free_function);
op->new_expr, op->free_function, op->padding);
}
}

4 changes: 2 additions & 2 deletions src/BoundSmallAllocations.cpp
@@ -140,7 +140,7 @@ class BoundSmallAllocations : public IRMutator {
if (size_ptr && size == 0 && !op->new_expr.defined()) {
// This allocation is dead
return Allocate::make(op->name, op->type, op->memory_type, {0}, const_false(),
mutate(op->body), op->new_expr, op->free_function);
mutate(op->body), op->new_expr, op->free_function, op->padding);
}

// 128 bytes is a typical minimum allocation size in
@@ -155,7 +155,7 @@ class BoundSmallAllocations : public IRMutator {
user_assert(size >= 0 && size < (int64_t)1 << 31)
<< "Allocation " << op->name << " has a size greater than 2^31: " << bound << "\n";
return Allocate::make(op->name, op->type, op->memory_type, {(int32_t)size}, op->condition,
mutate(op->body), op->new_expr, op->free_function);
mutate(op->body), op->new_expr, op->free_function, op->padding);
} else {
return IRMutator::visit(op);
}
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
@@ -145,6 +145,7 @@ set(HEADER_FILES
SlidingWindow.h
Solve.h
SplitTuples.h
StageStridedLoads.h
StmtToHtml.h
StorageFlattening.h
StorageFolding.h
@@ -324,6 +325,7 @@ set(SOURCE_FILES
Solve.cpp
SpirvIR.cpp
SplitTuples.cpp
StageStridedLoads.cpp
StmtToHtml.cpp
StorageFlattening.cpp
StorageFolding.cpp
113 changes: 111 additions & 2 deletions src/CodeGen_ARM.cpp
@@ -8,6 +8,7 @@
#include "Debug.h"
#include "IREquality.h"
#include "IRMatch.h"
#include "IRMutator.h"
#include "IROperator.h"
#include "IRPrinter.h"
#include "LLVM_Headers.h"
@@ -30,6 +31,71 @@ using namespace llvm;

namespace {

// Substitute in loads that feed into slicing shuffles, to help with vld2/3/4
// emission. These are commonly lifted as lets because they get used by multiple
// interleaved slices of the same load.
class SubstituteInStridedLoads : public IRMutator {
Scope<Expr> loads;
std::map<std::string, std::vector<std::string>> vars_per_buffer;
std::set<std::string> poisoned_vars;

template<typename LetOrLetStmt>
auto visit_let(const LetOrLetStmt *op) -> decltype(op->body) {
const Load *l = op->value.template as<Load>();
const Ramp *r = l ? l->index.as<Ramp>() : nullptr;
auto body = op->body;
if (r && is_const_one(r->stride)) {
ScopedBinding bind(loads, op->name, op->value);
vars_per_buffer[l->name].push_back(op->name);
body = mutate(op->body);
vars_per_buffer[l->name].pop_back();
poisoned_vars.erase(l->name);
} else {
body = mutate(op->body);
}

// Unconditionally preserve the let, because there may be unsubstituted uses of
// it. It'll get dead-stripped by LLVM if not.
return LetOrLetStmt::make(op->name, op->value, body);
}

Expr visit(const Let *op) override {
return visit_let(op);
}

Stmt visit(const LetStmt *op) override {
return visit_let(op);
}

// Avoid substituting a load over an intervening store
Stmt visit(const Store *op) override {
auto it = vars_per_buffer.find(op->name);
if (it != vars_per_buffer.end()) {
for (const auto &v : it->second) {
poisoned_vars.insert(v);
}
}
return IRMutator::visit(op);
}

Expr visit(const Shuffle *op) override {
int stride = op->slice_stride();
const Variable *var = op->vectors[0].as<Variable>();
if (var &&
poisoned_vars.count(var->name) == 0 &&
op->vectors.size() == 1 &&
2 <= stride && stride <= 4 &&
op->slice_begin() < stride &&
loads.contains(var->name)) {
return Shuffle::make_slice({loads.get(var->name)}, op->slice_begin(), op->slice_stride(), op->type.lanes());
} else {
return IRMutator::visit(op);
}
}

using IRMutator::visit;
};

/** A code generator that emits ARM code from a given Halide stmt. */
class CodeGen_ARM : public CodeGen_Posix {
public:
@@ -42,7 +108,10 @@ class CodeGen_ARM : public CodeGen_Posix {
/** Assuming 'inner' is a function that takes two vector arguments, define a wrapper that
* takes one vector argument and splits it into two to call inner. */
llvm::Function *define_concat_args_wrapper(llvm::Function *inner, const string &name);

void init_module() override;
void compile_func(const LoweredFunc &f,
const std::string &simple_name, const std::string &extern_name) override;

/** Nodes for which we want to emit specific neon intrinsics */
// @{
@@ -52,6 +121,7 @@ class CodeGen_ARM : public CodeGen_Posix {
void visit(const Max *) override;
void visit(const Store *) override;
void visit(const Load *) override;
void visit(const Shuffle *) override;
void visit(const Call *) override;
void visit(const LT *) override;
void visit(const LE *) override;
@@ -770,6 +840,22 @@ void CodeGen_ARM::init_module() {
}
}

void CodeGen_ARM::compile_func(const LoweredFunc &f,
const string &simple_name,
const string &extern_name) {

LoweredFunc func = f;

if (target.os != Target::IOS && target.os != Target::OSX) {
// Substitute in strided loads to get vld2/3/4 emission. We don't do it
// on Apple silicon, because doing a dense load and then shuffling is
// actually faster.
func.body = SubstituteInStridedLoads().mutate(func.body);
}

CodeGen_Posix::compile_func(func, simple_name, extern_name);
}

void CodeGen_ARM::visit(const Cast *op) {
if (!neon_intrinsics_disabled() && op->type.is_vector()) {
vector<Expr> matches;
@@ -1092,9 +1178,9 @@ void CodeGen_ARM::visit(const Load *op) {
return;
}

// If the stride is in [-1, 4], we can deal with that using vanilla codegen
// If the stride is in [-1, 1], we can deal with that using vanilla codegen
const IntImm *stride = ramp ? ramp->stride.as<IntImm>() : nullptr;
if (stride && (-1 <= stride->value && stride->value <= 4)) {
if (stride && (-1 <= stride->value && stride->value <= 1)) {
CodeGen_Posix::visit(op);
return;
}
@@ -1123,6 +1209,29 @@
CodeGen_Posix::visit(op);
}

void CodeGen_ARM::visit(const Shuffle *op) {
// For small strided loads on non-Apple hardware, we may want to use vld2,
// vld3, vld4, etc. These show up in the IR as slice shuffles of wide dense
// loads. LLVM expects the same. The base codegen class breaks the loads
// into native vectors, which triggers shuffle instructions rather than
// vld2, vld3, vld4. So here we explicitly do the load as a single big dense
// load.
int stride = op->slice_stride();
const Load *load = op->vectors[0].as<Load>();
if (target.os != Target::IOS && target.os != Target::OSX &&
load &&
op->vectors.size() == 1 &&
2 <= stride && stride <= 4 &&
op->slice_begin() < stride &&
load->type.lanes() == stride * op->type.lanes()) {

value = codegen_dense_vector_load(load, nullptr, /* slice_to_native */ false);
value = shuffle_vectors(value, op->indices);
} else {
CodeGen_Posix::visit(op);
}
}

void CodeGen_ARM::visit(const Call *op) {
if (op->is_intrinsic(Call::sorted_avg)) {
value = codegen(halving_add(op->args[0], op->args[1]));
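To illustrate the ARM changes above (the SubstituteInStridedLoads pre-pass plus the new Shuffle visitor), here is a hypothetical Halide pipeline whose stride-3 access should now lower to a vld3-style wide load on non-Apple AArch64. The pipeline, function names, and target string are illustrative, not taken from the commit.

#include "Halide.h"
using namespace Halide;

int main() {
    ImageParam in(UInt(8), 1);
    Var x("x");
    Func deinterleave("deinterleave");

    // Extract one channel of packed 3-channel data: a stride-3 load.
    deinterleave(x) = in(3 * x);
    deinterleave.vectorize(x, 16);

    // On non-Apple AArch64 the staged strided load stays a slice of one wide
    // dense load, which LLVM can turn into vld3 rather than shuffles of
    // native-width loads. Any non-Apple ARM target shows the effect.
    deinterleave.compile_to_assembly("deinterleave.s", {in}, "deinterleave",
                                     Target("arm-64-linux"));
    return 0;
}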
15 changes: 7 additions & 8 deletions src/CodeGen_C.cpp
@@ -596,8 +596,8 @@ class CppVectorOps {
}
}
template<int... Indices>
static Vec shuffle(const Vec &a) {
template<int... Indices, typename InputVec>
static Vec shuffle(const InputVec &a) {
static_assert(sizeof...(Indices) == Lanes, "shuffle() requires an exact match of lanes");
Vec r = { a[Indices]... };
return r;
@@ -1129,15 +1129,14 @@ class NativeVectorOps {
}
}
template<int... Indices>
static Vec shuffle(const Vec a) {
template<int... Indices, typename InputVec>
static Vec shuffle(const InputVec a) {
static_assert(sizeof...(Indices) == Lanes, "shuffle() requires an exact match of lanes");
#if __has_builtin(__builtin_shufflevector)
// Clang
// Exists in clang and gcc >= 12. Gcc's __builtin_shuffle can't
// be used, because it can't handle changing the number of vector
// lanes between input and output.
return __builtin_shufflevector(a, a, Indices...);
#elif __has_builtin(__builtin_shuffle) || defined(__GNUC__)
// GCC
return __builtin_shuffle(a, NativeVector<int, sizeof...(Indices)>{Indices...});
#else
Vec r = { a[Indices]... };
return r;
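A standalone illustration (not from the commit) of the point made in the CodeGen_C comment above: __builtin_shufflevector, available in clang and gcc >= 12, can change the number of lanes between input and output, which gcc's __builtin_shuffle cannot.

#include <cstdio>

typedef int int4 __attribute__((vector_size(16)));
typedef int int2 __attribute__((vector_size(8)));

int main() {
    int4 a = {10, 11, 12, 13};
    // Slice the even lanes: 4 input lanes in, 2 output lanes out.
    int2 evens = __builtin_shufflevector(a, a, 0, 2);
    printf("%d %d\n", evens[0], evens[1]);  // prints: 10 12
    return 0;
}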
2 changes: 1 addition & 1 deletion src/CodeGen_D3D12Compute_Dev.cpp
@@ -1059,7 +1059,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::add_kernel(Stmt s,

Stmt new_alloc = Allocate::make(new_name, op->type, op->memory_type, new_extents,
std::move(new_condition), std::move(new_body),
std::move(new_new_expr), op->free_function);
std::move(new_new_expr), op->free_function, op->padding);

allocs.push_back(new_alloc);
replacements.erase(op->name);
20 changes: 9 additions & 11 deletions src/CodeGen_Hexagon.cpp
@@ -70,12 +70,6 @@ class CodeGen_Hexagon : public CodeGen_Posix {
void visit(const Allocate *) override;
///@}

/** We ask for an extra vector on each allocation to enable fast
* clamped ramp loads. */
int allocation_padding(Type type) const override {
return CodeGen_Posix::allocation_padding(type) + native_vector_bits() / 8;
}

/** Call an LLVM intrinsic, potentially casting the operands to
* match the type of the function. */
///@{
@@ -123,7 +117,7 @@ class CodeGen_Hexagon : public CodeGen_Posix {
* list of its extents and its size. Fires a runtime assert
* (halide_error) if the size overflows 2^31 -1, the maximum
* positive number an int32_t can hold. */
llvm::Value *codegen_cache_allocation_size(const std::string &name, Type type, const std::vector<Expr> &extents);
llvm::Value *codegen_cache_allocation_size(const std::string &name, Type type, const std::vector<Expr> &extents, int padding);

/** Generate a LUT (8/16 bit, max_index < 256) lookup using vlut instructions. */
llvm::Value *vlut256(llvm::Value *lut, llvm::Value *indices, int min_index = 0, int max_index = 255);
@@ -2099,7 +2093,8 @@ void CodeGen_Hexagon::visit(const Select *op) {
}

Value *CodeGen_Hexagon::codegen_cache_allocation_size(
const std::string &name, Type type, const std::vector<Expr> &extents) {
const std::string &name, Type type,
const std::vector<Expr> &extents, int padding) {
// Compute size from list of extents checking for overflow.

Expr overflow = make_zero(UInt(32));
@@ -2131,6 +2126,9 @@ Value *CodeGen_Hexagon::codegen_cache_allocation_size(
// is still an 8-bit number.
overflow = overflow | (total_size_hi >> 24);
}
int padding_bytes = padding * type.bytes();
overflow = overflow | (total_size + padding_bytes < total_size);
total_size += padding_bytes;

Expr max_size = make_const(UInt(32), target.maximum_buffer_size());
Expr size_check = (overflow == 0) && (total_size <= max_size);
@@ -2169,7 +2167,7 @@ void CodeGen_Hexagon::visit(const Allocate *alloc) {
llvm_size = codegen(Expr(constant_bytes));
} else {
llvm_size = codegen_cache_allocation_size(alloc->name, alloc->type,
alloc->extents);
alloc->extents, alloc->padding);
}

// Only allocate memory if the condition is true, otherwise 0.
@@ -2262,13 +2260,13 @@ void CodeGen_Hexagon::visit(const Allocate *alloc) {
for (const auto &extent : alloc->extents) {
size *= extent;
}
size += allocation_padding(alloc->type);
size += alloc->padding * alloc->type.bytes();
Expr new_expr =
Call::make(Handle(), "halide_vtcm_malloc", {size}, Call::Extern);
string free_function = "halide_vtcm_free";
Stmt new_alloc = Allocate::make(
alloc->name, alloc->type, alloc->memory_type, alloc->extents,
alloc->condition, alloc->body, new_expr, free_function);
alloc->condition, alloc->body, new_expr, free_function, alloc->padding);
new_alloc.accept(this);
} else {
// For all other memory types
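The Hexagon change above folds the allocation padding (now carried on the Allocate node) into the 32-bit size computation with a wrap-around check. A self-contained illustration of that idiom, with made-up values:

#include <cstdint>
#include <cstdio>

int main() {
    uint32_t total_size = 0xFFFFFFF0u;  // pretend size computed from extents
    uint32_t padding_bytes = 0x20u;     // pretend padding * type.bytes()
    uint32_t overflow = 0;

    // If the unsigned addition wraps, the sum is smaller than either operand.
    overflow |= (total_size + padding_bytes < total_size) ? 1u : 0u;
    total_size += padding_bytes;

    printf("total_size=%u overflow=%u\n", total_size, overflow);  // total_size=16 overflow=1
    return 0;
}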
(Remaining file diffs not shown.)
