Explicitly stage strided loads #7230

Merged (29 commits, Dec 16, 2022)
Changes shown are from 26 of the 29 commits.

Commits
1fd8bb0
Add a pass to do explicit densification of strided loads
abadams Dec 8, 2022
009018b
densify more types of strided load
abadams Dec 8, 2022
86961c9
Reorder downsample in local laplacian for slightly better performance
abadams Dec 9, 2022
b880336
Move allocation padding into the IR. Still WIP.
abadams Dec 9, 2022
331e43f
Simplify concat_bits handling
abadams Dec 9, 2022
ffe6f0a
Use evidence from parent scopes to densify
abadams Dec 9, 2022
964c940
Disallow padding allocations with custom new expressions
abadams Dec 9, 2022
523dc69
Add test for parent scopes
abadams Dec 9, 2022
0260fb9
Remove debugging prints. Avoid nested ramps.
abadams Dec 9, 2022
d856df3
Avoid parent scope loops
abadams Dec 9, 2022
aa92026
Update cmakefiles
abadams Dec 9, 2022
69b486d
Fix for large_buffers
abadams Dec 9, 2022
37b1cc5
Pad stack allocations too
abadams Dec 9, 2022
05733e7
Restore vld2/3/4 generation on non-Apple ARM chips
abadams Dec 10, 2022
b9e7417
Appease clang-format and clang-tidy
abadams Dec 12, 2022
ae0d0d8
Silence clang-tidy
abadams Dec 12, 2022
0d977e7
Better comments
abadams Dec 12, 2022
96ac6c0
Comment improvements
abadams Dec 12, 2022
4f733aa
Nuke code that reads out of bounds
abadams Dec 13, 2022
ff9a1b6
Fix stage_strided_loads test
abadams Dec 13, 2022
8f22adb
Change strategy for loads from external buffers
abadams Dec 13, 2022
b1dd3a2
Add explanatory comment to ARM backend
abadams Dec 13, 2022
068412f
Fix cpp backend shuffling
abadams Dec 13, 2022
df3bf08
Fix missing msan annotations
abadams Dec 13, 2022
a1f2a12
Magnify heap cost effect in stack_vs_heap performance test
abadams Dec 13, 2022
f3a9e11
Merge branch 'abadams/stage_strided_loads' of https://github.com/hali…
abadams Dec 13, 2022
ee6cea3
Address review comments
abadams Dec 15, 2022
2f7bf16
clang-tidy
abadams Dec 15, 2022
16b90bb
Fix for when same load node occurs in two different allocate nodes
abadams Dec 15, 2022
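
For context on what "staging" a strided load means here, the sketch below is a minimal Halide pipeline (an illustration written for this summary, not code from the PR; the names and schedule are invented) whose vectorized stride-2 access is the kind of pattern the new StageStridedLoads pass targets. Instead of emitting a strided vector load, the load is staged as one dense load of twice the lanes plus a slicing Shuffle; for internal allocations the buffer gets explicit padding so the wider load stays in bounds, while loads from external buffers use a different strategy (commit 8f22adb). The ARM backend changes further down then turn that pattern into vld2/3/4 where profitable.

    #include "Halide.h"
    using namespace Halide;

    int main() {
        ImageParam in(UInt(8), 1);
        Func evens("evens");
        Var x("x");
        // A stride-2 access: once vectorized by 16, this reads every other
        // byte of a 32-byte window of 'in'.
        evens(x) = in(2 * x);
        evens.vectorize(x, 16);
        // Dump the lowered IR to inspect the dense load + slice shuffle.
        evens.compile_to_lowered_stmt("evens.stmt", {in});
        return 0;
    }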
2 changes: 2 additions & 0 deletions Makefile
@@ -559,6 +559,7 @@ SOURCE_FILES = \
Solve.cpp \
SpirvIR.cpp \
SplitTuples.cpp \
StageStridedLoads.cpp \
StmtToHtml.cpp \
StorageFlattening.cpp \
StorageFolding.cpp \
@@ -725,6 +726,7 @@ HEADER_FILES = \
SlidingWindow.h \
Solve.h \
SplitTuples.h \
StageStridedLoads.h \
StmtToHtml.h \
StorageFlattening.h \
StorageFolding.h \
6 changes: 3 additions & 3 deletions apps/local_laplacian/local_laplacian_generator.cpp
@@ -227,9 +227,9 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> {
Func downsample(Func f) {
using Halide::_;
Func downx, downy;
downx(x, y, _) = (f(2 * x - 1, y, _) + 3.0f * (f(2 * x, y, _) + f(2 * x + 1, y, _)) + f(2 * x + 2, y, _)) / 8.0f;
downy(x, y, _) = (downx(x, 2 * y - 1, _) + 3.0f * (downx(x, 2 * y, _) + downx(x, 2 * y + 1, _)) + downx(x, 2 * y + 2, _)) / 8.0f;
return downy;
downy(x, y, _) = (f(x, 2 * y - 1, _) + 3.0f * (f(x, 2 * y, _) + f(x, 2 * y + 1, _)) + f(x, 2 * y + 2, _)) / 8.0f;
downx(x, y, _) = (downy(2 * x - 1, y, _) + 3.0f * (downy(2 * x, y, _) + downy(2 * x + 1, y, _)) + downy(2 * x + 2, y, _)) / 8.0f;
return downx;
}

// Upsample using bilinear interpolation
3 changes: 2 additions & 1 deletion src/AddAtomicMutex.cpp
@@ -338,7 +338,8 @@ class AddAtomicMutex : public IRMutator {
op->condition,
std::move(body),
op->new_expr,
op->free_function);
op->free_function,
op->padding);
}

Stmt visit(const ProducerConsumer *op) override {
2 changes: 1 addition & 1 deletion src/AsyncProducers.cpp
@@ -80,7 +80,7 @@ class NoOpCollapsingMutator : public IRMutator {
} else {
return Allocate::make(op->name, op->type, op->memory_type,
op->extents, op->condition, body,
op->new_expr, op->free_function);
op->new_expr, op->free_function, op->padding);
}
}

4 changes: 2 additions & 2 deletions src/BoundSmallAllocations.cpp
@@ -140,7 +140,7 @@ class BoundSmallAllocations : public IRMutator {
if (size_ptr && size == 0 && !op->new_expr.defined()) {
// This allocation is dead
return Allocate::make(op->name, op->type, op->memory_type, {0}, const_false(),
mutate(op->body), op->new_expr, op->free_function);
mutate(op->body), op->new_expr, op->free_function, op->padding);
}

// 128 bytes is a typical minimum allocation size in
@@ -155,7 +155,7 @@ class BoundSmallAllocations : public IRMutator {
user_assert(size >= 0 && size < (int64_t)1 << 31)
<< "Allocation " << op->name << " has a size greater than 2^31: " << bound << "\n";
return Allocate::make(op->name, op->type, op->memory_type, {(int32_t)size}, op->condition,
mutate(op->body), op->new_expr, op->free_function);
mutate(op->body), op->new_expr, op->free_function, op->padding);
} else {
return IRMutator::visit(op);
}
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
@@ -145,6 +145,7 @@ set(HEADER_FILES
SlidingWindow.h
Solve.h
SplitTuples.h
StageStridedLoads.h
StmtToHtml.h
StorageFlattening.h
StorageFolding.h
@@ -324,6 +325,7 @@ set(SOURCE_FILES
Solve.cpp
SpirvIR.cpp
SplitTuples.cpp
StageStridedLoads.cpp
StmtToHtml.cpp
StorageFlattening.cpp
StorageFolding.cpp
112 changes: 110 additions & 2 deletions src/CodeGen_ARM.cpp
@@ -8,6 +8,7 @@
#include "Debug.h"
#include "IREquality.h"
#include "IRMatch.h"
#include "IRMutator.h"
#include "IROperator.h"
#include "IRPrinter.h"
#include "LLVM_Headers.h"
@@ -30,6 +31,71 @@ using namespace llvm;

namespace {

// Substitute in loads that feed into slicing shuffles, to help with vld2/3/4
// emission. These are commonly lifted as lets because they get used by multiple
// interleaved slices of the same load.
class SubstituteInStridedLoads : public IRMutator {
Scope<Expr> loads;
std::map<std::string, std::vector<std::string>> vars_per_buffer;
std::set<std::string> poisoned_vars;

template<typename LetOrLetStmt>
auto visit_let(const LetOrLetStmt *op) -> decltype(op->body) {
const Load *l = op->value.template as<Load>();
const Ramp *r = l ? l->index.as<Ramp>() : nullptr;
auto body = op->body;
if (r && is_const_one(r->stride)) {
ScopedBinding bind(loads, op->name, op->value);
vars_per_buffer[l->name].push_back(op->name);
body = mutate(op->body);
vars_per_buffer[l->name].pop_back();
poisoned_vars.erase(l->name);
} else {
body = mutate(op->body);
}

// Unconditionally preserve the let, because there may be unsubstituted uses of
// it. It'll get dead-stripped by LLVM if not.
return LetOrLetStmt::make(op->name, op->value, body);
}

Expr visit(const Let *op) override {
return visit_let(op);
}

Stmt visit(const LetStmt *op) override {
return visit_let(op);
}

// Avoid substituting a load over an intervening store
Stmt visit(const Store *op) override {
auto it = vars_per_buffer.find(op->name);
if (it != vars_per_buffer.end()) {
for (const auto &v : it->second) {
poisoned_vars.insert(v);
}
}
return IRMutator::visit(op);
}

Expr visit(const Shuffle *op) override {
int stride = op->slice_stride();
const Variable *var = op->vectors[0].as<Variable>();
if (var &&
poisoned_vars.count(var->name) == 0 &&
op->vectors.size() == 1 &&
2 <= stride && stride <= 4 &&
op->slice_begin() < stride &&
loads.contains(var->name)) {
return Shuffle::make_slice({loads.get(var->name)}, op->slice_begin(), op->slice_stride(), op->type.lanes());
} else {
return IRMutator::visit(op);
}
}

using IRMutator::visit;
};
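
// (Editorial sketch, not part of the patch; the pseudo-IR below is invented
// for illustration.) The substitution this mutator performs looks roughly like:
//
//   let t = buf[ramp(base, 1, 16)]              // dense 16-lane load
//   ... slice(t, begin = 0, stride = 2, lanes = 8) ...
//   ... slice(t, begin = 1, stride = 2, lanes = 8) ...
//
// becomes
//
//   let t = buf[ramp(base, 1, 16)]              // preserved; dead if unused
//   ... slice(buf[ramp(base, 1, 16)], 0, 2, 8) ...
//   ... slice(buf[ramp(base, 1, 16)], 1, 2, 8) ...
//
// so each slicing shuffle sees the dense load directly, which is the form the
// Shuffle visitor further down matches and lowers to vld2/3/4.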

/** A code generator that emits ARM code from a given Halide stmt. */
class CodeGen_ARM : public CodeGen_Posix {
public:
@@ -42,7 +108,10 @@ class CodeGen_ARM : public CodeGen_Posix {
/** Assuming 'inner' is a function that takes two vector arguments, define a wrapper that
* takes one vector argument and splits it into two to call inner. */
llvm::Function *define_concat_args_wrapper(llvm::Function *inner, const string &name);

void init_module() override;
void compile_func(const LoweredFunc &f,
const std::string &simple_name, const std::string &extern_name) override;

/** Nodes for which we want to emit specific neon intrinsics */
// @{
@@ -52,6 +121,7 @@ class CodeGen_ARM : public CodeGen_Posix {
void visit(const Max *) override;
void visit(const Store *) override;
void visit(const Load *) override;
void visit(const Shuffle *) override;
void visit(const Call *) override;
void visit(const LT *) override;
void visit(const LE *) override;
@@ -770,6 +840,22 @@ void CodeGen_ARM::init_module() {
}
}

void CodeGen_ARM::compile_func(const LoweredFunc &f,
const string &simple_name,
const string &extern_name) {

LoweredFunc func = f;

if (target.os != Target::IOS && target.os != Target::OSX) {
// Substitute in strided loads to get vld2/3/4 emission. We don't do it
// on Apple silicon, because doing a dense load and then shuffling is
// actually faster.
func.body = SubstituteInStridedLoads().mutate(func.body);
}

CodeGen_Posix::compile_func(func, simple_name, extern_name);
}

void CodeGen_ARM::visit(const Cast *op) {
if (!neon_intrinsics_disabled() && op->type.is_vector()) {
vector<Expr> matches;
@@ -1092,9 +1178,9 @@ void CodeGen_ARM::visit(const Load *op) {
return;
}

// If the stride is in [-1, 4], we can deal with that using vanilla codegen
// If the stride is in [-1, 1], we can deal with that using vanilla codegen
const IntImm *stride = ramp ? ramp->stride.as<IntImm>() : nullptr;
if (stride && (-1 <= stride->value && stride->value <= 4)) {
if (stride && (-1 <= stride->value && stride->value <= 1)) {
CodeGen_Posix::visit(op);
return;
}
@@ -1123,6 +1209,28 @@
CodeGen_Posix::visit(op);
}

void CodeGen_ARM::visit(const Shuffle *op) {
// For small strided loads on non-Apple hardware, we may want to use vld2,
// vld3, vld4, etc. These show up in the IR as slice shuffles of wide dense
// loads. LLVM expects the same. The base codegen class breaks the loads
// into native vectors, which triggers shuffle instructions rather than
// vld2, vld3, vld4. So here we explicitly do the load as a single big dense
// load.
int stride = op->slice_stride();
const Load *load = op->vectors[0].as<Load>();
if (load &&
op->vectors.size() == 1 &&
2 <= stride && stride <= 4 &&
op->slice_begin() < stride &&
load->type.lanes() == stride * op->type.lanes() &&
target.os != Target::IOS && target.os != Target::OSX) {
value = codegen_dense_vector_load(load, nullptr, /* slice_to_native */ false);
value = shuffle_vectors(value, op->indices);
} else {
CodeGen_Posix::visit(op);
}
}

void CodeGen_ARM::visit(const Call *op) {
if (op->is_intrinsic(Call::sorted_avg)) {
value = codegen(halving_add(op->args[0], op->args[1]));
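For reference, the instruction this codegen path is chasing in the stride-2 case is NEON's structured load. A stand-alone sketch of the intent (an example written for this summary, not code from the PR):

    #include <arm_neon.h>
    #include <stdint.h>

    // Deinterleave 32 bytes of 2-channel data with a single structured load
    // instead of two stride-2 gathers.
    void deinterleave_u8x16(const uint8_t *src, uint8_t *ch0, uint8_t *ch1) {
        uint8x16x2_t v = vld2q_u8(src);  // vld2: reads 32 bytes, splitting even/odd lanes
        vst1q_u8(ch0, v.val[0]);         // even-indexed bytes
        vst1q_u8(ch1, v.val[1]);         // odd-indexed bytes
    }

On Apple chips the backend deliberately skips this path, since the comments above note that a dense load followed by shuffles is faster there.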
15 changes: 7 additions & 8 deletions src/CodeGen_C.cpp
@@ -596,8 +596,8 @@ class CppVectorOps {
}
}

template<int... Indices>
static Vec shuffle(const Vec &a) {
template<int... Indices, typename InputVec>
static Vec shuffle(const InputVec &a) {
static_assert(sizeof...(Indices) == Lanes, "shuffle() requires an exact match of lanes");
Vec r = { a[Indices]... };
return r;
@@ -1129,15 +1129,14 @@ class NativeVectorOps {
}
}

template<int... Indices>
static Vec shuffle(const Vec a) {
template<int... Indices, typename InputVec>
static Vec shuffle(const InputVec a) {
static_assert(sizeof...(Indices) == Lanes, "shuffle() requires an exact match of lanes");
#if __has_builtin(__builtin_shufflevector)
// Clang
// Exists in clang and gcc >= 12. Gcc's __builtin_shuffle can't
// be used, because it can't handle changing the number of vector
// lanes between input and output.
return __builtin_shufflevector(a, a, Indices...);
#elif __has_builtin(__builtin_shuffle) || defined(__GNUC__)
// GCC
return __builtin_shuffle(a, NativeVector<int, sizeof...(Indices)>{Indices...});
#else
Vec r = { a[Indices]... };
return r;
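The comment in the hunk above captures the constraint: gcc's __builtin_shuffle requires the result to have the same lane count as its inputs, while __builtin_shufflevector (clang, and gcc 12 or later) takes the result width from the number of indices, which is what the slice shuffles produced by this PR need. A small illustration (written for this summary, not from the PR):

    // Compiles with clang, or with gcc >= 12.
    typedef int int4 __attribute__((vector_size(16)));  // 4 x int32
    typedef int int2 __attribute__((vector_size(8)));   // 2 x int32

    int2 even_lanes(int4 v) {
        // The result has 2 lanes while the input has 4; __builtin_shuffle
        // cannot express this change in lane count.
        return __builtin_shufflevector(v, v, 0, 2);
    }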
2 changes: 1 addition & 1 deletion src/CodeGen_D3D12Compute_Dev.cpp
@@ -1059,7 +1059,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::add_kernel(Stmt s,

Stmt new_alloc = Allocate::make(new_name, op->type, op->memory_type, new_extents,
std::move(new_condition), std::move(new_body),
std::move(new_new_expr), op->free_function);
std::move(new_new_expr), op->free_function, op->padding);

allocs.push_back(new_alloc);
replacements.erase(op->name);
20 changes: 9 additions & 11 deletions src/CodeGen_Hexagon.cpp
@@ -70,12 +70,6 @@ class CodeGen_Hexagon : public CodeGen_Posix {
void visit(const Allocate *) override;
///@}

/** We ask for an extra vector on each allocation to enable fast
* clamped ramp loads. */
int allocation_padding(Type type) const override {
return CodeGen_Posix::allocation_padding(type) + native_vector_bits() / 8;
}

/** Call an LLVM intrinsic, potentially casting the operands to
* match the type of the function. */
///@{
@@ -123,7 +117,7 @@ class CodeGen_Hexagon : public CodeGen_Posix {
* list of its extents and its size. Fires a runtime assert
* (halide_error) if the size overflows 2^31 -1, the maximum
* positive number an int32_t can hold. */
llvm::Value *codegen_cache_allocation_size(const std::string &name, Type type, const std::vector<Expr> &extents);
llvm::Value *codegen_cache_allocation_size(const std::string &name, Type type, const std::vector<Expr> &extents, int padding);

/** Generate a LUT (8/16 bit, max_index < 256) lookup using vlut instructions. */
llvm::Value *vlut256(llvm::Value *lut, llvm::Value *indices, int min_index = 0, int max_index = 255);
@@ -2099,7 +2093,8 @@ void CodeGen_Hexagon::visit(const Select *op) {
}

Value *CodeGen_Hexagon::codegen_cache_allocation_size(
const std::string &name, Type type, const std::vector<Expr> &extents) {
const std::string &name, Type type,
const std::vector<Expr> &extents, int padding) {
// Compute size from list of extents checking for overflow.

Expr overflow = make_zero(UInt(32));
@@ -2131,6 +2126,9 @@ Value *CodeGen_Hexagon::codegen_cache_allocation_size(
// is still an 8-bit number.
overflow = overflow | (total_size_hi >> 24);
}
int padding_bytes = padding * type.bytes();
overflow = overflow | (total_size + padding_bytes < total_size);
total_size += padding_bytes;

Expr max_size = make_const(UInt(32), target.maximum_buffer_size());
Expr size_check = (overflow == 0) && (total_size <= max_size);
@@ -2169,7 +2167,7 @@ void CodeGen_Hexagon::visit(const Allocate *alloc) {
llvm_size = codegen(Expr(constant_bytes));
} else {
llvm_size = codegen_cache_allocation_size(alloc->name, alloc->type,
alloc->extents);
alloc->extents, alloc->padding);
}

// Only allocate memory if the condition is true, otherwise 0.
@@ -2262,13 +2260,13 @@ void CodeGen_Hexagon::visit(const Allocate *alloc) {
for (const auto &extent : alloc->extents) {
size *= extent;
}
size += allocation_padding(alloc->type);
size += alloc->padding * alloc->type.bytes();
Expr new_expr =
Call::make(Handle(), "halide_vtcm_malloc", {size}, Call::Extern);
string free_function = "halide_vtcm_free";
Stmt new_alloc = Allocate::make(
alloc->name, alloc->type, alloc->memory_type, alloc->extents,
alloc->condition, alloc->body, new_expr, free_function);
alloc->condition, alloc->body, new_expr, free_function, alloc->padding);
new_alloc.accept(this);
} else {
// For all other memory types
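Stepping back, the Hexagon and Allocate-related hunks show the other half of the PR: the extra space that backends used to request through an allocation_padding() override is now an explicit padding field on the Allocate node, threaded through every Allocate::make call and folded into the overflow check in codegen_cache_allocation_size. The padding appears to be counted in elements of the allocation type, since the Hexagon code multiplies it by type.bytes(). A hedged sketch of the extended constructor, with invented names and values, following the argument order visible in the hunks above:

    #include "Halide.h"
    using namespace Halide;
    using namespace Halide::Internal;

    // Sketch only: build an Allocate node carrying 16 elements of trailing
    // padding. Argument order as seen above: (name, type, memory_type,
    // extents, condition, body, new_expr, free_function, padding).
    Stmt make_padded_alloc() {
        Stmt body = Evaluate::make(0);  // placeholder body
        return Allocate::make("staged", UInt(8), MemoryType::Stack,
                              {Expr(128)}, const_true(), body,
                              Expr(),           // no custom new_expr
                              std::string{},    // default free function
                              /*padding=*/16);
    }

Note that commit 964c940 disallows combining padding with a custom new_expr, which is why the sketch leaves new_expr undefined.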