Commit: Explicitly stage strided loads (halide#7230)
* Add a pass to do explicit densification of strided loads (see the IR-level sketch after this change list)

* densify more types of strided load

* Reorder downsample in local laplacian for slightly better performance

* Move allocation padding into the IR. Still WIP.

* Simplify concat_bits handling

* Use evidence from parent scopes to densify

* Disallow padding allocations with custom new expressions

* Add test for parent scopes

* Remove debugging prints. Avoid nested ramps.

* Avoid parent scope loops

* Update cmakefiles

* Fix for large_buffers

* Pad stack allocations too

* Restore vld2/3/4 generation on non-Apple ARM chips

* Appease clang-format and clang-tidy

* Silence clang-tidy

* Better comments

* Comment improvements

* Nuke code that reads out of bounds

* Fix stage_strided_loads test

* Change strategy for loads from external buffers

Some backends don't like non-power-of-two vectors. Do two overlapping
half-sized loads and shuffle instead of one funny-sized load (a scalar sketch
of this strategy follows the change list below).

* Add explanatory comment to ARM backend

* Fix cpp backend shuffling

* Fix missing msan annotations

* Magnify heap cost effect in stack_vs_heap performance test

* Address review comments

* clang-tidy

* Fix for when same load node occurs in two different allocate nodes
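For readers unfamiliar with the new pass, here is a hypothetical, simplified illustration (not the actual StageStridedLoads implementation) of what "densification" means at the Halide IR level: a strided vector load becomes a slice shuffle of a wider dense load over the same footprint, so backends see a single contiguous access. The buffer name "f" and the lane counts are made up for the example.

#include "Halide.h"
#include <iostream>

using namespace Halide;
using namespace Halide::Internal;

int main() {
    // A strided load: f[ramp(x, 3, 8)] -- 8 lanes, stride 3.
    Expr x = Variable::make(Int(32), "x");
    Expr strided = Load::make(Float(32, 8), "f", Ramp::make(x, 3, 8),
                              Buffer<>(), Parameter(), const_true(8), ModulusRemainder());

    // Densified form: one dense 22-lane load over the same footprint
    // ((8 - 1) * 3 + 1 = 22 elements), sliced with begin 0 and stride 3.
    Expr dense = Load::make(Float(32, 22), "f", Ramp::make(x, 1, 22),
                            Buffer<>(), Parameter(), const_true(22), ModulusRemainder());
    Expr staged = Shuffle::make_slice(dense, 0, 3, 8);

    std::cout << strided << "\n  becomes\n" << staged << "\n";
    return 0;
}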
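And a minimal scalar model (not Halide code) of the external-buffer strategy mentioned above, assuming stride 2 and 8 result lanes: the 15-element footprint is covered by two overlapping 8-wide loads instead of one awkward 15-wide load, then the strided elements are shuffled out.

#include <array>
#include <cstdio>

int main() {
    float buf[32];
    for (int i = 0; i < 32; i++) buf[i] = float(i);

    constexpr int base = 3, stride = 2, lanes = 8;

    // Two overlapping half-sized dense "loads" covering [base, base + 15).
    std::array<float, 8> lo, hi;
    for (int i = 0; i < 8; i++) lo[i] = buf[base + i];       // footprint elements 0..7
    for (int i = 0; i < 8; i++) hi[i] = buf[base + 7 + i];   // footprint elements 7..14

    // Shuffle every stride-th element out of the two halves.
    std::array<float, lanes> result;
    for (int i = 0; i < lanes; i++) {
        int idx = i * stride;
        result[i] = idx < 8 ? lo[idx] : hi[idx - 7];
    }

    for (float f : result) printf("%g ", f);  // prints: 3 5 7 9 11 13 15 17
    printf("\n");
    return 0;
}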
abadams authored and ardier committed Mar 3, 2024
1 parent 056a1c5 commit 15de537
Showing 41 changed files with 1,004 additions and 210 deletions.
2 changes: 2 additions & 0 deletions Makefile
@@ -559,6 +559,7 @@ SOURCE_FILES = \
Solve.cpp \
SpirvIR.cpp \
SplitTuples.cpp \
StageStridedLoads.cpp \
StmtToHtml.cpp \
StorageFlattening.cpp \
StorageFolding.cpp \
@@ -725,6 +726,7 @@ HEADER_FILES = \
SlidingWindow.h \
Solve.h \
SplitTuples.h \
StageStridedLoads.h \
StmtToHtml.h \
StorageFlattening.h \
StorageFolding.h \
6 changes: 3 additions & 3 deletions apps/local_laplacian/local_laplacian_generator.cpp
@@ -227,9 +227,9 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> {
Func downsample(Func f) {
using Halide::_;
Func downx, downy;
downx(x, y, _) = (f(2 * x - 1, y, _) + 3.0f * (f(2 * x, y, _) + f(2 * x + 1, y, _)) + f(2 * x + 2, y, _)) / 8.0f;
downy(x, y, _) = (downx(x, 2 * y - 1, _) + 3.0f * (downx(x, 2 * y, _) + downx(x, 2 * y + 1, _)) + downx(x, 2 * y + 2, _)) / 8.0f;
return downy;
downy(x, y, _) = (f(x, 2 * y - 1, _) + 3.0f * (f(x, 2 * y, _) + f(x, 2 * y + 1, _)) + f(x, 2 * y + 2, _)) / 8.0f;
downx(x, y, _) = (downy(2 * x - 1, y, _) + 3.0f * (downy(2 * x, y, _) + downy(2 * x + 1, y, _)) + downy(2 * x + 2, y, _)) / 8.0f;
return downx;
}

// Upsample using bilinear interpolation
3 changes: 2 additions & 1 deletion src/AddAtomicMutex.cpp
@@ -338,7 +338,8 @@ class AddAtomicMutex : public IRMutator {
op->condition,
std::move(body),
op->new_expr,
op->free_function);
op->free_function,
op->padding);
}

Stmt visit(const ProducerConsumer *op) override {
2 changes: 1 addition & 1 deletion src/AsyncProducers.cpp
@@ -80,7 +80,7 @@ class NoOpCollapsingMutator : public IRMutator {
} else {
return Allocate::make(op->name, op->type, op->memory_type,
op->extents, op->condition, body,
op->new_expr, op->free_function);
op->new_expr, op->free_function, op->padding);
}
}

4 changes: 2 additions & 2 deletions src/BoundSmallAllocations.cpp
@@ -140,7 +140,7 @@ class BoundSmallAllocations : public IRMutator {
if (size_ptr && size == 0 && !op->new_expr.defined()) {
// This allocation is dead
return Allocate::make(op->name, op->type, op->memory_type, {0}, const_false(),
mutate(op->body), op->new_expr, op->free_function);
mutate(op->body), op->new_expr, op->free_function, op->padding);
}

// 128 bytes is a typical minimum allocation size in
@@ -155,7 +155,7 @@ class BoundSmallAllocations : public IRMutator {
user_assert(size >= 0 && size < (int64_t)1 << 31)
<< "Allocation " << op->name << " has a size greater than 2^31: " << bound << "\n";
return Allocate::make(op->name, op->type, op->memory_type, {(int32_t)size}, op->condition,
mutate(op->body), op->new_expr, op->free_function);
mutate(op->body), op->new_expr, op->free_function, op->padding);
} else {
return IRMutator::visit(op);
}
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
@@ -145,6 +145,7 @@ set(HEADER_FILES
SlidingWindow.h
Solve.h
SplitTuples.h
StageStridedLoads.h
StmtToHtml.h
StorageFlattening.h
StorageFolding.h
@@ -324,6 +325,7 @@ set(SOURCE_FILES
Solve.cpp
SpirvIR.cpp
SplitTuples.cpp
StageStridedLoads.cpp
StmtToHtml.cpp
StorageFlattening.cpp
StorageFolding.cpp
113 changes: 111 additions & 2 deletions src/CodeGen_ARM.cpp
@@ -8,6 +8,7 @@
#include "Debug.h"
#include "IREquality.h"
#include "IRMatch.h"
#include "IRMutator.h"
#include "IROperator.h"
#include "IRPrinter.h"
#include "LLVM_Headers.h"
@@ -30,6 +31,71 @@ using namespace llvm;

namespace {

// Substitute in loads that feed into slicing shuffles, to help with vld2/3/4
// emission. These are commonly lifted as lets because they get used by multiple
// interleaved slices of the same load.
class SubstituteInStridedLoads : public IRMutator {
Scope<Expr> loads;
std::map<std::string, std::vector<std::string>> vars_per_buffer;
std::set<std::string> poisoned_vars;

template<typename LetOrLetStmt>
auto visit_let(const LetOrLetStmt *op) -> decltype(op->body) {
const Load *l = op->value.template as<Load>();
const Ramp *r = l ? l->index.as<Ramp>() : nullptr;
auto body = op->body;
if (r && is_const_one(r->stride)) {
ScopedBinding bind(loads, op->name, op->value);
vars_per_buffer[l->name].push_back(op->name);
body = mutate(op->body);
vars_per_buffer[l->name].pop_back();
poisoned_vars.erase(l->name);
} else {
body = mutate(op->body);
}

// Unconditionally preserve the let, because there may be unsubstituted uses of
// it. It'll get dead-stripped by LLVM if not.
return LetOrLetStmt::make(op->name, op->value, body);
}

Expr visit(const Let *op) override {
return visit_let(op);
}

Stmt visit(const LetStmt *op) override {
return visit_let(op);
}

// Avoid substituting a load over an intervening store
Stmt visit(const Store *op) override {
auto it = vars_per_buffer.find(op->name);
if (it != vars_per_buffer.end()) {
for (const auto &v : it->second) {
poisoned_vars.insert(v);
}
}
return IRMutator::visit(op);
}

Expr visit(const Shuffle *op) override {
int stride = op->slice_stride();
const Variable *var = op->vectors[0].as<Variable>();
if (var &&
poisoned_vars.count(var->name) == 0 &&
op->vectors.size() == 1 &&
2 <= stride && stride <= 4 &&
op->slice_begin() < stride &&
loads.contains(var->name)) {
return Shuffle::make_slice({loads.get(var->name)}, op->slice_begin(), op->slice_stride(), op->type.lanes());
} else {
return IRMutator::visit(op);
}
}

using IRMutator::visit;
};

/** A code generator that emits ARM code from a given Halide stmt. */
class CodeGen_ARM : public CodeGen_Posix {
public:
@@ -42,7 +108,10 @@ class CodeGen_ARM : public CodeGen_Posix {
/** Assuming 'inner' is a function that takes two vector arguments, define a wrapper that
* takes one vector argument and splits it into two to call inner. */
llvm::Function *define_concat_args_wrapper(llvm::Function *inner, const string &name);

void init_module() override;
void compile_func(const LoweredFunc &f,
const std::string &simple_name, const std::string &extern_name) override;

/** Nodes for which we want to emit specific neon intrinsics */
// @{
@@ -52,6 +121,7 @@ class CodeGen_ARM : public CodeGen_Posix {
void visit(const Max *) override;
void visit(const Store *) override;
void visit(const Load *) override;
void visit(const Shuffle *) override;
void visit(const Call *) override;
void visit(const LT *) override;
void visit(const LE *) override;
@@ -770,6 +840,22 @@ void CodeGen_ARM::init_module() {
}
}

void CodeGen_ARM::compile_func(const LoweredFunc &f,
const string &simple_name,
const string &extern_name) {

LoweredFunc func = f;

if (target.os != Target::IOS && target.os != Target::OSX) {
// Substitute in strided loads to get vld2/3/4 emission. We don't do it
// on Apple silicon, because doing a dense load and then shuffling is
// actually faster.
func.body = SubstituteInStridedLoads().mutate(func.body);
}

CodeGen_Posix::compile_func(func, simple_name, extern_name);
}

void CodeGen_ARM::visit(const Cast *op) {
if (!neon_intrinsics_disabled() && op->type.is_vector()) {
vector<Expr> matches;
@@ -1092,9 +1178,9 @@ void CodeGen_ARM::visit(const Load *op) {
return;
}

// If the stride is in [-1, 4], we can deal with that using vanilla codegen
// If the stride is in [-1, 1], we can deal with that using vanilla codegen
const IntImm *stride = ramp ? ramp->stride.as<IntImm>() : nullptr;
if (stride && (-1 <= stride->value && stride->value <= 4)) {
if (stride && (-1 <= stride->value && stride->value <= 1)) {
CodeGen_Posix::visit(op);
return;
}
@@ -1123,6 +1209,29 @@
CodeGen_Posix::visit(op);
}

void CodeGen_ARM::visit(const Shuffle *op) {
// For small strided loads on non-Apple hardware, we may want to use vld2,
// vld3, vld4, etc. These show up in the IR as slice shuffles of wide dense
// loads. LLVM expects the same. The base codegen class breaks the loads
// into native vectors, which triggers shuffle instructions rather than
// vld2, vld3, vld4. So here we explicitly do the load as a single big dense
// load.
int stride = op->slice_stride();
const Load *load = op->vectors[0].as<Load>();
if (target.os != Target::IOS && target.os != Target::OSX &&
load &&
op->vectors.size() == 1 &&
2 <= stride && stride <= 4 &&
op->slice_begin() < stride &&
load->type.lanes() == stride * op->type.lanes()) {

value = codegen_dense_vector_load(load, nullptr, /* slice_to_native */ false);
value = shuffle_vectors(value, op->indices);
} else {
CodeGen_Posix::visit(op);
}
}

void CodeGen_ARM::visit(const Call *op) {
if (op->is_intrinsic(Call::sorted_avg)) {
value = codegen(halving_add(op->args[0], op->args[1]));
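To illustrate the ARM changes above (the SubstituteInStridedLoads pre-pass plus the new Shuffle visitor), here is a hypothetical Halide pipeline whose stride-3 access should now lower to a vld3-style wide load on non-Apple AArch64. The pipeline, function names, and target string are illustrative, not taken from the commit.

#include "Halide.h"
using namespace Halide;

int main() {
    ImageParam in(UInt(8), 1);
    Var x("x");
    Func deinterleave("deinterleave");

    // Extract one channel of packed 3-channel data: a stride-3 load.
    deinterleave(x) = in(3 * x);
    deinterleave.vectorize(x, 16);

    // On non-Apple AArch64 the staged strided load stays a slice of one wide
    // dense load, which LLVM can turn into vld3 rather than shuffles of
    // native-width loads. Any non-Apple ARM target shows the effect.
    deinterleave.compile_to_assembly("deinterleave.s", {in}, "deinterleave",
                                     Target("arm-64-linux"));
    return 0;
}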
15 changes: 7 additions & 8 deletions src/CodeGen_C.cpp
@@ -596,8 +596,8 @@ class CppVectorOps {
}
}
template<int... Indices>
static Vec shuffle(const Vec &a) {
template<int... Indices, typename InputVec>
static Vec shuffle(const InputVec &a) {
static_assert(sizeof...(Indices) == Lanes, "shuffle() requires an exact match of lanes");
Vec r = { a[Indices]... };
return r;
@@ -1129,15 +1129,14 @@ class NativeVectorOps {
}
}
template<int... Indices>
static Vec shuffle(const Vec a) {
template<int... Indices, typename InputVec>
static Vec shuffle(const InputVec a) {
static_assert(sizeof...(Indices) == Lanes, "shuffle() requires an exact match of lanes");
#if __has_builtin(__builtin_shufflevector)
// Clang
// Exists in clang and gcc >= 12. Gcc's __builtin_shuffle can't
// be used, because it can't handle changing the number of vector
// lanes between input and output.
return __builtin_shufflevector(a, a, Indices...);
#elif __has_builtin(__builtin_shuffle) || defined(__GNUC__)
// GCC
return __builtin_shuffle(a, NativeVector<int, sizeof...(Indices)>{Indices...});
#else
Vec r = { a[Indices]... };
return r;
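A standalone illustration (not from the commit) of the point made in the CodeGen_C comment above: __builtin_shufflevector, available in clang and gcc >= 12, can change the number of lanes between input and output, which gcc's __builtin_shuffle cannot.

#include <cstdio>

typedef int int4 __attribute__((vector_size(16)));
typedef int int2 __attribute__((vector_size(8)));

int main() {
    int4 a = {10, 11, 12, 13};
    // Slice the even lanes: 4 input lanes in, 2 output lanes out.
    int2 evens = __builtin_shufflevector(a, a, 0, 2);
    printf("%d %d\n", evens[0], evens[1]);  // prints: 10 12
    return 0;
}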
2 changes: 1 addition & 1 deletion src/CodeGen_D3D12Compute_Dev.cpp
@@ -1059,7 +1059,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::add_kernel(Stmt s,

Stmt new_alloc = Allocate::make(new_name, op->type, op->memory_type, new_extents,
std::move(new_condition), std::move(new_body),
std::move(new_new_expr), op->free_function);
std::move(new_new_expr), op->free_function, op->padding);

allocs.push_back(new_alloc);
replacements.erase(op->name);
20 changes: 9 additions & 11 deletions src/CodeGen_Hexagon.cpp
@@ -70,12 +70,6 @@ class CodeGen_Hexagon : public CodeGen_Posix {
void visit(const Allocate *) override;
///@}

/** We ask for an extra vector on each allocation to enable fast
* clamped ramp loads. */
int allocation_padding(Type type) const override {
return CodeGen_Posix::allocation_padding(type) + native_vector_bits() / 8;
}

/** Call an LLVM intrinsic, potentially casting the operands to
* match the type of the function. */
///@{
@@ -123,7 +117,7 @@ class CodeGen_Hexagon : public CodeGen_Posix {
* list of its extents and its size. Fires a runtime assert
* (halide_error) if the size overflows 2^31 -1, the maximum
* positive number an int32_t can hold. */
llvm::Value *codegen_cache_allocation_size(const std::string &name, Type type, const std::vector<Expr> &extents);
llvm::Value *codegen_cache_allocation_size(const std::string &name, Type type, const std::vector<Expr> &extents, int padding);

/** Generate a LUT (8/16 bit, max_index < 256) lookup using vlut instructions. */
llvm::Value *vlut256(llvm::Value *lut, llvm::Value *indices, int min_index = 0, int max_index = 255);
@@ -2099,7 +2093,8 @@ void CodeGen_Hexagon::visit(const Select *op) {
}

Value *CodeGen_Hexagon::codegen_cache_allocation_size(
const std::string &name, Type type, const std::vector<Expr> &extents) {
const std::string &name, Type type,
const std::vector<Expr> &extents, int padding) {
// Compute size from list of extents checking for overflow.

Expr overflow = make_zero(UInt(32));
@@ -2131,6 +2126,9 @@ Value *CodeGen_Hexagon::codegen_cache_allocation_size(
// is still an 8-bit number.
overflow = overflow | (total_size_hi >> 24);
}
int padding_bytes = padding * type.bytes();
overflow = overflow | (total_size + padding_bytes < total_size);
total_size += padding_bytes;

Expr max_size = make_const(UInt(32), target.maximum_buffer_size());
Expr size_check = (overflow == 0) && (total_size <= max_size);
@@ -2169,7 +2167,7 @@ void CodeGen_Hexagon::visit(const Allocate *alloc) {
llvm_size = codegen(Expr(constant_bytes));
} else {
llvm_size = codegen_cache_allocation_size(alloc->name, alloc->type,
alloc->extents);
alloc->extents, alloc->padding);
}

// Only allocate memory if the condition is true, otherwise 0.
@@ -2262,13 +2260,13 @@ void CodeGen_Hexagon::visit(const Allocate *alloc) {
for (const auto &extent : alloc->extents) {
size *= extent;
}
size += allocation_padding(alloc->type);
size += alloc->padding * alloc->type.bytes();
Expr new_expr =
Call::make(Handle(), "halide_vtcm_malloc", {size}, Call::Extern);
string free_function = "halide_vtcm_free";
Stmt new_alloc = Allocate::make(
alloc->name, alloc->type, alloc->memory_type, alloc->extents,
alloc->condition, alloc->body, new_expr, free_function);
alloc->condition, alloc->body, new_expr, free_function, alloc->padding);
new_alloc.accept(this);
} else {
// For all other memory types
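The Hexagon change above folds the allocation padding (now carried on the Allocate node) into the 32-bit size computation with a wrap-around check. A self-contained illustration of that idiom, with made-up values:

#include <cstdint>
#include <cstdio>

int main() {
    uint32_t total_size = 0xFFFFFFF0u;  // pretend size computed from extents
    uint32_t padding_bytes = 0x20u;     // pretend padding * type.bytes()
    uint32_t overflow = 0;

    // If the unsigned addition wraps, the sum is smaller than either operand.
    overflow |= (total_size + padding_bytes < total_size) ? 1u : 0u;
    total_size += padding_bytes;

    printf("total_size=%u overflow=%u\n", total_size, overflow);  // total_size=16 overflow=1
    return 0;
}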
(Remaining file diffs not shown.)
