diff --git a/Makefile b/Makefile index 9f2e1f3cf009..5c474187e7cd 100644 --- a/Makefile +++ b/Makefile @@ -559,6 +559,7 @@ SOURCE_FILES = \ Solve.cpp \ SpirvIR.cpp \ SplitTuples.cpp \ + StageStridedLoads.cpp \ StmtToHtml.cpp \ StorageFlattening.cpp \ StorageFolding.cpp \ @@ -725,6 +726,7 @@ HEADER_FILES = \ SlidingWindow.h \ Solve.h \ SplitTuples.h \ + StageStridedLoads.h \ StmtToHtml.h \ StorageFlattening.h \ StorageFolding.h \ diff --git a/apps/local_laplacian/local_laplacian_generator.cpp b/apps/local_laplacian/local_laplacian_generator.cpp index ee6e7dc09c57..9c43105a18c4 100644 --- a/apps/local_laplacian/local_laplacian_generator.cpp +++ b/apps/local_laplacian/local_laplacian_generator.cpp @@ -227,9 +227,9 @@ class LocalLaplacian : public Halide::Generator { Func downsample(Func f) { using Halide::_; Func downx, downy; - downx(x, y, _) = (f(2 * x - 1, y, _) + 3.0f * (f(2 * x, y, _) + f(2 * x + 1, y, _)) + f(2 * x + 2, y, _)) / 8.0f; - downy(x, y, _) = (downx(x, 2 * y - 1, _) + 3.0f * (downx(x, 2 * y, _) + downx(x, 2 * y + 1, _)) + downx(x, 2 * y + 2, _)) / 8.0f; - return downy; + downy(x, y, _) = (f(x, 2 * y - 1, _) + 3.0f * (f(x, 2 * y, _) + f(x, 2 * y + 1, _)) + f(x, 2 * y + 2, _)) / 8.0f; + downx(x, y, _) = (downy(2 * x - 1, y, _) + 3.0f * (downy(2 * x, y, _) + downy(2 * x + 1, y, _)) + downy(2 * x + 2, y, _)) / 8.0f; + return downx; } // Upsample using bilinear interpolation diff --git a/src/AddAtomicMutex.cpp b/src/AddAtomicMutex.cpp index 08debac9e9e7..a2bf990e38f6 100644 --- a/src/AddAtomicMutex.cpp +++ b/src/AddAtomicMutex.cpp @@ -338,7 +338,8 @@ class AddAtomicMutex : public IRMutator { op->condition, std::move(body), op->new_expr, - op->free_function); + op->free_function, + op->padding); } Stmt visit(const ProducerConsumer *op) override { diff --git a/src/AsyncProducers.cpp b/src/AsyncProducers.cpp index 3f64e4b1ab34..00f27603d971 100644 --- a/src/AsyncProducers.cpp +++ b/src/AsyncProducers.cpp @@ -80,7 +80,7 @@ class NoOpCollapsingMutator : public IRMutator { } else { return Allocate::make(op->name, op->type, op->memory_type, op->extents, op->condition, body, - op->new_expr, op->free_function); + op->new_expr, op->free_function, op->padding); } } diff --git a/src/BoundSmallAllocations.cpp b/src/BoundSmallAllocations.cpp index ae492d00d908..f6a86f8a3e2a 100644 --- a/src/BoundSmallAllocations.cpp +++ b/src/BoundSmallAllocations.cpp @@ -140,7 +140,7 @@ class BoundSmallAllocations : public IRMutator { if (size_ptr && size == 0 && !op->new_expr.defined()) { // This allocation is dead return Allocate::make(op->name, op->type, op->memory_type, {0}, const_false(), - mutate(op->body), op->new_expr, op->free_function); + mutate(op->body), op->new_expr, op->free_function, op->padding); } // 128 bytes is a typical minimum allocation size in @@ -155,7 +155,7 @@ class BoundSmallAllocations : public IRMutator { user_assert(size >= 0 && size < (int64_t)1 << 31) << "Allocation " << op->name << " has a size greater than 2^31: " << bound << "\n"; return Allocate::make(op->name, op->type, op->memory_type, {(int32_t)size}, op->condition, - mutate(op->body), op->new_expr, op->free_function); + mutate(op->body), op->new_expr, op->free_function, op->padding); } else { return IRMutator::visit(op); } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cc9f6805ba4a..4ce6105ecc5f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -145,6 +145,7 @@ set(HEADER_FILES SlidingWindow.h Solve.h SplitTuples.h + StageStridedLoads.h StmtToHtml.h StorageFlattening.h StorageFolding.h @@ -324,6 
+325,7 @@ set(SOURCE_FILES Solve.cpp SpirvIR.cpp SplitTuples.cpp + StageStridedLoads.cpp StmtToHtml.cpp StorageFlattening.cpp StorageFolding.cpp diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 15f15d7142ec..7ffbc3b79b5e 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -8,6 +8,7 @@ #include "Debug.h" #include "IREquality.h" #include "IRMatch.h" +#include "IRMutator.h" #include "IROperator.h" #include "IRPrinter.h" #include "LLVM_Headers.h" @@ -30,6 +31,71 @@ using namespace llvm; namespace { +// Substitute in loads that feed into slicing shuffles, to help with vld2/3/4 +// emission. These are commonly lifted as lets because they get used by multiple +// interleaved slices of the same load. +class SubstituteInStridedLoads : public IRMutator { + Scope loads; + std::map> vars_per_buffer; + std::set poisoned_vars; + + template + auto visit_let(const LetOrLetStmt *op) -> decltype(op->body) { + const Load *l = op->value.template as(); + const Ramp *r = l ? l->index.as() : nullptr; + auto body = op->body; + if (r && is_const_one(r->stride)) { + ScopedBinding bind(loads, op->name, op->value); + vars_per_buffer[l->name].push_back(op->name); + body = mutate(op->body); + vars_per_buffer[l->name].pop_back(); + poisoned_vars.erase(l->name); + } else { + body = mutate(op->body); + } + + // Unconditionally preserve the let, because there may be unsubstituted uses of + // it. It'll get dead-stripped by LLVM if not. + return LetOrLetStmt::make(op->name, op->value, body); + } + + Expr visit(const Let *op) override { + return visit_let(op); + } + + Stmt visit(const LetStmt *op) override { + return visit_let(op); + } + + // Avoid substituting a load over an intervening store + Stmt visit(const Store *op) override { + auto it = vars_per_buffer.find(op->name); + if (it != vars_per_buffer.end()) { + for (const auto &v : it->second) { + poisoned_vars.insert(v); + } + } + return IRMutator::visit(op); + } + + Expr visit(const Shuffle *op) override { + int stride = op->slice_stride(); + const Variable *var = op->vectors[0].as(); + if (var && + poisoned_vars.count(var->name) == 0 && + op->vectors.size() == 1 && + 2 <= stride && stride <= 4 && + op->slice_begin() < stride && + loads.contains(var->name)) { + return Shuffle::make_slice({loads.get(var->name)}, op->slice_begin(), op->slice_stride(), op->type.lanes()); + } else { + return IRMutator::visit(op); + } + } + + using IRMutator::visit; +}; + /** A code generator that emits ARM code from a given Halide stmt. */ class CodeGen_ARM : public CodeGen_Posix { public: @@ -42,7 +108,10 @@ class CodeGen_ARM : public CodeGen_Posix { /** Assuming 'inner' is a function that takes two vector arguments, define a wrapper that * takes one vector argument and splits it into two to call inner. 
*/ llvm::Function *define_concat_args_wrapper(llvm::Function *inner, const string &name); + void init_module() override; + void compile_func(const LoweredFunc &f, + const std::string &simple_name, const std::string &extern_name) override; /** Nodes for which we want to emit specific neon intrinsics */ // @{ @@ -52,6 +121,7 @@ class CodeGen_ARM : public CodeGen_Posix { void visit(const Max *) override; void visit(const Store *) override; void visit(const Load *) override; + void visit(const Shuffle *) override; void visit(const Call *) override; void visit(const LT *) override; void visit(const LE *) override; @@ -770,6 +840,22 @@ void CodeGen_ARM::init_module() { } } +void CodeGen_ARM::compile_func(const LoweredFunc &f, + const string &simple_name, + const string &extern_name) { + + LoweredFunc func = f; + + if (target.os != Target::IOS && target.os != Target::OSX) { + // Substitute in strided loads to get vld2/3/4 emission. We don't do it + // on Apple silicon, because doing a dense load and then shuffling is + // actually faster. + func.body = SubstituteInStridedLoads().mutate(func.body); + } + + CodeGen_Posix::compile_func(func, simple_name, extern_name); +} + void CodeGen_ARM::visit(const Cast *op) { if (!neon_intrinsics_disabled() && op->type.is_vector()) { vector matches; @@ -1092,9 +1178,9 @@ void CodeGen_ARM::visit(const Load *op) { return; } - // If the stride is in [-1, 4], we can deal with that using vanilla codegen + // If the stride is in [-1, 1], we can deal with that using vanilla codegen const IntImm *stride = ramp ? ramp->stride.as() : nullptr; - if (stride && (-1 <= stride->value && stride->value <= 4)) { + if (stride && (-1 <= stride->value && stride->value <= 1)) { CodeGen_Posix::visit(op); return; } @@ -1123,6 +1209,29 @@ void CodeGen_ARM::visit(const Load *op) { CodeGen_Posix::visit(op); } +void CodeGen_ARM::visit(const Shuffle *op) { + // For small strided loads on non-Apple hardware, we may want to use vld2, + // vld3, vld4, etc. These show up in the IR as slice shuffles of wide dense + // loads. LLVM expects the same. The base codegen class breaks the loads + // into native vectors, which triggers shuffle instructions rather than + // vld2, vld3, vld4. So here we explicitly do the load as a single big dense + // load. + int stride = op->slice_stride(); + const Load *load = op->vectors[0].as(); + if (target.os != Target::IOS && target.os != Target::OSX && + load && + op->vectors.size() == 1 && + 2 <= stride && stride <= 4 && + op->slice_begin() < stride && + load->type.lanes() == stride * op->type.lanes()) { + + value = codegen_dense_vector_load(load, nullptr, /* slice_to_native */ false); + value = shuffle_vectors(value, op->indices); + } else { + CodeGen_Posix::visit(op); + } +} + void CodeGen_ARM::visit(const Call *op) { if (op->is_intrinsic(Call::sorted_avg)) { value = codegen(halving_add(op->args[0], op->args[1])); diff --git a/src/CodeGen_C.cpp b/src/CodeGen_C.cpp index c7ab5a45cf23..b7df84b80ae5 100644 --- a/src/CodeGen_C.cpp +++ b/src/CodeGen_C.cpp @@ -596,8 +596,8 @@ class CppVectorOps { } } - template - static Vec shuffle(const Vec &a) { + template + static Vec shuffle(const InputVec &a) { static_assert(sizeof...(Indices) == Lanes, "shuffle() requires an exact match of lanes"); Vec r = { a[Indices]... 
}; return r; @@ -1129,15 +1129,14 @@ class NativeVectorOps { } } - template - static Vec shuffle(const Vec a) { + template + static Vec shuffle(const InputVec a) { static_assert(sizeof...(Indices) == Lanes, "shuffle() requires an exact match of lanes"); #if __has_builtin(__builtin_shufflevector) - // Clang + // Exists in clang and gcc >= 12. Gcc's __builtin_shuffle can't + // be used, because it can't handle changing the number of vector + // lanes between input and output. return __builtin_shufflevector(a, a, Indices...); -#elif __has_builtin(__builtin_shuffle) || defined(__GNUC__) - // GCC - return __builtin_shuffle(a, NativeVector{Indices...}); #else Vec r = { a[Indices]... }; return r; diff --git a/src/CodeGen_D3D12Compute_Dev.cpp b/src/CodeGen_D3D12Compute_Dev.cpp index c84e8641770b..578b505b6932 100644 --- a/src/CodeGen_D3D12Compute_Dev.cpp +++ b/src/CodeGen_D3D12Compute_Dev.cpp @@ -1059,7 +1059,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::add_kernel(Stmt s, Stmt new_alloc = Allocate::make(new_name, op->type, op->memory_type, new_extents, std::move(new_condition), std::move(new_body), - std::move(new_new_expr), op->free_function); + std::move(new_new_expr), op->free_function, op->padding); allocs.push_back(new_alloc); replacements.erase(op->name); diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp index b37d5990b6ba..87f732dc0d54 100644 --- a/src/CodeGen_Hexagon.cpp +++ b/src/CodeGen_Hexagon.cpp @@ -70,12 +70,6 @@ class CodeGen_Hexagon : public CodeGen_Posix { void visit(const Allocate *) override; ///@} - /** We ask for an extra vector on each allocation to enable fast - * clamped ramp loads. */ - int allocation_padding(Type type) const override { - return CodeGen_Posix::allocation_padding(type) + native_vector_bits() / 8; - } - /** Call an LLVM intrinsic, potentially casting the operands to * match the type of the function. */ ///@{ @@ -123,7 +117,7 @@ class CodeGen_Hexagon : public CodeGen_Posix { * list of its extents and its size. Fires a runtime assert * (halide_error) if the size overflows 2^31 -1, the maximum * positive number an int32_t can hold. */ - llvm::Value *codegen_cache_allocation_size(const std::string &name, Type type, const std::vector &extents); + llvm::Value *codegen_cache_allocation_size(const std::string &name, Type type, const std::vector &extents, int padding); /** Generate a LUT (8/16 bit, max_index < 256) lookup using vlut instructions. */ llvm::Value *vlut256(llvm::Value *lut, llvm::Value *indices, int min_index = 0, int max_index = 255); @@ -2099,7 +2093,8 @@ void CodeGen_Hexagon::visit(const Select *op) { } Value *CodeGen_Hexagon::codegen_cache_allocation_size( - const std::string &name, Type type, const std::vector &extents) { + const std::string &name, Type type, + const std::vector &extents, int padding) { // Compute size from list of extents checking for overflow. Expr overflow = make_zero(UInt(32)); @@ -2131,6 +2126,9 @@ Value *CodeGen_Hexagon::codegen_cache_allocation_size( // is still an 8-bit number. 
overflow = overflow | (total_size_hi >> 24); } + int padding_bytes = padding * type.bytes(); + overflow = overflow | (total_size + padding_bytes < total_size); + total_size += padding_bytes; Expr max_size = make_const(UInt(32), target.maximum_buffer_size()); Expr size_check = (overflow == 0) && (total_size <= max_size); @@ -2169,7 +2167,7 @@ void CodeGen_Hexagon::visit(const Allocate *alloc) { llvm_size = codegen(Expr(constant_bytes)); } else { llvm_size = codegen_cache_allocation_size(alloc->name, alloc->type, - alloc->extents); + alloc->extents, alloc->padding); } // Only allocate memory if the condition is true, otherwise 0. @@ -2262,13 +2260,13 @@ void CodeGen_Hexagon::visit(const Allocate *alloc) { for (const auto &extent : alloc->extents) { size *= extent; } - size += allocation_padding(alloc->type); + size += alloc->padding * alloc->type.bytes(); Expr new_expr = Call::make(Handle(), "halide_vtcm_malloc", {size}, Call::Extern); string free_function = "halide_vtcm_free"; Stmt new_alloc = Allocate::make( alloc->name, alloc->type, alloc->memory_type, alloc->extents, - alloc->condition, alloc->body, new_expr, free_function); + alloc->condition, alloc->body, new_expr, free_function, alloc->padding); new_alloc.accept(this); } else { // For all other memory types diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 425859a4f7a5..5d8401620571 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2041,73 +2041,6 @@ void CodeGen_LLVM::visit(const Load *op) { llvm::Type *load_type = llvm_type_of(op->type.element_of()); if (ramp && stride && stride->value == 1) { value = codegen_dense_vector_load(op); - } else if (ramp && stride && 2 <= stride->value && stride->value <= 4) { - // Try to rewrite strided loads as shuffles of dense loads, - // aligned to the stride. This makes adjacent strided loads - // share the same underlying dense loads. - Expr base = ramp->base; - // The variable align will track the alignment of the - // base. Every time we change base, we also need to update - // align. - ModulusRemainder align = op->alignment; - - int aligned_stride = gcd(stride->value, align.modulus); - int offset = 0; - if (aligned_stride == stride->value) { - offset = mod_imp((int)align.remainder, aligned_stride); - } else { - const Add *add = base.as(); - if (const IntImm *add_c = add ? add->b.as() : base.as()) { - offset = mod_imp(add_c->value, stride->value); - } - } - - if (offset) { - base = simplify(base - offset); - align.remainder = mod_imp(align.remainder - offset, align.modulus); - } - - // We want to load a few more bytes than the original load did. - // We know this is safe for internal buffers because we allocate - // padding. - // (In ASAN mode, don't read beyond the end of internal buffers either, - // as ASAN will complain even about harmless stack overreads.) - // The min moves lower by offset. - int load_lanes = ramp->lanes * stride->value; - bool external = op->param.defined() || op->image.defined(); - if (external || target.has_feature(Target::ASAN)) { - load_lanes -= (stride->value - 1 - offset); - } - - int slice_lanes = native_vector_bits() / op->type.bits(); - - // We're going to add multiples of slice_lanes to base in - // the loop below, so reduce alignment modulo slice_lanes. - align.modulus = gcd(align.modulus, slice_lanes); - align.remainder = mod_imp(align.remainder, align.modulus); - - // We need to slice the result in to native vector lanes, otherwise - // LLVM misses optimizations like using ldN on ARM. 
- vector results; - for (int i = 0; i < op->type.lanes(); i += slice_lanes) { - int load_base_i = i * stride->value; - int load_lanes_i = std::min(slice_lanes * stride->value, load_lanes - load_base_i); - int lanes_i = std::min(slice_lanes, op->type.lanes() - i); - Expr slice_base = simplify(base + load_base_i); - - Value *load_i = codegen_vector_load(op->type.with_lanes(load_lanes_i), op->name, slice_base, - op->image, op->param, align, /*vpred=*/nullptr, - /*slice_to_native=*/false); - - std::vector constants; - for (int j = 0; j < lanes_i; j++) { - constants.push_back(j * stride->value + offset); - } - results.push_back(shuffle_vectors(load_i, constants)); - } - - // Concat the results - value = concat_vectors(results); } else if (ramp && stride && stride->value == -1) { // Load the vector and then flip it in-place Expr flipped_base = ramp->base - ramp->lanes + 1; diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index b132dac1d314..8128f3791469 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -630,6 +630,11 @@ class CodeGen_LLVM : public IRVisitor { bool use_llvm_vp_intrinsics; // @} + /** Generate a basic dense vector load, with an optional predicate and + * control over whether or not we should slice the load into native + * vectors. Used by CodeGen_ARM to help with vld2/3/4 emission. */ + llvm::Value *codegen_dense_vector_load(const Load *load, llvm::Value *vpred = nullptr, bool slice_to_native = true); + private: /** All the values in scope at the current code location during * codegen. Use sym_push and sym_pop to access. */ @@ -674,7 +679,6 @@ class CodeGen_LLVM : public IRVisitor { llvm::Value *codegen_vector_load(const Type &type, const std::string &name, const Expr &base, const Buffer<> &image, const Parameter ¶m, const ModulusRemainder &alignment, llvm::Value *vpred = nullptr, bool slice_to_native = true, llvm::Value *stride = nullptr); - llvm::Value *codegen_dense_vector_load(const Load *load, llvm::Value *vpred = nullptr, bool slice_to_native = true); virtual void codegen_predicated_load(const Load *op); virtual void codegen_predicated_store(const Store *op); diff --git a/src/CodeGen_Posix.cpp b/src/CodeGen_Posix.cpp index 3597c035befe..af508194b06e 100644 --- a/src/CodeGen_Posix.cpp +++ b/src/CodeGen_Posix.cpp @@ -74,22 +74,15 @@ Value *CodeGen_Posix::codegen_allocation_size(const std::string &name, Type type return codegen(total_size); } -int CodeGen_Posix::allocation_padding(Type type) const { - // We potentially load 3 scalar values past the end of the - // buffer, so pad the allocation with an extra instance of the - // scalar type. - return 3 * type.bytes(); -} - CodeGen_Posix::Allocation CodeGen_Posix::create_allocation(const std::string &name, Type type, MemoryType memory_type, const std::vector &extents, const Expr &condition, - const Expr &new_expr, std::string free_function) { + const Expr &new_expr, std::string free_function, int padding) { Value *llvm_size = nullptr; int64_t stack_bytes = 0; int32_t constant_bytes = Allocate::constant_allocation_size(extents, name); if (constant_bytes > 0) { constant_bytes *= type.bytes(); - stack_bytes = constant_bytes; + stack_bytes = constant_bytes + padding * type.bytes(); if (stack_bytes > target.maximum_buffer_size()) { const string str_max_size = target.has_large_buffers() ? "2^63 - 1" : "2^31 - 1"; @@ -117,8 +110,8 @@ CodeGen_Posix::Allocation CodeGen_Posix::create_allocation(const std::string &na // Add the requested padding to the allocation size. 
If the // allocation is on the stack, we can just read past the top // of the stack, so we only need this for heap allocations. - Value *padding = ConstantInt::get(llvm_size->getType(), allocation_padding(type)); - llvm_size = builder->CreateAdd(llvm_size, padding); + Value *padding_bytes = ConstantInt::get(llvm_size->getType(), padding * type.bytes()); + llvm_size = builder->CreateAdd(llvm_size, padding_bytes); llvm_size = builder->CreateSelect(llvm_condition, llvm_size, ConstantInt::get(llvm_size->getType(), 0)); @@ -364,7 +357,7 @@ void CodeGen_Posix::visit(const Allocate *alloc) { Allocation allocation = create_allocation(alloc->name, alloc->type, alloc->memory_type, alloc->extents, alloc->condition, - alloc->new_expr, alloc->free_function); + alloc->new_expr, alloc->free_function, alloc->padding); sym_push(alloc->name, allocation.ptr); codegen(alloc->body); diff --git a/src/CodeGen_Posix.h b/src/CodeGen_Posix.h index 7a0101080368..41eb923cdc3d 100644 --- a/src/CodeGen_Posix.h +++ b/src/CodeGen_Posix.h @@ -28,12 +28,6 @@ class CodeGen_Posix : public CodeGen_LLVM { void visit(const Free *) override; // @} - /** It can be convenient for backends to assume there is extra - * padding beyond the end of a buffer to enable faster - * loads/stores. This function gets the padding required by the - * implementing target. */ - virtual int allocation_padding(Type type) const; - /** A struct describing heap or stack allocations. */ struct Allocation { /** The memory */ @@ -102,8 +96,8 @@ class CodeGen_Posix : public CodeGen_LLVM { * When the allocation can be freed call 'free_allocation', and * when it goes out of scope call 'destroy_allocation'. */ Allocation create_allocation(const std::string &name, Type type, MemoryType memory_type, - const std::vector &extents, - const Expr &condition, const Expr &new_expr, std::string free_function); + const std::vector &extents, const Expr &condition, + const Expr &new_expr, std::string free_function, int padding); /** Free an allocation previously allocated with * create_allocation */ diff --git a/src/EarlyFree.cpp b/src/EarlyFree.cpp index d72815bcf200..35de3c15cbcd 100644 --- a/src/EarlyFree.cpp +++ b/src/EarlyFree.cpp @@ -164,7 +164,7 @@ class InjectEarlyFrees : public IRMutator { stmt = Allocate::make(alloc->name, alloc->type, alloc->memory_type, alloc->extents, alloc->condition, Block::make(alloc->body, Free::make(alloc->name)), - alloc->new_expr, alloc->free_function); + alloc->new_expr, alloc->free_function, alloc->padding); } return stmt; } diff --git a/src/FlattenNestedRamps.cpp b/src/FlattenNestedRamps.cpp index 803bd0b85b8f..f48bd75c37a2 100644 --- a/src/FlattenNestedRamps.cpp +++ b/src/FlattenNestedRamps.cpp @@ -118,40 +118,17 @@ class FlattenRamps : public IRMutator { } }; -/** Simplify bit concatenation of interleaved loads to vector reinterprets of - * dense loads. Must be done to both vectors and scalars after flattening nested - * ramps, because it can expand a flat ramp into a wider one. */ -class SimplifyConcatBits : public IRMutator { +/** Lower bit concatenation into vector interleaving followed by a vector + * reinterpret. */ +class LowerConcatBits : public IRMutator { using IRMutator::visit; Expr visit(const Call *op) override { if (op->is_intrinsic(Call::concat_bits)) { - // Simplify a concat of a load of adjacent bits to a reinterpret of a load of a small vector. 
- const Load *l0 = op->args[0].as(); - bool ok = true; - const int n = (int)(op->args.size()); - for (int i = 0; ok && i < n; i++) { - const Load *li = op->args[i].as(); - ok &= (li != nullptr); - if (!ok) { - break; - } - const Ramp *r = li->index.as(); - Expr base = r ? r->base : li->index; - ok &= (is_const_one(li->predicate) && - l0->name == li->name && - can_prove(l0->index + i == li->index) && - (r == nullptr || is_const(r->stride, n))); - } - - if (ok) { - internal_assert(l0); - const Ramp *r0 = l0->index.as(); - int new_lanes = (r0 ? r0->lanes : 1) * n; - Expr base = r0 ? r0->base : l0->index; - Expr idx = Ramp::make(base, 1, new_lanes); - return mutate(Reinterpret::make(op->type, Load::make(l0->type.with_lanes(n * l0->type.lanes()), l0->name, idx, l0->image, l0->param, const_true(new_lanes), l0->alignment))); - } + // Rewrite concat_bits into a shuffle followed by a vector reinterpret. + Expr shuf = simplify(Shuffle::make_interleave(op->args)); + Expr e = Reinterpret::make(op->type, shuf); + return mutate(e); } return IRMutator::visit(op); @@ -161,11 +138,11 @@ class SimplifyConcatBits : public IRMutator { } // namespace Stmt flatten_nested_ramps(const Stmt &s) { - return SimplifyConcatBits().mutate(FlattenRamps().mutate(s)); + return LowerConcatBits().mutate(FlattenRamps().mutate(s)); } Expr flatten_nested_ramps(const Expr &e) { - return SimplifyConcatBits().mutate(FlattenRamps().mutate(e)); + return LowerConcatBits().mutate(FlattenRamps().mutate(e)); } } // namespace Internal diff --git a/src/HexagonOptimize.cpp b/src/HexagonOptimize.cpp index 8c8a189f0171..88ab25e9038b 100644 --- a/src/HexagonOptimize.cpp +++ b/src/HexagonOptimize.cpp @@ -1996,6 +1996,22 @@ class OptimizeShuffles : public IRMutator { return visit_let(op); } + set allocations_to_pad; + Stmt visit(const Allocate *op) override { + Stmt s = IRMutator::visit(op); + if (allocations_to_pad.count(op->name)) { + op = s.as(); + internal_assert(op); + int padding = 128 / op->type.bytes(); // One native vector + return Allocate::make(op->name, op->type, op->memory_type, + op->extents, op->condition, + op->body, op->new_expr, op->free_function, + std::max(op->padding, padding)); + } else { + return s; + } + } + Expr visit(const Load *op) override { if (!is_const_one(op->predicate)) { // TODO(psuriana): We shouldn't mess with predicated load for now. @@ -2031,8 +2047,9 @@ class OptimizeShuffles : public IRMutator { // Load all of the possible indices loaded from the // LUT. Note that for clamped ramps, this loads up to 1 - // vector past the max. CodeGen_Hexagon::allocation_padding - // returns a native vector size to account for this. + // vector past the max, so we will add padding to the + // allocation accordingly (if we're the one that made it). 
+ allocations_to_pad.insert(op->name); Expr lut = Load::make(op->type.with_lanes(const_extent), op->name, Ramp::make(base, 1, const_extent), op->image, op->param, const_true(const_extent), alignment); diff --git a/src/IR.cpp b/src/IR.cpp index 2ef454836517..244d142cfb60 100644 --- a/src/IR.cpp +++ b/src/IR.cpp @@ -409,7 +409,7 @@ Stmt Provide::make(const std::string &name, const std::vector &values, con Stmt Allocate::make(const std::string &name, Type type, MemoryType memory_type, const std::vector &extents, Expr condition, Stmt body, - Expr new_expr, const std::string &free_function) { + Expr new_expr, const std::string &free_function, int padding) { for (const auto &extent : extents) { internal_assert(extent.defined()) << "Allocate of undefined extent\n"; internal_assert(extent.type().is_scalar() == 1) << "Allocate of vector extent\n"; @@ -417,6 +417,8 @@ Stmt Allocate::make(const std::string &name, Type type, MemoryType memory_type, internal_assert(body.defined()) << "Allocate of undefined\n"; internal_assert(condition.defined()) << "Allocate with undefined condition\n"; internal_assert(condition.type().is_bool()) << "Allocate condition is not boolean\n"; + internal_assert(!(new_expr.defined() && padding)) + << "Allocate nodes with custom new expressions may not have padding\n"; Allocate *node = new Allocate; node->name = name; @@ -426,6 +428,7 @@ Stmt Allocate::make(const std::string &name, Type type, MemoryType memory_type, node->new_expr = std::move(new_expr); node->free_function = free_function; node->condition = std::move(condition); + node->padding = padding; node->body = std::move(body); return node; } diff --git a/src/IR.h b/src/IR.h index 1e52c296208f..c187053db81d 100644 --- a/src/IR.h +++ b/src/IR.h @@ -365,6 +365,8 @@ struct Allocate : public StmtNode { Type type; MemoryType memory_type; std::vector extents; + + // A boolean condition that determines if the allocation needs to be made at all. Expr condition; // These override the code generator dependent malloc and free @@ -377,18 +379,22 @@ struct Allocate : public StmtNode { Expr new_expr; std::string free_function; + // Extra padding elements to allow for overreads. Elements in the padding + // have undetermined values, but are guaranteed safe to load. + int padding; + Stmt body; static Stmt make(const std::string &name, Type type, MemoryType memory_type, const std::vector &extents, Expr condition, Stmt body, - Expr new_expr = Expr(), const std::string &free_function = std::string()); + Expr new_expr = Expr(), const std::string &free_function = std::string(), int padding = 0); /** A routine to check if the extents are all constants, and if so verify * the total size is less than 2^31 - 1. If the result is constant, but * overflows, this routine asserts. This returns 0 if the extents are * not all constants; otherwise, it returns the total constant allocation - * size. */ + * size. Does not include any padding bytes. 
*/ static int32_t constant_allocation_size(const std::vector &extents, const std::string &name); int32_t constant_allocation_size() const; diff --git a/src/IREquality.cpp b/src/IREquality.cpp index 20cb616d2c32..b2511c778821 100644 --- a/src/IREquality.cpp +++ b/src/IREquality.cpp @@ -649,6 +649,11 @@ bool graph_equal(const Expr &a, const Expr &b) { return IRComparer(&cache).compare_expr(a, b) == IRComparer::Equal; } +bool graph_less_than(const Expr &a, const Expr &b) { + IRCompareCache cache(8); + return IRComparer(&cache).compare_expr(a, b) == IRComparer::LessThan; +} + bool equal(const Stmt &a, const Stmt &b) { return IRComparer().compare_stmt(a, b) == IRComparer::Equal; } @@ -658,6 +663,11 @@ bool graph_equal(const Stmt &a, const Stmt &b) { return IRComparer(&cache).compare_stmt(a, b) == IRComparer::Equal; } +bool graph_less_than(const Stmt &a, const Stmt &b) { + IRCompareCache cache(8); + return IRComparer(&cache).compare_stmt(a, b) == IRComparer::LessThan; +} + bool IRDeepCompare::operator()(const Expr &a, const Expr &b) const { IRComparer cmp; cmp.compare_expr(a, b); diff --git a/src/IREquality.h b/src/IREquality.h index b4d5a9a36cc5..1d59d19446e2 100644 --- a/src/IREquality.h +++ b/src/IREquality.h @@ -116,6 +116,12 @@ bool graph_equal(const Expr &a, const Expr &b); bool graph_equal(const Stmt &a, const Stmt &b); // @} +/** Order unsanitized IRNodes for use in a map key */ +// @{ +bool graph_less_than(const Expr &a, const Expr &b); +bool graph_less_than(const Stmt &a, const Stmt &b); +// @} + void ir_equality_test(); } // namespace Internal diff --git a/src/IRMutator.cpp b/src/IRMutator.cpp index 005937a17008..85b1d143568a 100644 --- a/src/IRMutator.cpp +++ b/src/IRMutator.cpp @@ -251,7 +251,7 @@ Stmt IRMutator::visit(const Allocate *op) { } return Allocate::make(op->name, op->type, op->memory_type, new_extents, std::move(condition), - std::move(body), std::move(new_expr), op->free_function); + std::move(body), std::move(new_expr), op->free_function, op->padding); } Stmt IRMutator::visit(const Free *op) { diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index 38f57e46649e..3b4aa2ea424c 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -884,10 +884,18 @@ void IRPrinter::visit(const Provide *op) { void IRPrinter::visit(const Allocate *op) { ScopedBinding<> bind(known_type, op->name); stream << get_indent() << "allocate " << op->name << "[" << op->type; + bool first = true; for (const auto &extent : op->extents) { stream << " * "; + if (first && op->padding) { + stream << "("; + first = false; + } print(extent); } + if (op->padding) { + stream << " + " << op->padding << ")"; + } stream << "]"; if (op->memory_type != MemoryType::Auto) { stream << " in " << op->memory_type; diff --git a/src/InjectHostDevBufferCopies.cpp b/src/InjectHostDevBufferCopies.cpp index d48e8b5f2601..6cdaabaeea1f 100644 --- a/src/InjectHostDevBufferCopies.cpp +++ b/src/InjectHostDevBufferCopies.cpp @@ -693,7 +693,7 @@ class InjectBufferCopies : public IRMutator { } return Allocate::make(op->name, op->type, op->memory_type, op->extents, - condition, body, op->new_expr, op->free_function); + condition, body, op->new_expr, op->free_function, op->padding); } } diff --git a/src/LICM.cpp b/src/LICM.cpp index 386a05bd1808..0f4ce93824b8 100644 --- a/src/LICM.cpp +++ b/src/LICM.cpp @@ -618,7 +618,7 @@ class HoistIfStatements : public IRMutator { is_pure(i->condition)) { Stmt s = Allocate::make(op->name, op->type, op->memory_type, op->extents, op->condition, i->then_case, - op->new_expr, op->free_function); + 
op->new_expr, op->free_function, op->padding); return IfThenElse::make(i->condition, s); } } @@ -627,7 +627,7 @@ class HoistIfStatements : public IRMutator { } else { return Allocate::make(op->name, op->type, op->memory_type, op->extents, op->condition, body, - op->new_expr, op->free_function); + op->new_expr, op->free_function, op->padding); } } diff --git a/src/Lower.cpp b/src/Lower.cpp index 38ad867686e6..6fa07736b352 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -62,6 +62,7 @@ #include "SkipStages.h" #include "SlidingWindow.h" #include "SplitTuples.h" +#include "StageStridedLoads.h" #include "StorageFlattening.h" #include "StorageFolding.h" #include "StrictifyFloat.h" @@ -336,6 +337,10 @@ void lower_impl(const vector &output_funcs, s = simplify(s); log("Lowering after partitioning loops:", s); + debug(1) << "Staging strided loads...\n"; + s = stage_strided_loads(s); + log("Lowering after staging strided loads:", s); + debug(1) << "Trimming loops to the region over which they do something...\n"; s = trim_no_ops(s); log("Lowering after loop trimming:", s); diff --git a/src/LowerWarpShuffles.cpp b/src/LowerWarpShuffles.cpp index e8cd8f39b10d..ac61cb94741e 100644 --- a/src/LowerWarpShuffles.cpp +++ b/src/LowerWarpShuffles.cpp @@ -435,7 +435,7 @@ class LowerWarpShuffles : public IRMutator { allocation_info.pop(alloc->name); body = Allocate::make(alloc->name, alloc->type, alloc->memory_type, {new_size}, alloc->condition, - body, alloc->new_expr, alloc->free_function); + body, alloc->new_expr, alloc->free_function, alloc->padding); } allocations.clear(); @@ -451,7 +451,7 @@ class LowerWarpShuffles : public IRMutator { const Allocate *alloc = s.as(); body = Allocate::make(alloc->name, alloc->type, alloc->memory_type, alloc->extents, alloc->condition, - body, alloc->new_expr, alloc->free_function); + body, alloc->new_expr, alloc->free_function, alloc->padding); } allocations.clear(); diff --git a/src/PartitionLoops.cpp b/src/PartitionLoops.cpp index 678752f248f9..6c69d03e2bad 100644 --- a/src/PartitionLoops.cpp +++ b/src/PartitionLoops.cpp @@ -877,7 +877,8 @@ class RenormalizeGPULoops : public IRMutator { return IRMutator::visit(op); } else { Stmt inner = LetStmt::make(op->name, op->value, a->body); - inner = Allocate::make(a->name, a->type, a->memory_type, a->extents, a->condition, inner); + inner = Allocate::make(a->name, a->type, a->memory_type, a->extents, a->condition, inner, + a->new_expr, a->free_function, a->padding); return mutate(inner); } } else { @@ -912,7 +913,8 @@ class RenormalizeGPULoops : public IRMutator { Stmt inner = IfThenElse::make(op->condition, allocate_a->body, allocate_b->body); inner = Allocate::make(allocate_a->name, allocate_a->type, allocate_a->memory_type, allocate_a->extents, - allocate_a->condition, inner); + allocate_a->condition, inner, allocate_a->new_expr, + allocate_a->free_function, allocate_a->padding); return mutate(inner); } else if (let_a && let_b && let_a->name == let_b->name) { string condition_name = unique_name('t'); diff --git a/src/Profiling.cpp b/src/Profiling.cpp index 28a90aeaf7a1..a2fa2984fa18 100644 --- a/src/Profiling.cpp +++ b/src/Profiling.cpp @@ -235,7 +235,8 @@ class InjectProfiling : public IRMutator { stmt = op; } else { stmt = Allocate::make(op->name, op->type, op->memory_type, - new_extents, condition, body, new_expr, op->free_function); + new_extents, condition, body, new_expr, + op->free_function, op->padding); } tasks.push_back(stmt); diff --git a/src/RemoveDeadAllocations.cpp b/src/RemoveDeadAllocations.cpp index 
101607b7c6a0..33a1a0190b07 100644 --- a/src/RemoveDeadAllocations.cpp +++ b/src/RemoveDeadAllocations.cpp @@ -62,8 +62,8 @@ class RemoveDeadAllocations : public IRMutator { } else if (body.same_as(op->body)) { return op; } else { - return Allocate::make(op->name, op->type, op->memory_type, op->extents, - op->condition, body, op->new_expr, op->free_function); + return Allocate::make(op->name, op->type, op->memory_type, op->extents, op->condition, + body, op->new_expr, op->free_function, op->padding); } } diff --git a/src/RemoveUndef.cpp b/src/RemoveUndef.cpp index a4889f6cc3b5..6a84edf84e2f 100644 --- a/src/RemoveUndef.cpp +++ b/src/RemoveUndef.cpp @@ -472,7 +472,8 @@ class RemoveUndef : public IRMutator { return op; } else { return Allocate::make(op->name, op->type, op->memory_type, - new_extents, condition, body, new_expr, op->free_function); + new_extents, condition, body, new_expr, + op->free_function, op->padding); } } diff --git a/src/Simplify_Stmts.cpp b/src/Simplify_Stmts.cpp index 36df4df455b1..b1bac9f361d3 100644 --- a/src/Simplify_Stmts.cpp +++ b/src/Simplify_Stmts.cpp @@ -391,7 +391,7 @@ Stmt Simplify::visit(const Allocate *op) { // else case must not use it. Stmt stmt = Allocate::make(op->name, op->type, op->memory_type, new_extents, condition, body_if->then_case, - new_expr, op->free_function); + new_expr, op->free_function, op->padding); return IfThenElse::make(body_if->condition, stmt, body_if->else_case); } else if (all_extents_unmodified && body.same_as(op->body) && @@ -401,7 +401,7 @@ Stmt Simplify::visit(const Allocate *op) { } else { return Allocate::make(op->name, op->type, op->memory_type, new_extents, condition, body, - new_expr, op->free_function); + new_expr, op->free_function, op->padding); } } diff --git a/src/StageStridedLoads.cpp b/src/StageStridedLoads.cpp new file mode 100644 index 000000000000..feeab56a4122 --- /dev/null +++ b/src/StageStridedLoads.cpp @@ -0,0 +1,368 @@ +#include "StageStridedLoads.h" +#include "CSE.h" +#include "IREquality.h" +#include "IRMutator.h" +#include "IROperator.h" +#include "IRVisitor.h" +#include "Scope.h" +#include "Simplify.h" +#include "Substitute.h" + +namespace Halide { +namespace Internal { + +namespace { + +class FindStridedLoads : public IRVisitor { +public: + struct Key { + // The buffer being accessed. + std::string buf; + // The base index being accessed, without any constant offset. + Expr base; + + // The stride and lanes of the vector access. + int64_t stride; + int lanes; + + // The loaded type. + Type type; + + // The Allocate node the load belongs to. nullptr for loads from external buffers. + const Allocate *allocation; + + // The Stmt over which the load definitely happens, and definitely + // refers to the same buffer as other loads with the same name. nullptr + // means global scope. 
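+    // (For instance, a load inside one branch of an IfThenElse, or inside a
+    // loop body that may run zero times, is only scoped to that branch or
+    // body, not to the enclosing code.)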
+ const IRNode *scope; + + bool operator<(const Key &other) const { + // Check fields in order of cost to compare + if (stride < other.stride) { + return true; + } else if (stride > other.stride) { + return false; + } else if (lanes < other.lanes) { + return true; + } else if (lanes > other.lanes) { + return false; + } else if (scope < other.scope) { + return true; + } else if (scope > other.scope) { + return false; + } else if (allocation < other.allocation) { + return true; + } else if (allocation > other.allocation) { + return false; + } else if (type < other.type) { + return true; + } else if (other.type < type) { + return false; + } else if (buf < other.buf) { + return true; + } else if (buf > other.buf) { + return false; + } else { + return graph_less_than(base, other.base); + } + } + }; + // Entry entry maps from an offset from the base to a vector of identical + // Load nodes with that offset. + std::map>> found_loads; + + // The current scope over which accesses definitely occur. + const IRNode *scope = nullptr; + + Scope allocation_scope; + + std::map parent_scope; + +protected: + void visit(const Load *op) override { + if (is_const_one(op->predicate)) { + // We want to give ourselves the best possible chance at recognizing + // a naked Ramp, so we simplify and substitute in lets (and take + // care to treat the index expression as a graph until the next + // CSE). + Expr idx = substitute_in_all_lets(simplify(common_subexpression_elimination(op->index))); + if (const Ramp *r = idx.as()) { + const int64_t *stride_ptr = as_const_int(r->stride); + int64_t stride = stride_ptr ? *stride_ptr : 0; + Expr base = r->base; + int64_t offset = 0; + const Add *base_add = base.as(); + const int64_t *offset_ptr = base_add ? as_const_int(base_add->b) : nullptr; + if (offset_ptr) { + base = base_add->a; + offset = *offset_ptr; + } + // TODO: We do not yet handle nested vectorization here for + // ramps which have not already collapsed. We could potentially + // handle more interesting types of shuffle than simple flat slices. + if (stride >= 2 && stride < r->lanes && r->stride.type().is_scalar()) { + const IRNode *s = scope; + const Allocate *a = nullptr; + if (allocation_scope.contains(op->name)) { + a = allocation_scope.get(op->name); + } + found_loads[Key{op->name, base, stride, r->lanes, op->type, a, s}][offset].push_back(op); + } + } + } + IRVisitor::visit(op); + } + + void visit(const For *op) override { + if (can_prove(op->extent > 0)) { + // The loop body definitely runs + IRVisitor::visit(op); + } else { + const IRNode *child_scope = op->body.get(); + parent_scope[child_scope] = scope; + ScopedValue bind(scope, child_scope); + IRVisitor::visit(op); + } + } + + void visit(const IfThenElse *op) override { + op->condition.accept(this); + { + const IRNode *child_scope = op->then_case.get(); + parent_scope[child_scope] = scope; + ScopedValue bind(scope, child_scope); + op->then_case.accept(this); + } + if (op->else_case.defined()) { + const IRNode *child_scope = op->else_case.get(); + parent_scope[child_scope] = scope; + ScopedValue bind(scope, child_scope); + op->else_case.accept(this); + } + } + + void visit(const Allocate *op) override { + // Provide a mapping from load nodes to paddable allocations they belong + // to. 
+ ScopedBinding bind(allocation_scope, op->name, op); + IRVisitor::visit(op); + } + + using IRVisitor::visit; +}; + +// Replace a bunch of load expressions in a stmt +class ReplaceStridedLoads : public IRMutator { +public: + std::map, Expr> replacements; + std::map padding; + Scope allocation_scope; + +protected: + Expr visit(const Load *op) override { + const Allocate *alloc = nullptr; + if (allocation_scope.contains(op->name)) { + alloc = allocation_scope.get(op->name); + } + auto it = replacements.find({alloc, op}); + if (it != replacements.end()) { + return mutate(it->second); + } else { + return IRMutator::visit(op); + } + } + + Stmt visit(const Allocate *op) override { + ScopedBinding bind(allocation_scope, op->name, op); + auto it = padding.find(op); + Stmt s = IRMutator::visit(op); + if (it == padding.end()) { + return s; + } else { + op = s.as(); + internal_assert(op); + return Allocate::make(op->name, op->type, op->memory_type, + op->extents, op->condition, + op->body, op->new_expr, op->free_function, + std::max(it->second, op->padding)); + } + } + + using IRMutator::visit; +}; + +} // namespace + +Stmt stage_strided_loads(const Stmt &s) { + FindStridedLoads finder; + ReplaceStridedLoads replacer; + + // Find related clusters of strided loads anywhere in the stmt. While this + // appears to look globally, it requires expressions to match exactly, so + // really it's only going to find things inside the same loops and let + // statements. + s.accept(&finder); + + for (const auto &l : finder.found_loads) { + const FindStridedLoads::Key &k = l.first; + const Allocate *alloc = k.allocation; + const std::map> &v = l.second; + + // Find clusters of strided loads that can share the same dense load. + for (auto load = v.begin(); load != v.end();) { + // If there is any other load at the same base at an offset at least + // stride-1 ahead, it's safe to do a big dense load. Note that we're + // assuming that it's always valid to load addresses between two + // valid addresses, which rules out games involving protected pages + // at the end of scanlines. + const bool can_lift = l.second.lower_bound(load->first + k.stride - 1) != l.second.end(); + + if (!can_lift) { + load++; + continue; + } + + // We have a complete cluster of loads. Make a single dense load + int lanes = k.lanes * k.stride; + int64_t first_offset = load->first; + Expr idx = Ramp::make(k.base + (int)first_offset, make_one(k.base.type()), lanes); + Type t = k.type.with_lanes(lanes); + const Load *op = load->second[0]; + Expr shared_load = Load::make(t, k.buf, idx, op->image, op->param, + const_true(lanes), op->alignment); + shared_load = common_subexpression_elimination(shared_load); + for (; load != v.end() && load->first < first_offset + k.stride; load++) { + Expr shuf = Shuffle::make_slice(shared_load, load->first - first_offset, k.stride, k.lanes); + for (const Load *l : load->second) { + replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + } + } + } + + // Do the same in reverse to pick up any loads that didn't get + // picked up in a cluster, but for whom we know it's safe to do a + // dense load before their start. 
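+        // For instance (schematically, with stride == 2 and 8 lanes): if loads
+        // exist at constant offsets 0 and 3 from the same base, the forward pass
+        // above only clusters the load at offset 0. The offset-0 load is evidence
+        // that reading from base + 2 is safe, so
+        //   buf[ramp(base + 3, 2, 8)]
+        // can instead be taken as the odd lanes of the dense load
+        //   buf[ramp(base + 2, 1, 16)]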
+ for (auto load = v.rbegin(); load != v.rend(); load++) { + if (replacer.replacements.count({alloc, load->second[0]})) { + continue; + } + int64_t delta = k.stride - 1; + const bool can_lift = l.second.upper_bound(load->first - delta) != l.second.begin(); + if (!can_lift) { + continue; + } + int lanes = k.lanes * k.stride; + int64_t first_offset = load->first - delta; + Expr idx = Ramp::make(k.base + (int)first_offset, make_one(k.base.type()), lanes); + Type t = k.type.with_lanes(lanes); + const Load *op = load->second[0]; + Expr dense_load = Load::make(t, k.buf, idx, op->image, op->param, + const_true(lanes), op->alignment - delta); + dense_load = common_subexpression_elimination(dense_load); + Expr shuf = Shuffle::make_slice(dense_load, delta, k.stride, k.lanes); + for (const Load *l : load->second) { + replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + } + } + + // Look for any loads we can densify because an overlapping load occurs + // in any parent scope. + for (auto load = v.rbegin(); load != v.rend(); load++) { + if (replacer.replacements.count({alloc, load->second[0]})) { + continue; + } + int64_t min_offset = load->first; + int64_t max_offset = load->first; + const IRNode *scope = k.scope; + while (scope) { + const IRNode *parent = finder.parent_scope[scope]; + auto parent_key = k; + parent_key.scope = parent; + auto it = finder.found_loads.find(parent_key); + if (it != finder.found_loads.end() && !it->second.empty()) { + min_offset = std::min(it->second.begin()->first, min_offset); + max_offset = std::max(it->second.rbegin()->first, max_offset); + } + scope = parent; + } + + if (max_offset - min_offset < k.stride - 1) { + continue; + } + int64_t offset = std::max(load->first - (k.stride - 1), min_offset); + int lanes = k.lanes * k.stride; + Expr idx = Ramp::make(k.base + (int)offset, make_one(k.base.type()), lanes); + Type t = k.type.with_lanes(lanes); + const Load *op = load->second[0]; + Expr dense_load = Load::make(t, k.buf, idx, op->image, op->param, + const_true(lanes), op->alignment); + dense_load = common_subexpression_elimination(dense_load); + Expr shuf = Shuffle::make_slice(dense_load, load->first - offset, k.stride, k.lanes); + for (const Load *l : load->second) { + replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + } + } + + // Densify any remaining strided loads to internal allocations by + // padding the allocation, and densify any remaining strided loads to + // external allocations by doing a dense load at a trimmed size. We rely + // on codegen to do a good job at loading vectors of a funny size. 
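+        // For instance (schematically, with stride == 2 and 8 lanes, loading from
+        // an external buffer): buf[ramp(base, 2, 8)] becomes two overlapping
+        // 8-wide dense loads, a = buf[ramp(base, 1, 8)] and
+        // b = buf[ramp(base + 7, 1, 8)]. We slice lanes 0, 2, 4, 6 out of a and
+        // lanes 1, 3, 5, 7 out of b and concatenate, so nothing is read before or
+        // past the elements the original strided load touched.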
+ for (auto load = v.begin(); load != v.end(); load++) { // NOLINT + if (replacer.replacements.count({alloc, load->second[0]})) { + continue; + } + + int lanes = k.lanes * k.stride; + + bool may_pad = k.allocation && !k.allocation->new_expr.defined(); + int delta = (int)(k.stride - 1); + + if (may_pad) { + auto p = replacer.padding.insert({k.allocation, delta}); + if (!p.second) { + p.first->second = std::max(p.first->second, delta); + } + + int64_t first_offset = load->first; + Expr idx = Ramp::make(k.base + (int)first_offset, make_one(k.base.type()), lanes); + Type t = k.type.with_lanes(lanes); + const Load *op = load->second[0]; + Expr dense_load = Load::make(t, k.buf, idx, op->image, op->param, + const_true(lanes), op->alignment); + dense_load = common_subexpression_elimination(dense_load); + Expr shuf = Shuffle::make_slice(dense_load, load->first - first_offset, k.stride, k.lanes); + for (const Load *l : load->second) { + replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + } + + } else if (k.lanes % 2 == 0) { + // Do two overlapping half-sized dense loads and mush them together. + int64_t first_offset = load->first; + int half_lanes = lanes / 2; + internal_assert(delta <= half_lanes); + Expr idx1 = Ramp::make(k.base + (int)first_offset, make_one(k.base.type()), half_lanes); + + Expr idx2 = Ramp::make(k.base + (int)first_offset + half_lanes - delta, make_one(k.base.type()), half_lanes); + Type t = k.type.with_lanes(half_lanes); + const Load *op = load->second[0]; + Expr dense_load1 = Load::make(t, k.buf, idx1, op->image, op->param, + const_true(half_lanes), op->alignment); + Expr dense_load2 = Load::make(t, k.buf, idx2, op->image, op->param, + const_true(half_lanes), op->alignment + half_lanes - delta); + dense_load1 = common_subexpression_elimination(dense_load1); + dense_load2 = common_subexpression_elimination(dense_load2); + Expr shuf1 = Shuffle::make_slice(dense_load1, 0, k.stride, k.lanes / 2); + Expr shuf2 = Shuffle::make_slice(dense_load2, delta, k.stride, k.lanes / 2); + Expr shuf = Shuffle::make_concat({shuf1, shuf2}); + for (const Load *l : load->second) { + replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + } + } + } + } + + return replacer.mutate(s); +} + +} // namespace Internal +} // namespace Halide diff --git a/src/StageStridedLoads.h b/src/StageStridedLoads.h new file mode 100644 index 000000000000..a29cef2438f1 --- /dev/null +++ b/src/StageStridedLoads.h @@ -0,0 +1,45 @@ +#ifndef HALIDE_INTERNAL_STAGE_STRIDED_LOADS_H +#define HALIDE_INTERNAL_STAGE_STRIDED_LOADS_H + +/** \file + * + * Defines the compiler pass that converts strided loads into dense loads + * followed by shuffles. + */ + +#include "Expr.h" + +namespace Halide { +namespace Internal { + +/** Convert all unpredicated strided loads in a Stmt into dense loads followed + * by shuffles. + * + * For a stride of two, the trick is to do a dense load of twice the size, and + * then extract either the even or odd lanes. This was previously done in + * codegen, where it was challenging, because it's not easy to know there if + * it's safe to do the double-sized load, as it either loads one element beyond + * or before the original load. We used the alignment of the ramp base to try to + * tell if it was safe to shift backwards, and we added padding to internal + * allocations so that for those at least it was safe to shift + * forwards. 
Unfortunately the alignment of the ramp base is usually unknown if + * you don't know anything about the strides of the input, and adding padding to + * allocations was a serious wart in our memory allocators. + * + * This pass instead actively looks for evidence elsewhere in the Stmt (at some + * location which definitely executes whenever the load being transformed + * executes) that it's safe to read further forwards or backwards in memory. The + * evidence is in the form of a load at the same base address with a different + * constant offset. It also clusters groups of these loads so that they do the + * same dense load and extract the appropriate slice of lanes. If it fails to + * find any evidence, for loads from external buffers it does two overlapping + * half-sized dense loads and shuffles out the desired lanes, and for loads from + * internal allocations it adds padding to the allocation explicitly, by setting + * the padding field on Allocate nodes. + */ +Stmt stage_strided_loads(const Stmt &s); + +} // namespace Internal +} // namespace Halide + +#endif diff --git a/src/StorageFlattening.cpp b/src/StorageFlattening.cpp index 814d5e4c6a83..2916fa91a3ae 100644 --- a/src/StorageFlattening.cpp +++ b/src/StorageFlattening.cpp @@ -446,7 +446,7 @@ class PromoteToMemoryType : public IRMutator { if (t != op->type) { return Allocate::make(op->name, t, op->memory_type, mutate(op->extents), mutate(op->condition), mutate(op->body), - mutate(op->new_expr), op->free_function); + mutate(op->new_expr), op->free_function, op->padding); } else { return IRMutator::visit(op); } diff --git a/src/UnrollLoops.cpp b/src/UnrollLoops.cpp index 332507becf6d..b0deede5b6fe 100644 --- a/src/UnrollLoops.cpp +++ b/src/UnrollLoops.cpp @@ -6,6 +6,7 @@ #include "Simplify.h" #include "SimplifyCorrelatedDifferences.h" #include "Substitute.h" +#include "UniquifyVariableNames.h" using std::pair; using std::vector; @@ -124,7 +125,9 @@ class UnrollLoops : public IRMutator { } // namespace Stmt unroll_loops(const Stmt &s) { - return UnrollLoops().mutate(s); + Stmt stmt = UnrollLoops().mutate(s); + // Unrolling duplicates variable names. Other passes assume variable names are unique. + return uniquify_variable_names(stmt); } } // namespace Internal diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp index 7dcd79d24664..91e958c9e196 100644 --- a/src/VectorizeLoops.cpp +++ b/src/VectorizeLoops.cpp @@ -1062,6 +1062,12 @@ class VectorSubs : public IRMutator { body = substitute(vv.name + ".from_zero", Variable::make(Int(32), vv.name), body); } + // Difficult to tell how the padding should grow when vectorizing an + // allocation. It's not currently an issue, because vectorization + // happens before the only source of padding (lowering strided + // loads). Add an assert to enforce it. 
+ internal_assert(op->padding == 0) << "Vectorization of padded allocations not yet implemented"; + return Allocate::make(op->name, op->type, op->memory_type, new_extents, op->condition, body, new_expr, op->free_function); } diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 0e7415a3b0c8..dca5ac9fb2b1 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -287,6 +287,7 @@ tests(GROUPS correctness split_reuse_inner_name_bug.cpp split_store_compute.cpp stack_allocations.cpp + stage_strided_loads.cpp stencil_chain_in_update_definitions.cpp stmt_to_html.cpp storage_folding.cpp diff --git a/test/correctness/simd_op_check_arm.cpp b/test/correctness/simd_op_check_arm.cpp index 75088816031a..e096bc262b1b 100644 --- a/test/correctness/simd_op_check_arm.cpp +++ b/test/correctness/simd_op_check_arm.cpp @@ -291,35 +291,61 @@ class SimdOpCheckARM : public SimdOpCheckTest { check(arm32 ? "vld1.32" : "ldr", 2 * w, in_f32(x + y)); } - // VLD2 X - Load Two-Element Structures - // These need to be vectorized at least 2 native vectors wide, - // so we get a full vectors' worth that we know is safe to - // access. - check(arm32 ? "vld2.8" : "ld2", 32 * w, in_i8(x * 2) + in_i8(x * 2 + 1)); - check(arm32 ? "vld2.8" : "ld2", 32 * w, in_u8(x * 2) + in_u8(x * 2 + 1)); - check(arm32 ? "vld2.16" : "ld2", 16 * w, in_i16(x * 2) + in_i16(x * 2 + 1)); - check(arm32 ? "vld2.16" : "ld2", 16 * w, in_u16(x * 2) + in_u16(x * 2 + 1)); - check(arm32 ? "vld2.32" : "ld2", 8 * w, in_i32(x * 2) + in_i32(x * 2 + 1)); - check(arm32 ? "vld2.32" : "ld2", 8 * w, in_u32(x * 2) + in_u32(x * 2 + 1)); - check(arm32 ? "vld2.32" : "ld2", 8 * w, in_f32(x * 2) + in_f32(x * 2 + 1)); - - // VLD3 X - Load Three-Element Structures - check(arm32 ? "vld3.8" : "ld3", 32 * w, in_i8(x * 3)); - check(arm32 ? "vld3.8" : "ld3", 32 * w, in_u8(x * 3)); - check(arm32 ? "vld3.16" : "ld3", 16 * w, in_i16(x * 3)); - check(arm32 ? "vld3.16" : "ld3", 16 * w, in_u16(x * 3)); - check(arm32 ? "vld3.32" : "ld3", 8 * w, in_i32(x * 3)); - check(arm32 ? "vld3.32" : "ld3", 8 * w, in_u32(x * 3)); - check(arm32 ? "vld3.32" : "ld3", 8 * w, in_f32(x * 3)); - - // VLD4 X - Load Four-Element Structures - check(arm32 ? "vld4.8" : "ld4", 32 * w, in_i8(x * 4)); - check(arm32 ? "vld4.8" : "ld4", 32 * w, in_u8(x * 4)); - check(arm32 ? "vld4.16" : "ld4", 16 * w, in_i16(x * 4)); - check(arm32 ? "vld4.16" : "ld4", 16 * w, in_u16(x * 4)); - check(arm32 ? "vld4.32" : "ld4", 8 * w, in_i32(x * 4)); - check(arm32 ? "vld4.32" : "ld4", 8 * w, in_u32(x * 4)); - check(arm32 ? "vld4.32" : "ld4", 8 * w, in_f32(x * 4)); + if (target.os != Target::IOS && target.os != Target::OSX) { + // VLD* are not profitable on Apple silicon + + // VLD2 X - Load Two-Element Structures + // These need to be vectorized at least 2 native vectors wide, + // so we get a full vectors' worth that we know is safe to + // access. + check(arm32 ? "vld2.8" : "ld2", 32 * w, in_i8(x * 2)); + check(arm32 ? "vld2.8" : "ld2", 32 * w, in_u8(x * 2)); + check(arm32 ? "vld2.16" : "ld2", 16 * w, in_i16(x * 2)); + check(arm32 ? "vld2.16" : "ld2", 16 * w, in_u16(x * 2)); + check(arm32 ? "vld2.32" : "ld2", 8 * w, in_i32(x * 2)); + check(arm32 ? "vld2.32" : "ld2", 8 * w, in_u32(x * 2)); + check(arm32 ? "vld2.32" : "ld2", 8 * w, in_f32(x * 2)); + + // VLD3 X - Load Three-Element Structures + check(arm32 ? "vld3.8" : "ld3", 32 * w, in_i8(x * 3)); + check(arm32 ? "vld3.8" : "ld3", 32 * w, in_u8(x * 3)); + check(arm32 ? 
"vld3.16" : "ld3", 16 * w, in_i16(x * 3)); + check(arm32 ? "vld3.16" : "ld3", 16 * w, in_u16(x * 3)); + check(arm32 ? "vld3.32" : "ld3", 8 * w, in_i32(x * 3)); + check(arm32 ? "vld3.32" : "ld3", 8 * w, in_u32(x * 3)); + check(arm32 ? "vld3.32" : "ld3", 8 * w, in_f32(x * 3)); + + // VLD4 X - Load Four-Element Structures + check(arm32 ? "vld4.8" : "ld4", 32 * w, in_i8(x * 4)); + check(arm32 ? "vld4.8" : "ld4", 32 * w, in_u8(x * 4)); + check(arm32 ? "vld4.16" : "ld4", 16 * w, in_i16(x * 4)); + check(arm32 ? "vld4.16" : "ld4", 16 * w, in_u16(x * 4)); + check(arm32 ? "vld4.32" : "ld4", 8 * w, in_i32(x * 4)); + check(arm32 ? "vld4.32" : "ld4", 8 * w, in_u32(x * 4)); + check(arm32 ? "vld4.32" : "ld4", 8 * w, in_f32(x * 4)); + } else if (!arm32) { + // On Apple Silicon we expect dense loads followed by shuffles. + check("uzp1.16b", 32 * w, in_i8(x * 2)); + check("uzp1.16b", 32 * w, in_u8(x * 2)); + check("uzp1.8h", 16 * w, in_i16(x * 2)); + check("uzp1.8h", 16 * w, in_u16(x * 2)); + check("uzp1.4s", 8 * w, in_i32(x * 2)); + check("uzp1.4s", 8 * w, in_u32(x * 2)); + check("uzp1.4s", 8 * w, in_f32(x * 2)); + + // VLD3 X - Load Three-Element Structures + check("tbl.16b", 32 * w, in_i8(x * 3)); + check("tbl.16b", 32 * w, in_u8(x * 3)); + check("tbl.16b", 16 * w, in_i16(x * 3)); + check("tbl.16b", 16 * w, in_u16(x * 3)); + // For 32-bit types llvm just scalarizes + + // VLD4 X - Load Four-Element Structures + check("tbl.16b", 32 * w, in_i8(x * 4)); + check("tbl.16b", 32 * w, in_u8(x * 4)); + check("tbl.16b", 16 * w, in_i16(x * 4)); + check("tbl.16b", 16 * w, in_u16(x * 4)); + } // VLDM X F, D Load Multiple Registers // VLDR X F, D Load Single Register diff --git a/test/correctness/stage_strided_loads.cpp b/test/correctness/stage_strided_loads.cpp new file mode 100644 index 000000000000..f791385f7c25 --- /dev/null +++ b/test/correctness/stage_strided_loads.cpp @@ -0,0 +1,264 @@ +#include "Halide.h" + +using namespace Halide; +using namespace Halide::Internal; + +class CheckForStridedLoads : public IRMutator { + using IRMutator::visit; + + Expr visit(const Load *op) override { + if (const Ramp *r = op->index.as()) { + if (op->name == buf_name) { + bool dense = is_const_one(r->stride); + found |= !dense; + dense_loads += dense; + } + } + return IRMutator::visit(op); + } + +public: + bool found = false; + int dense_loads = 0; + std::string buf_name; + + void check(Func f, int desired_dense_loads, std::string name = "buf") { + found = false; + dense_loads = 0; + buf_name = name; + f.add_custom_lowering_pass(this, nullptr); + f.compile_jit(); + assert(!found); + assert(dense_loads == desired_dense_loads); + } + + void check_not(Func f, int desired_dense_loads, std::string name = "buf") { + found = false; + dense_loads = 0; + buf_name = name; + f.add_custom_lowering_pass(this, nullptr); + f.compile_jit(); + assert(found); + assert(dense_loads == desired_dense_loads); + } +} checker; + +int main(int argc, char **argv) { + ImageParam buf(Float(32), 1, "buf"); + + { + // Clusters of strided loads can share the same underlying dense load + Func f; + Var x; + f(x) = buf(2 * x) + buf(2 * x + 1); + f.vectorize(x, 8, TailStrategy::RoundUp); + + // We expect no strided loads, and one dense load + checker.check(f, 1); + } + + { + Func f; + Var x; + f(x) = buf(2 * x) + buf(2 * x + 1) + buf(2 * x + 2); + f.vectorize(x, 8, TailStrategy::RoundUp); + + // We expect two dense loads in this case. One for the first two taps, + // and one for the last tap. 
+        checker.check(f, 2);
+    }
+
+    {
+        // Check four taps
+        Func f;
+        Var x;
+        f(x) = (buf(2 * x) + buf(2 * x + 2)) + (buf(2 * x + 1) + buf(2 * x + 3));
+        f.vectorize(x, 8, TailStrategy::RoundUp);
+
+        checker.check(f, 2);
+    }
+
+    {
+        // Check tuples
+        Func f;
+        Var x;
+        f(x) = {0.f, 0.f};
+        f(x) += {buf(2 * x), buf(2 * x + 1)};
+        f.update().vectorize(x, 8, TailStrategy::RoundUp);
+
+        // In this case, the dense load appears twice across the two store
+        // statements for the two tuple components, but it will get deduped by
+        // llvm.
+        checker.check(f, 2);
+    }
+
+    {
+        // Far-apart constant offsets are still enough evidence that it's safe to
+        // do a dense load.
+        Func f;
+        Var x;
+        f(x) = buf(2 * x - 123) + buf(2 * x + 134);
+        f.vectorize(x, 8, TailStrategy::RoundUp);
+
+        checker.check(f, 2);
+    }
+
+    {
+        // Load partners can be split across multiple Funcs in the same block
+        Func f, g;
+        Var x;
+        f(x) = buf(2 * x);
+        g(x) = f(x) + buf(2 * x + 1);
+
+        g.vectorize(x, 8, TailStrategy::RoundUp);
+        f.compute_at(g, x).vectorize(x);
+
+        checker.check(g, 2);
+    }
+
+    {
+        // Load partners can be split across update definitions
+        Func f, g;
+        Var x;
+        f(x) = buf(2 * x);
+        f(x) += buf(2 * x + 1);
+        g(x) = f(x);
+        g.vectorize(x, 8, TailStrategy::RoundUp);
+
+        checker.check(g, 2);
+    }
+
+    {
+        // Load clusters can be split across an unrolled variable
+        Func f, g;
+        Var x, c;
+        f(x, c) = buf(4 * x + c) + 4 * x;
+        f.vectorize(x, 8, TailStrategy::RoundUp).bound(c, 0, 4).unroll(c).reorder(c, x);
+
+        checker.check(f, 4);
+    }
+
+    {
+        // Load clusters can even be split across unrelated inner loop nests
+        // (provided they are known to have non-zero extent).
+
+        Func f, g, h;
+        Var c, x, y;
+        g(x, y) = buf(2 * x) + y;
+        h(x, y) = buf(2 * x + 1) + y;
+        f(x, y, c) = g(x, y) + h(x, y) + c;
+
+        Var xi, yi;
+        f.tile(x, y, xi, yi, 8, 8, TailStrategy::RoundUp).vectorize(xi).reorder(c, x, y);
+        g.compute_at(f, x).vectorize(x);
+        h.compute_at(f, x).vectorize(x);
+        checker.check(f, 2);
+    }
+
+    // We can always densify strided loads to internal allocations, because we
+    // can just pad the allocation.
+    {
+        Func f, g;
+        Var x;
+
+        f(x) = x;
+        g(x) = f(2 * x);
+        f.compute_at(g, x).vectorize(x);
+        g.vectorize(x, 8, TailStrategy::RoundUp);
+        checker.check(g, 1, f.name());
+    }
+
+    // Strides up to the vector size are worth densifying. After that, it's better to just gather.
+    {
+        Func f;
+        Var x;
+        f(x) = buf(15 * x) + buf(15 * x + 14);
+        f.vectorize(x, 16, TailStrategy::RoundUp);
+
+        checker.check(f, 1);
+    }
+
+    {
+        Func f;
+        Var x;
+        f(x) = buf(16 * x) + buf(16 * x + 15);
+        f.vectorize(x, 16, TailStrategy::RoundUp);
+
+        checker.check_not(f, 0);
+    }
+
+    // Strided loads to external allocations are handled by doing a weird-sized
+    // dense load and then shuffling.
+    {
+        Func f;
+        Var x;
+        f(x) = buf(3 * x);
+        f.vectorize(x, 8, TailStrategy::RoundUp);
+        checker.check(f, 2);
+    }
+
+    // Make a pair of unconditionally-executed loads, and check that a
+    // conditionally-executed load can use it as evidence that a dense load in
+    // one direction or the other is safe to do.
+    {
+        Func f;
+        Var x;
+        f(x) = buf(2 * x) + buf(2 * x + 1);
+        RDom r1(0, 1), r2(0, 1);
+        Param<bool> p1, p2;
+        r1.where(p1);
+        r2.where(p2);
+        f(x) += buf(2 * x + 3) + r1;
+        f(x) += buf(2 * x - 3) + r2;
+
+        Func g;
+        g(x) = f(x);
+        g.vectorize(x, 8, TailStrategy::RoundUp);
+        f.compute_at(g, x).vectorize(x);
+        f.update(0).vectorize(x);
+        f.update(1).vectorize(x);
+
+        checker.check(g, 3);
+    }
+
+    // Make a case that uses nested vectorization.
+    {
+        Func f;
+        Var x, c;
+
+        f(c, x) = buf(2 * (2 * x + c)) + buf(2 * (2 * x + c) + 1);
+        f.vectorize(x, 8, TailStrategy::RoundUp).bound(c, 0, 2).vectorize(c);
+        f.output_buffer().dim(1).set_stride(2);
+        checker.check(f, 1);
+    }
+
+    // Do a variety of weird loads at weird sizes from an external buffer to
+    // test the behaviour that does two half-sized loads.
+    {
+        Buffer<float> data(1024);
+        Buffer<float> out(512);
+        for (int i = 0; i < 1024; i++) {
+            data(i) = i;
+        }
+        buf.set(data);
+        for (int size = 2; size <= 16; size += 2) {
+            for (int stride = 2; stride <= 8; stride++) {
+                Func f;
+                Var x;
+                f(x) = buf(stride * x);
+                f.vectorize(x, size);
+
+                Buffer<float> out = f.realize({1024 / stride});
+                for (int i = 0; i < out.width(); i++) {
+                    if (out(i) != data(stride * i)) {
+                        printf("out(%d) = %f instead of %f\n", i, out(i), data(stride * i));
+                    }
+                }
+            }
+        }
+    }
+
+    printf("Success!\n");
+
+    return 0;
+}
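The "two half-sized loads" behaviour exercised by the last block of the test is easiest to see with a stride of 2. The following sketch spells out the reasoning (illustrative only; the exact offsets the pass picks are an implementation detail, and the ones in the comments are schematic):

#include "Halide.h"

using namespace Halide;

int main() {
    ImageParam buf(Float(32), 1, "buf");
    Func f("f");
    Var x("x");

    // A lone strided load from an external buffer: no second tap provides
    // evidence that reading past the touched region is safe.
    f(x) = buf(2 * x);
    f.vectorize(x, 8, TailStrategy::RoundUp);

    // A single dense 16-wide load of buf[2*x .. 2*x+15] would touch one element
    // (2*x+15) that the strided load never reads. Instead the pass emits two
    // overlapping 8-wide dense loads -- schematically buf[2*x .. 2*x+7] and
    // buf[2*x+7 .. 2*x+14] -- whose concatenation contains every element the
    // strided load needs (offsets 0, 2, ..., 14), and shuffles out those lanes.
    f.compile_jit();
    return 0;
}

Neither of the two overlapping loads reads past the last element the original strided load would have touched, which is the point of the fallback.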
diff --git a/test/performance/stack_vs_heap.cpp b/test/performance/stack_vs_heap.cpp
index d5b42c4df8d1..e5314f77897d 100644
--- a/test/performance/stack_vs_heap.cpp
+++ b/test/performance/stack_vs_heap.cpp
@@ -11,7 +11,9 @@ int main(int argc, char **argv) {
     }
 
     double times[3] = {0.f, 0.f, 0.f};
-    for (int i = 0; i < 3; i++) {
+
+    for (int sz = 1; sz < 32; sz = sz * 2 + 1) {
+        for (int c = 0; c < 3; c++) {
             MemoryType mem_type;
             bool use_bound;
@@ -31,12 +33,19 @@ int main(int argc, char **argv) {
                 use_bound = false;
             }
 
-            Func f;
             Var x, y;
-            f(x, y) = x / 18.3f + y;
+
+            std::vector<Func> fs;
+            Expr e = 0.0f;
+            for (int j = 0; j < 10; j++) {
+                Func f;
+                f(x, y) = x * j + y;
+                e += f(x, y);
+                fs.push_back(f);
+            }
 
             Func g;
-            g(x, y) = f(x, y) + f(x, y + 1);
+            g(x, y) = e;
 
             Var yo, yi;
             // Place the y loop body in its own function with its own
@@ -44,15 +53,16 @@
             // which will be 1 in practice.
             Param<int> task_size;
             g.split(y, yo, yi, task_size).parallel(yi);
-            f.compute_at(g, yi).store_in(mem_type);
-
-            if (use_bound) {
-                f.bound_extent(x, 8);
+            for (auto f : fs) {
+                f.compute_at(g, yi).store_in(mem_type);
+                if (use_bound) {
+                    f.bound_extent(x, sz);
+                }
             }
 
-            Buffer<float> out(8, 1024);
+            Buffer<float> out(sz, 1024);
             task_size.set(1);
-            double t = 1e3 * Tools::benchmark(10, 100, [&]() {
+            double t = 1e3 * Tools::benchmark(10, 1 + 100 / sz, [&]() {
                 g.realize(out);
             });
             times[c] += t;