diff --git a/Makefile b/Makefile index 9f2e1f3cf009..5c474187e7cd 100644 --- a/Makefile +++ b/Makefile @@ -559,6 +559,7 @@ SOURCE_FILES = \ Solve.cpp \ SpirvIR.cpp \ SplitTuples.cpp \ + StageStridedLoads.cpp \ StmtToHtml.cpp \ StorageFlattening.cpp \ StorageFolding.cpp \ @@ -725,6 +726,7 @@ HEADER_FILES = \ SlidingWindow.h \ Solve.h \ SplitTuples.h \ + StageStridedLoads.h \ StmtToHtml.h \ StorageFlattening.h \ StorageFolding.h \ diff --git a/apps/local_laplacian/local_laplacian_generator.cpp b/apps/local_laplacian/local_laplacian_generator.cpp index ee6e7dc09c57..9c43105a18c4 100644 --- a/apps/local_laplacian/local_laplacian_generator.cpp +++ b/apps/local_laplacian/local_laplacian_generator.cpp @@ -227,9 +227,9 @@ class LocalLaplacian : public Halide::Generator { Func downsample(Func f) { using Halide::_; Func downx, downy; - downx(x, y, _) = (f(2 * x - 1, y, _) + 3.0f * (f(2 * x, y, _) + f(2 * x + 1, y, _)) + f(2 * x + 2, y, _)) / 8.0f; - downy(x, y, _) = (downx(x, 2 * y - 1, _) + 3.0f * (downx(x, 2 * y, _) + downx(x, 2 * y + 1, _)) + downx(x, 2 * y + 2, _)) / 8.0f; - return downy; + downy(x, y, _) = (f(x, 2 * y - 1, _) + 3.0f * (f(x, 2 * y, _) + f(x, 2 * y + 1, _)) + f(x, 2 * y + 2, _)) / 8.0f; + downx(x, y, _) = (downy(2 * x - 1, y, _) + 3.0f * (downy(2 * x, y, _) + downy(2 * x + 1, y, _)) + downy(2 * x + 2, y, _)) / 8.0f; + return downx; } // Upsample using bilinear interpolation diff --git a/src/AddAtomicMutex.cpp b/src/AddAtomicMutex.cpp index 08debac9e9e7..a2bf990e38f6 100644 --- a/src/AddAtomicMutex.cpp +++ b/src/AddAtomicMutex.cpp @@ -338,7 +338,8 @@ class AddAtomicMutex : public IRMutator { op->condition, std::move(body), op->new_expr, - op->free_function); + op->free_function, + op->padding); } Stmt visit(const ProducerConsumer *op) override { diff --git a/src/AsyncProducers.cpp b/src/AsyncProducers.cpp index 3f64e4b1ab34..00f27603d971 100644 --- a/src/AsyncProducers.cpp +++ b/src/AsyncProducers.cpp @@ -80,7 +80,7 @@ class NoOpCollapsingMutator : public IRMutator { } else { return Allocate::make(op->name, op->type, op->memory_type, op->extents, op->condition, body, - op->new_expr, op->free_function); + op->new_expr, op->free_function, op->padding); } } diff --git a/src/BoundSmallAllocations.cpp b/src/BoundSmallAllocations.cpp index ae492d00d908..f6a86f8a3e2a 100644 --- a/src/BoundSmallAllocations.cpp +++ b/src/BoundSmallAllocations.cpp @@ -140,7 +140,7 @@ class BoundSmallAllocations : public IRMutator { if (size_ptr && size == 0 && !op->new_expr.defined()) { // This allocation is dead return Allocate::make(op->name, op->type, op->memory_type, {0}, const_false(), - mutate(op->body), op->new_expr, op->free_function); + mutate(op->body), op->new_expr, op->free_function, op->padding); } // 128 bytes is a typical minimum allocation size in @@ -155,7 +155,7 @@ class BoundSmallAllocations : public IRMutator { user_assert(size >= 0 && size < (int64_t)1 << 31) << "Allocation " << op->name << " has a size greater than 2^31: " << bound << "\n"; return Allocate::make(op->name, op->type, op->memory_type, {(int32_t)size}, op->condition, - mutate(op->body), op->new_expr, op->free_function); + mutate(op->body), op->new_expr, op->free_function, op->padding); } else { return IRMutator::visit(op); } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cc9f6805ba4a..4ce6105ecc5f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -145,6 +145,7 @@ set(HEADER_FILES SlidingWindow.h Solve.h SplitTuples.h + StageStridedLoads.h StmtToHtml.h StorageFlattening.h StorageFolding.h @@ -324,6 
+325,7 @@ set(SOURCE_FILES Solve.cpp SpirvIR.cpp SplitTuples.cpp + StageStridedLoads.cpp StmtToHtml.cpp StorageFlattening.cpp StorageFolding.cpp diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 15f15d7142ec..7ffbc3b79b5e 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -8,6 +8,7 @@ #include "Debug.h" #include "IREquality.h" #include "IRMatch.h" +#include "IRMutator.h" #include "IROperator.h" #include "IRPrinter.h" #include "LLVM_Headers.h" @@ -30,6 +31,71 @@ using namespace llvm; namespace { +// Substitute in loads that feed into slicing shuffles, to help with vld2/3/4 +// emission. These are commonly lifted as lets because they get used by multiple +// interleaved slices of the same load. +class SubstituteInStridedLoads : public IRMutator { + Scope loads; + std::map> vars_per_buffer; + std::set poisoned_vars; + + template + auto visit_let(const LetOrLetStmt *op) -> decltype(op->body) { + const Load *l = op->value.template as(); + const Ramp *r = l ? l->index.as() : nullptr; + auto body = op->body; + if (r && is_const_one(r->stride)) { + ScopedBinding bind(loads, op->name, op->value); + vars_per_buffer[l->name].push_back(op->name); + body = mutate(op->body); + vars_per_buffer[l->name].pop_back(); + poisoned_vars.erase(l->name); + } else { + body = mutate(op->body); + } + + // Unconditionally preserve the let, because there may be unsubstituted uses of + // it. It'll get dead-stripped by LLVM if not. + return LetOrLetStmt::make(op->name, op->value, body); + } + + Expr visit(const Let *op) override { + return visit_let(op); + } + + Stmt visit(const LetStmt *op) override { + return visit_let(op); + } + + // Avoid substituting a load over an intervening store + Stmt visit(const Store *op) override { + auto it = vars_per_buffer.find(op->name); + if (it != vars_per_buffer.end()) { + for (const auto &v : it->second) { + poisoned_vars.insert(v); + } + } + return IRMutator::visit(op); + } + + Expr visit(const Shuffle *op) override { + int stride = op->slice_stride(); + const Variable *var = op->vectors[0].as(); + if (var && + poisoned_vars.count(var->name) == 0 && + op->vectors.size() == 1 && + 2 <= stride && stride <= 4 && + op->slice_begin() < stride && + loads.contains(var->name)) { + return Shuffle::make_slice({loads.get(var->name)}, op->slice_begin(), op->slice_stride(), op->type.lanes()); + } else { + return IRMutator::visit(op); + } + } + + using IRMutator::visit; +}; + /** A code generator that emits ARM code from a given Halide stmt. */ class CodeGen_ARM : public CodeGen_Posix { public: @@ -42,7 +108,10 @@ class CodeGen_ARM : public CodeGen_Posix { /** Assuming 'inner' is a function that takes two vector arguments, define a wrapper that * takes one vector argument and splits it into two to call inner. 
*/ llvm::Function *define_concat_args_wrapper(llvm::Function *inner, const string &name); + void init_module() override; + void compile_func(const LoweredFunc &f, + const std::string &simple_name, const std::string &extern_name) override; /** Nodes for which we want to emit specific neon intrinsics */ // @{ @@ -52,6 +121,7 @@ class CodeGen_ARM : public CodeGen_Posix { void visit(const Max *) override; void visit(const Store *) override; void visit(const Load *) override; + void visit(const Shuffle *) override; void visit(const Call *) override; void visit(const LT *) override; void visit(const LE *) override; @@ -770,6 +840,22 @@ void CodeGen_ARM::init_module() { } } +void CodeGen_ARM::compile_func(const LoweredFunc &f, + const string &simple_name, + const string &extern_name) { + + LoweredFunc func = f; + + if (target.os != Target::IOS && target.os != Target::OSX) { + // Substitute in strided loads to get vld2/3/4 emission. We don't do it + // on Apple silicon, because doing a dense load and then shuffling is + // actually faster. + func.body = SubstituteInStridedLoads().mutate(func.body); + } + + CodeGen_Posix::compile_func(func, simple_name, extern_name); +} + void CodeGen_ARM::visit(const Cast *op) { if (!neon_intrinsics_disabled() && op->type.is_vector()) { vector matches; @@ -1092,9 +1178,9 @@ void CodeGen_ARM::visit(const Load *op) { return; } - // If the stride is in [-1, 4], we can deal with that using vanilla codegen + // If the stride is in [-1, 1], we can deal with that using vanilla codegen const IntImm *stride = ramp ? ramp->stride.as() : nullptr; - if (stride && (-1 <= stride->value && stride->value <= 4)) { + if (stride && (-1 <= stride->value && stride->value <= 1)) { CodeGen_Posix::visit(op); return; } @@ -1123,6 +1209,29 @@ void CodeGen_ARM::visit(const Load *op) { CodeGen_Posix::visit(op); } +void CodeGen_ARM::visit(const Shuffle *op) { + // For small strided loads on non-Apple hardware, we may want to use vld2, + // vld3, vld4, etc. These show up in the IR as slice shuffles of wide dense + // loads. LLVM expects the same. The base codegen class breaks the loads + // into native vectors, which triggers shuffle instructions rather than + // vld2, vld3, vld4. So here we explicitly do the load as a single big dense + // load. + int stride = op->slice_stride(); + const Load *load = op->vectors[0].as(); + if (target.os != Target::IOS && target.os != Target::OSX && + load && + op->vectors.size() == 1 && + 2 <= stride && stride <= 4 && + op->slice_begin() < stride && + load->type.lanes() == stride * op->type.lanes()) { + + value = codegen_dense_vector_load(load, nullptr, /* slice_to_native */ false); + value = shuffle_vectors(value, op->indices); + } else { + CodeGen_Posix::visit(op); + } +} + void CodeGen_ARM::visit(const Call *op) { if (op->is_intrinsic(Call::sorted_avg)) { value = codegen(halving_add(op->args[0], op->args[1])); diff --git a/src/CodeGen_C.cpp b/src/CodeGen_C.cpp index c7ab5a45cf23..b7df84b80ae5 100644 --- a/src/CodeGen_C.cpp +++ b/src/CodeGen_C.cpp @@ -596,8 +596,8 @@ class CppVectorOps { } } - template - static Vec shuffle(const Vec &a) { + template + static Vec shuffle(const InputVec &a) { static_assert(sizeof...(Indices) == Lanes, "shuffle() requires an exact match of lanes"); Vec r = { a[Indices]... 
}; return r; @@ -1129,15 +1129,14 @@ class NativeVectorOps { } } - template - static Vec shuffle(const Vec a) { + template + static Vec shuffle(const InputVec a) { static_assert(sizeof...(Indices) == Lanes, "shuffle() requires an exact match of lanes"); #if __has_builtin(__builtin_shufflevector) - // Clang + // Exists in clang and gcc >= 12. Gcc's __builtin_shuffle can't + // be used, because it can't handle changing the number of vector + // lanes between input and output. return __builtin_shufflevector(a, a, Indices...); -#elif __has_builtin(__builtin_shuffle) || defined(__GNUC__) - // GCC - return __builtin_shuffle(a, NativeVector{Indices...}); #else Vec r = { a[Indices]... }; return r; diff --git a/src/CodeGen_D3D12Compute_Dev.cpp b/src/CodeGen_D3D12Compute_Dev.cpp index c84e8641770b..578b505b6932 100644 --- a/src/CodeGen_D3D12Compute_Dev.cpp +++ b/src/CodeGen_D3D12Compute_Dev.cpp @@ -1059,7 +1059,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::add_kernel(Stmt s, Stmt new_alloc = Allocate::make(new_name, op->type, op->memory_type, new_extents, std::move(new_condition), std::move(new_body), - std::move(new_new_expr), op->free_function); + std::move(new_new_expr), op->free_function, op->padding); allocs.push_back(new_alloc); replacements.erase(op->name); diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp index b37d5990b6ba..87f732dc0d54 100644 --- a/src/CodeGen_Hexagon.cpp +++ b/src/CodeGen_Hexagon.cpp @@ -70,12 +70,6 @@ class CodeGen_Hexagon : public CodeGen_Posix { void visit(const Allocate *) override; ///@} - /** We ask for an extra vector on each allocation to enable fast - * clamped ramp loads. */ - int allocation_padding(Type type) const override { - return CodeGen_Posix::allocation_padding(type) + native_vector_bits() / 8; - } - /** Call an LLVM intrinsic, potentially casting the operands to * match the type of the function. */ ///@{ @@ -123,7 +117,7 @@ class CodeGen_Hexagon : public CodeGen_Posix { * list of its extents and its size. Fires a runtime assert * (halide_error) if the size overflows 2^31 -1, the maximum * positive number an int32_t can hold. */ - llvm::Value *codegen_cache_allocation_size(const std::string &name, Type type, const std::vector &extents); + llvm::Value *codegen_cache_allocation_size(const std::string &name, Type type, const std::vector &extents, int padding); /** Generate a LUT (8/16 bit, max_index < 256) lookup using vlut instructions. */ llvm::Value *vlut256(llvm::Value *lut, llvm::Value *indices, int min_index = 0, int max_index = 255); @@ -2099,7 +2093,8 @@ void CodeGen_Hexagon::visit(const Select *op) { } Value *CodeGen_Hexagon::codegen_cache_allocation_size( - const std::string &name, Type type, const std::vector &extents) { + const std::string &name, Type type, + const std::vector &extents, int padding) { // Compute size from list of extents checking for overflow. Expr overflow = make_zero(UInt(32)); @@ -2131,6 +2126,9 @@ Value *CodeGen_Hexagon::codegen_cache_allocation_size( // is still an 8-bit number. 
overflow = overflow | (total_size_hi >> 24); } + int padding_bytes = padding * type.bytes(); + overflow = overflow | (total_size + padding_bytes < total_size); + total_size += padding_bytes; Expr max_size = make_const(UInt(32), target.maximum_buffer_size()); Expr size_check = (overflow == 0) && (total_size <= max_size); @@ -2169,7 +2167,7 @@ void CodeGen_Hexagon::visit(const Allocate *alloc) { llvm_size = codegen(Expr(constant_bytes)); } else { llvm_size = codegen_cache_allocation_size(alloc->name, alloc->type, - alloc->extents); + alloc->extents, alloc->padding); } // Only allocate memory if the condition is true, otherwise 0. @@ -2262,13 +2260,13 @@ void CodeGen_Hexagon::visit(const Allocate *alloc) { for (const auto &extent : alloc->extents) { size *= extent; } - size += allocation_padding(alloc->type); + size += alloc->padding * alloc->type.bytes(); Expr new_expr = Call::make(Handle(), "halide_vtcm_malloc", {size}, Call::Extern); string free_function = "halide_vtcm_free"; Stmt new_alloc = Allocate::make( alloc->name, alloc->type, alloc->memory_type, alloc->extents, - alloc->condition, alloc->body, new_expr, free_function); + alloc->condition, alloc->body, new_expr, free_function, alloc->padding); new_alloc.accept(this); } else { // For all other memory types diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 425859a4f7a5..5d8401620571 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2041,73 +2041,6 @@ void CodeGen_LLVM::visit(const Load *op) { llvm::Type *load_type = llvm_type_of(op->type.element_of()); if (ramp && stride && stride->value == 1) { value = codegen_dense_vector_load(op); - } else if (ramp && stride && 2 <= stride->value && stride->value <= 4) { - // Try to rewrite strided loads as shuffles of dense loads, - // aligned to the stride. This makes adjacent strided loads - // share the same underlying dense loads. - Expr base = ramp->base; - // The variable align will track the alignment of the - // base. Every time we change base, we also need to update - // align. - ModulusRemainder align = op->alignment; - - int aligned_stride = gcd(stride->value, align.modulus); - int offset = 0; - if (aligned_stride == stride->value) { - offset = mod_imp((int)align.remainder, aligned_stride); - } else { - const Add *add = base.as(); - if (const IntImm *add_c = add ? add->b.as() : base.as()) { - offset = mod_imp(add_c->value, stride->value); - } - } - - if (offset) { - base = simplify(base - offset); - align.remainder = mod_imp(align.remainder - offset, align.modulus); - } - - // We want to load a few more bytes than the original load did. - // We know this is safe for internal buffers because we allocate - // padding. - // (In ASAN mode, don't read beyond the end of internal buffers either, - // as ASAN will complain even about harmless stack overreads.) - // The min moves lower by offset. - int load_lanes = ramp->lanes * stride->value; - bool external = op->param.defined() || op->image.defined(); - if (external || target.has_feature(Target::ASAN)) { - load_lanes -= (stride->value - 1 - offset); - } - - int slice_lanes = native_vector_bits() / op->type.bits(); - - // We're going to add multiples of slice_lanes to base in - // the loop below, so reduce alignment modulo slice_lanes. - align.modulus = gcd(align.modulus, slice_lanes); - align.remainder = mod_imp(align.remainder, align.modulus); - - // We need to slice the result in to native vector lanes, otherwise - // LLVM misses optimizations like using ldN on ARM. 
- vector results; - for (int i = 0; i < op->type.lanes(); i += slice_lanes) { - int load_base_i = i * stride->value; - int load_lanes_i = std::min(slice_lanes * stride->value, load_lanes - load_base_i); - int lanes_i = std::min(slice_lanes, op->type.lanes() - i); - Expr slice_base = simplify(base + load_base_i); - - Value *load_i = codegen_vector_load(op->type.with_lanes(load_lanes_i), op->name, slice_base, - op->image, op->param, align, /*vpred=*/nullptr, - /*slice_to_native=*/false); - - std::vector constants; - for (int j = 0; j < lanes_i; j++) { - constants.push_back(j * stride->value + offset); - } - results.push_back(shuffle_vectors(load_i, constants)); - } - - // Concat the results - value = concat_vectors(results); } else if (ramp && stride && stride->value == -1) { // Load the vector and then flip it in-place Expr flipped_base = ramp->base - ramp->lanes + 1; diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index b132dac1d314..8128f3791469 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -630,6 +630,11 @@ class CodeGen_LLVM : public IRVisitor { bool use_llvm_vp_intrinsics; // @} + /** Generate a basic dense vector load, with an optional predicate and + * control over whether or not we should slice the load into native + * vectors. Used by CodeGen_ARM to help with vld2/3/4 emission. */ + llvm::Value *codegen_dense_vector_load(const Load *load, llvm::Value *vpred = nullptr, bool slice_to_native = true); + private: /** All the values in scope at the current code location during * codegen. Use sym_push and sym_pop to access. */ @@ -674,7 +679,6 @@ class CodeGen_LLVM : public IRVisitor { llvm::Value *codegen_vector_load(const Type &type, const std::string &name, const Expr &base, const Buffer<> &image, const Parameter ¶m, const ModulusRemainder &alignment, llvm::Value *vpred = nullptr, bool slice_to_native = true, llvm::Value *stride = nullptr); - llvm::Value *codegen_dense_vector_load(const Load *load, llvm::Value *vpred = nullptr, bool slice_to_native = true); virtual void codegen_predicated_load(const Load *op); virtual void codegen_predicated_store(const Store *op); diff --git a/src/CodeGen_Posix.cpp b/src/CodeGen_Posix.cpp index 3597c035befe..af508194b06e 100644 --- a/src/CodeGen_Posix.cpp +++ b/src/CodeGen_Posix.cpp @@ -74,22 +74,15 @@ Value *CodeGen_Posix::codegen_allocation_size(const std::string &name, Type type return codegen(total_size); } -int CodeGen_Posix::allocation_padding(Type type) const { - // We potentially load 3 scalar values past the end of the - // buffer, so pad the allocation with an extra instance of the - // scalar type. - return 3 * type.bytes(); -} - CodeGen_Posix::Allocation CodeGen_Posix::create_allocation(const std::string &name, Type type, MemoryType memory_type, const std::vector &extents, const Expr &condition, - const Expr &new_expr, std::string free_function) { + const Expr &new_expr, std::string free_function, int padding) { Value *llvm_size = nullptr; int64_t stack_bytes = 0; int32_t constant_bytes = Allocate::constant_allocation_size(extents, name); if (constant_bytes > 0) { constant_bytes *= type.bytes(); - stack_bytes = constant_bytes; + stack_bytes = constant_bytes + padding * type.bytes(); if (stack_bytes > target.maximum_buffer_size()) { const string str_max_size = target.has_large_buffers() ? "2^63 - 1" : "2^31 - 1"; @@ -117,8 +110,8 @@ CodeGen_Posix::Allocation CodeGen_Posix::create_allocation(const std::string &na // Add the requested padding to the allocation size. 
If the // allocation is on the stack, we can just read past the top // of the stack, so we only need this for heap allocations. - Value *padding = ConstantInt::get(llvm_size->getType(), allocation_padding(type)); - llvm_size = builder->CreateAdd(llvm_size, padding); + Value *padding_bytes = ConstantInt::get(llvm_size->getType(), padding * type.bytes()); + llvm_size = builder->CreateAdd(llvm_size, padding_bytes); llvm_size = builder->CreateSelect(llvm_condition, llvm_size, ConstantInt::get(llvm_size->getType(), 0)); @@ -364,7 +357,7 @@ void CodeGen_Posix::visit(const Allocate *alloc) { Allocation allocation = create_allocation(alloc->name, alloc->type, alloc->memory_type, alloc->extents, alloc->condition, - alloc->new_expr, alloc->free_function); + alloc->new_expr, alloc->free_function, alloc->padding); sym_push(alloc->name, allocation.ptr); codegen(alloc->body); diff --git a/src/CodeGen_Posix.h b/src/CodeGen_Posix.h index 7a0101080368..41eb923cdc3d 100644 --- a/src/CodeGen_Posix.h +++ b/src/CodeGen_Posix.h @@ -28,12 +28,6 @@ class CodeGen_Posix : public CodeGen_LLVM { void visit(const Free *) override; // @} - /** It can be convenient for backends to assume there is extra - * padding beyond the end of a buffer to enable faster - * loads/stores. This function gets the padding required by the - * implementing target. */ - virtual int allocation_padding(Type type) const; - /** A struct describing heap or stack allocations. */ struct Allocation { /** The memory */ @@ -102,8 +96,8 @@ class CodeGen_Posix : public CodeGen_LLVM { * When the allocation can be freed call 'free_allocation', and * when it goes out of scope call 'destroy_allocation'. */ Allocation create_allocation(const std::string &name, Type type, MemoryType memory_type, - const std::vector &extents, - const Expr &condition, const Expr &new_expr, std::string free_function); + const std::vector &extents, const Expr &condition, + const Expr &new_expr, std::string free_function, int padding); /** Free an allocation previously allocated with * create_allocation */ diff --git a/src/EarlyFree.cpp b/src/EarlyFree.cpp index d72815bcf200..35de3c15cbcd 100644 --- a/src/EarlyFree.cpp +++ b/src/EarlyFree.cpp @@ -164,7 +164,7 @@ class InjectEarlyFrees : public IRMutator { stmt = Allocate::make(alloc->name, alloc->type, alloc->memory_type, alloc->extents, alloc->condition, Block::make(alloc->body, Free::make(alloc->name)), - alloc->new_expr, alloc->free_function); + alloc->new_expr, alloc->free_function, alloc->padding); } return stmt; } diff --git a/src/FlattenNestedRamps.cpp b/src/FlattenNestedRamps.cpp index 803bd0b85b8f..f48bd75c37a2 100644 --- a/src/FlattenNestedRamps.cpp +++ b/src/FlattenNestedRamps.cpp @@ -118,40 +118,17 @@ class FlattenRamps : public IRMutator { } }; -/** Simplify bit concatenation of interleaved loads to vector reinterprets of - * dense loads. Must be done to both vectors and scalars after flattening nested - * ramps, because it can expand a flat ramp into a wider one. */ -class SimplifyConcatBits : public IRMutator { +/** Lower bit concatenation into vector interleaving followed by a vector + * reinterpret. */ +class LowerConcatBits : public IRMutator { using IRMutator::visit; Expr visit(const Call *op) override { if (op->is_intrinsic(Call::concat_bits)) { - // Simplify a concat of a load of adjacent bits to a reinterpret of a load of a small vector. 
- const Load *l0 = op->args[0].as(); - bool ok = true; - const int n = (int)(op->args.size()); - for (int i = 0; ok && i < n; i++) { - const Load *li = op->args[i].as(); - ok &= (li != nullptr); - if (!ok) { - break; - } - const Ramp *r = li->index.as(); - Expr base = r ? r->base : li->index; - ok &= (is_const_one(li->predicate) && - l0->name == li->name && - can_prove(l0->index + i == li->index) && - (r == nullptr || is_const(r->stride, n))); - } - - if (ok) { - internal_assert(l0); - const Ramp *r0 = l0->index.as(); - int new_lanes = (r0 ? r0->lanes : 1) * n; - Expr base = r0 ? r0->base : l0->index; - Expr idx = Ramp::make(base, 1, new_lanes); - return mutate(Reinterpret::make(op->type, Load::make(l0->type.with_lanes(n * l0->type.lanes()), l0->name, idx, l0->image, l0->param, const_true(new_lanes), l0->alignment))); - } + // Rewrite concat_bits into a shuffle followed by a vector reinterpret. + Expr shuf = simplify(Shuffle::make_interleave(op->args)); + Expr e = Reinterpret::make(op->type, shuf); + return mutate(e); } return IRMutator::visit(op); @@ -161,11 +138,11 @@ class SimplifyConcatBits : public IRMutator { } // namespace Stmt flatten_nested_ramps(const Stmt &s) { - return SimplifyConcatBits().mutate(FlattenRamps().mutate(s)); + return LowerConcatBits().mutate(FlattenRamps().mutate(s)); } Expr flatten_nested_ramps(const Expr &e) { - return SimplifyConcatBits().mutate(FlattenRamps().mutate(e)); + return LowerConcatBits().mutate(FlattenRamps().mutate(e)); } } // namespace Internal diff --git a/src/HexagonOptimize.cpp b/src/HexagonOptimize.cpp index 8c8a189f0171..88ab25e9038b 100644 --- a/src/HexagonOptimize.cpp +++ b/src/HexagonOptimize.cpp @@ -1996,6 +1996,22 @@ class OptimizeShuffles : public IRMutator { return visit_let(op); } + set allocations_to_pad; + Stmt visit(const Allocate *op) override { + Stmt s = IRMutator::visit(op); + if (allocations_to_pad.count(op->name)) { + op = s.as(); + internal_assert(op); + int padding = 128 / op->type.bytes(); // One native vector + return Allocate::make(op->name, op->type, op->memory_type, + op->extents, op->condition, + op->body, op->new_expr, op->free_function, + std::max(op->padding, padding)); + } else { + return s; + } + } + Expr visit(const Load *op) override { if (!is_const_one(op->predicate)) { // TODO(psuriana): We shouldn't mess with predicated load for now. @@ -2031,8 +2047,9 @@ class OptimizeShuffles : public IRMutator { // Load all of the possible indices loaded from the // LUT. Note that for clamped ramps, this loads up to 1 - // vector past the max. CodeGen_Hexagon::allocation_padding - // returns a native vector size to account for this. + // vector past the max, so we will add padding to the + // allocation accordingly (if we're the one that made it). 
+ allocations_to_pad.insert(op->name); Expr lut = Load::make(op->type.with_lanes(const_extent), op->name, Ramp::make(base, 1, const_extent), op->image, op->param, const_true(const_extent), alignment); diff --git a/src/IR.cpp b/src/IR.cpp index 2ef454836517..244d142cfb60 100644 --- a/src/IR.cpp +++ b/src/IR.cpp @@ -409,7 +409,7 @@ Stmt Provide::make(const std::string &name, const std::vector &values, con Stmt Allocate::make(const std::string &name, Type type, MemoryType memory_type, const std::vector &extents, Expr condition, Stmt body, - Expr new_expr, const std::string &free_function) { + Expr new_expr, const std::string &free_function, int padding) { for (const auto &extent : extents) { internal_assert(extent.defined()) << "Allocate of undefined extent\n"; internal_assert(extent.type().is_scalar() == 1) << "Allocate of vector extent\n"; @@ -417,6 +417,8 @@ Stmt Allocate::make(const std::string &name, Type type, MemoryType memory_type, internal_assert(body.defined()) << "Allocate of undefined\n"; internal_assert(condition.defined()) << "Allocate with undefined condition\n"; internal_assert(condition.type().is_bool()) << "Allocate condition is not boolean\n"; + internal_assert(!(new_expr.defined() && padding)) + << "Allocate nodes with custom new expressions may not have padding\n"; Allocate *node = new Allocate; node->name = name; @@ -426,6 +428,7 @@ Stmt Allocate::make(const std::string &name, Type type, MemoryType memory_type, node->new_expr = std::move(new_expr); node->free_function = free_function; node->condition = std::move(condition); + node->padding = padding; node->body = std::move(body); return node; } diff --git a/src/IR.h b/src/IR.h index 1e52c296208f..c187053db81d 100644 --- a/src/IR.h +++ b/src/IR.h @@ -365,6 +365,8 @@ struct Allocate : public StmtNode { Type type; MemoryType memory_type; std::vector extents; + + // A boolean condition that determines if the allocation needs to be made at all. Expr condition; // These override the code generator dependent malloc and free @@ -377,18 +379,22 @@ struct Allocate : public StmtNode { Expr new_expr; std::string free_function; + // Extra padding elements to allow for overreads. Elements in the padding + // have undetermined values, but are guaranteed safe to load. + int padding; + Stmt body; static Stmt make(const std::string &name, Type type, MemoryType memory_type, const std::vector &extents, Expr condition, Stmt body, - Expr new_expr = Expr(), const std::string &free_function = std::string()); + Expr new_expr = Expr(), const std::string &free_function = std::string(), int padding = 0); /** A routine to check if the extents are all constants, and if so verify * the total size is less than 2^31 - 1. If the result is constant, but * overflows, this routine asserts. This returns 0 if the extents are * not all constants; otherwise, it returns the total constant allocation - * size. */ + * size. Does not include any padding bytes. 
*/ static int32_t constant_allocation_size(const std::vector &extents, const std::string &name); int32_t constant_allocation_size() const; diff --git a/src/IREquality.cpp b/src/IREquality.cpp index 20cb616d2c32..b2511c778821 100644 --- a/src/IREquality.cpp +++ b/src/IREquality.cpp @@ -649,6 +649,11 @@ bool graph_equal(const Expr &a, const Expr &b) { return IRComparer(&cache).compare_expr(a, b) == IRComparer::Equal; } +bool graph_less_than(const Expr &a, const Expr &b) { + IRCompareCache cache(8); + return IRComparer(&cache).compare_expr(a, b) == IRComparer::LessThan; +} + bool equal(const Stmt &a, const Stmt &b) { return IRComparer().compare_stmt(a, b) == IRComparer::Equal; } @@ -658,6 +663,11 @@ bool graph_equal(const Stmt &a, const Stmt &b) { return IRComparer(&cache).compare_stmt(a, b) == IRComparer::Equal; } +bool graph_less_than(const Stmt &a, const Stmt &b) { + IRCompareCache cache(8); + return IRComparer(&cache).compare_stmt(a, b) == IRComparer::LessThan; +} + bool IRDeepCompare::operator()(const Expr &a, const Expr &b) const { IRComparer cmp; cmp.compare_expr(a, b); diff --git a/src/IREquality.h b/src/IREquality.h index b4d5a9a36cc5..1d59d19446e2 100644 --- a/src/IREquality.h +++ b/src/IREquality.h @@ -116,6 +116,12 @@ bool graph_equal(const Expr &a, const Expr &b); bool graph_equal(const Stmt &a, const Stmt &b); // @} +/** Order unsanitized IRNodes for use in a map key */ +// @{ +bool graph_less_than(const Expr &a, const Expr &b); +bool graph_less_than(const Stmt &a, const Stmt &b); +// @} + void ir_equality_test(); } // namespace Internal diff --git a/src/IRMutator.cpp b/src/IRMutator.cpp index 005937a17008..85b1d143568a 100644 --- a/src/IRMutator.cpp +++ b/src/IRMutator.cpp @@ -251,7 +251,7 @@ Stmt IRMutator::visit(const Allocate *op) { } return Allocate::make(op->name, op->type, op->memory_type, new_extents, std::move(condition), - std::move(body), std::move(new_expr), op->free_function); + std::move(body), std::move(new_expr), op->free_function, op->padding); } Stmt IRMutator::visit(const Free *op) { diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index 38f57e46649e..3b4aa2ea424c 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -884,10 +884,18 @@ void IRPrinter::visit(const Provide *op) { void IRPrinter::visit(const Allocate *op) { ScopedBinding<> bind(known_type, op->name); stream << get_indent() << "allocate " << op->name << "[" << op->type; + bool first = true; for (const auto &extent : op->extents) { stream << " * "; + if (first && op->padding) { + stream << "("; + first = false; + } print(extent); } + if (op->padding) { + stream << " + " << op->padding << ")"; + } stream << "]"; if (op->memory_type != MemoryType::Auto) { stream << " in " << op->memory_type; diff --git a/src/InjectHostDevBufferCopies.cpp b/src/InjectHostDevBufferCopies.cpp index d48e8b5f2601..6cdaabaeea1f 100644 --- a/src/InjectHostDevBufferCopies.cpp +++ b/src/InjectHostDevBufferCopies.cpp @@ -693,7 +693,7 @@ class InjectBufferCopies : public IRMutator { } return Allocate::make(op->name, op->type, op->memory_type, op->extents, - condition, body, op->new_expr, op->free_function); + condition, body, op->new_expr, op->free_function, op->padding); } } diff --git a/src/LICM.cpp b/src/LICM.cpp index 386a05bd1808..0f4ce93824b8 100644 --- a/src/LICM.cpp +++ b/src/LICM.cpp @@ -618,7 +618,7 @@ class HoistIfStatements : public IRMutator { is_pure(i->condition)) { Stmt s = Allocate::make(op->name, op->type, op->memory_type, op->extents, op->condition, i->then_case, - op->new_expr, op->free_function); + 
op->new_expr, op->free_function, op->padding); return IfThenElse::make(i->condition, s); } } @@ -627,7 +627,7 @@ class HoistIfStatements : public IRMutator { } else { return Allocate::make(op->name, op->type, op->memory_type, op->extents, op->condition, body, - op->new_expr, op->free_function); + op->new_expr, op->free_function, op->padding); } } diff --git a/src/Lower.cpp b/src/Lower.cpp index 38ad867686e6..6fa07736b352 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -62,6 +62,7 @@ #include "SkipStages.h" #include "SlidingWindow.h" #include "SplitTuples.h" +#include "StageStridedLoads.h" #include "StorageFlattening.h" #include "StorageFolding.h" #include "StrictifyFloat.h" @@ -336,6 +337,10 @@ void lower_impl(const vector &output_funcs, s = simplify(s); log("Lowering after partitioning loops:", s); + debug(1) << "Staging strided loads...\n"; + s = stage_strided_loads(s); + log("Lowering after staging strided loads:", s); + debug(1) << "Trimming loops to the region over which they do something...\n"; s = trim_no_ops(s); log("Lowering after loop trimming:", s); diff --git a/src/LowerWarpShuffles.cpp b/src/LowerWarpShuffles.cpp index e8cd8f39b10d..ac61cb94741e 100644 --- a/src/LowerWarpShuffles.cpp +++ b/src/LowerWarpShuffles.cpp @@ -435,7 +435,7 @@ class LowerWarpShuffles : public IRMutator { allocation_info.pop(alloc->name); body = Allocate::make(alloc->name, alloc->type, alloc->memory_type, {new_size}, alloc->condition, - body, alloc->new_expr, alloc->free_function); + body, alloc->new_expr, alloc->free_function, alloc->padding); } allocations.clear(); @@ -451,7 +451,7 @@ class LowerWarpShuffles : public IRMutator { const Allocate *alloc = s.as(); body = Allocate::make(alloc->name, alloc->type, alloc->memory_type, alloc->extents, alloc->condition, - body, alloc->new_expr, alloc->free_function); + body, alloc->new_expr, alloc->free_function, alloc->padding); } allocations.clear(); diff --git a/src/PartitionLoops.cpp b/src/PartitionLoops.cpp index 678752f248f9..6c69d03e2bad 100644 --- a/src/PartitionLoops.cpp +++ b/src/PartitionLoops.cpp @@ -877,7 +877,8 @@ class RenormalizeGPULoops : public IRMutator { return IRMutator::visit(op); } else { Stmt inner = LetStmt::make(op->name, op->value, a->body); - inner = Allocate::make(a->name, a->type, a->memory_type, a->extents, a->condition, inner); + inner = Allocate::make(a->name, a->type, a->memory_type, a->extents, a->condition, inner, + a->new_expr, a->free_function, a->padding); return mutate(inner); } } else { @@ -912,7 +913,8 @@ class RenormalizeGPULoops : public IRMutator { Stmt inner = IfThenElse::make(op->condition, allocate_a->body, allocate_b->body); inner = Allocate::make(allocate_a->name, allocate_a->type, allocate_a->memory_type, allocate_a->extents, - allocate_a->condition, inner); + allocate_a->condition, inner, allocate_a->new_expr, + allocate_a->free_function, allocate_a->padding); return mutate(inner); } else if (let_a && let_b && let_a->name == let_b->name) { string condition_name = unique_name('t'); diff --git a/src/Profiling.cpp b/src/Profiling.cpp index 28a90aeaf7a1..a2fa2984fa18 100644 --- a/src/Profiling.cpp +++ b/src/Profiling.cpp @@ -235,7 +235,8 @@ class InjectProfiling : public IRMutator { stmt = op; } else { stmt = Allocate::make(op->name, op->type, op->memory_type, - new_extents, condition, body, new_expr, op->free_function); + new_extents, condition, body, new_expr, + op->free_function, op->padding); } tasks.push_back(stmt); diff --git a/src/RemoveDeadAllocations.cpp b/src/RemoveDeadAllocations.cpp index 
101607b7c6a0..33a1a0190b07 100644 --- a/src/RemoveDeadAllocations.cpp +++ b/src/RemoveDeadAllocations.cpp @@ -62,8 +62,8 @@ class RemoveDeadAllocations : public IRMutator { } else if (body.same_as(op->body)) { return op; } else { - return Allocate::make(op->name, op->type, op->memory_type, op->extents, - op->condition, body, op->new_expr, op->free_function); + return Allocate::make(op->name, op->type, op->memory_type, op->extents, op->condition, + body, op->new_expr, op->free_function, op->padding); } } diff --git a/src/RemoveUndef.cpp b/src/RemoveUndef.cpp index a4889f6cc3b5..6a84edf84e2f 100644 --- a/src/RemoveUndef.cpp +++ b/src/RemoveUndef.cpp @@ -472,7 +472,8 @@ class RemoveUndef : public IRMutator { return op; } else { return Allocate::make(op->name, op->type, op->memory_type, - new_extents, condition, body, new_expr, op->free_function); + new_extents, condition, body, new_expr, + op->free_function, op->padding); } } diff --git a/src/Simplify_Stmts.cpp b/src/Simplify_Stmts.cpp index 36df4df455b1..b1bac9f361d3 100644 --- a/src/Simplify_Stmts.cpp +++ b/src/Simplify_Stmts.cpp @@ -391,7 +391,7 @@ Stmt Simplify::visit(const Allocate *op) { // else case must not use it. Stmt stmt = Allocate::make(op->name, op->type, op->memory_type, new_extents, condition, body_if->then_case, - new_expr, op->free_function); + new_expr, op->free_function, op->padding); return IfThenElse::make(body_if->condition, stmt, body_if->else_case); } else if (all_extents_unmodified && body.same_as(op->body) && @@ -401,7 +401,7 @@ Stmt Simplify::visit(const Allocate *op) { } else { return Allocate::make(op->name, op->type, op->memory_type, new_extents, condition, body, - new_expr, op->free_function); + new_expr, op->free_function, op->padding); } } diff --git a/src/StageStridedLoads.cpp b/src/StageStridedLoads.cpp new file mode 100644 index 000000000000..feeab56a4122 --- /dev/null +++ b/src/StageStridedLoads.cpp @@ -0,0 +1,368 @@ +#include "StageStridedLoads.h" +#include "CSE.h" +#include "IREquality.h" +#include "IRMutator.h" +#include "IROperator.h" +#include "IRVisitor.h" +#include "Scope.h" +#include "Simplify.h" +#include "Substitute.h" + +namespace Halide { +namespace Internal { + +namespace { + +class FindStridedLoads : public IRVisitor { +public: + struct Key { + // The buffer being accessed. + std::string buf; + // The base index being accessed, without any constant offset. + Expr base; + + // The stride and lanes of the vector access. + int64_t stride; + int lanes; + + // The loaded type. + Type type; + + // The Allocate node the load belongs to. nullptr for loads from external buffers. + const Allocate *allocation; + + // The Stmt over which the load definitely happens, and definitely + // refers to the same buffer as other loads with the same name. nullptr + // means global scope. 
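+    // (For instance, a load inside one branch of an IfThenElse, or inside a
+    // loop body that may run zero times, is only scoped to that branch or
+    // body, not to the enclosing code.)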
+ const IRNode *scope; + + bool operator<(const Key &other) const { + // Check fields in order of cost to compare + if (stride < other.stride) { + return true; + } else if (stride > other.stride) { + return false; + } else if (lanes < other.lanes) { + return true; + } else if (lanes > other.lanes) { + return false; + } else if (scope < other.scope) { + return true; + } else if (scope > other.scope) { + return false; + } else if (allocation < other.allocation) { + return true; + } else if (allocation > other.allocation) { + return false; + } else if (type < other.type) { + return true; + } else if (other.type < type) { + return false; + } else if (buf < other.buf) { + return true; + } else if (buf > other.buf) { + return false; + } else { + return graph_less_than(base, other.base); + } + } + }; + // Entry entry maps from an offset from the base to a vector of identical + // Load nodes with that offset. + std::map>> found_loads; + + // The current scope over which accesses definitely occur. + const IRNode *scope = nullptr; + + Scope allocation_scope; + + std::map parent_scope; + +protected: + void visit(const Load *op) override { + if (is_const_one(op->predicate)) { + // We want to give ourselves the best possible chance at recognizing + // a naked Ramp, so we simplify and substitute in lets (and take + // care to treat the index expression as a graph until the next + // CSE). + Expr idx = substitute_in_all_lets(simplify(common_subexpression_elimination(op->index))); + if (const Ramp *r = idx.as()) { + const int64_t *stride_ptr = as_const_int(r->stride); + int64_t stride = stride_ptr ? *stride_ptr : 0; + Expr base = r->base; + int64_t offset = 0; + const Add *base_add = base.as(); + const int64_t *offset_ptr = base_add ? as_const_int(base_add->b) : nullptr; + if (offset_ptr) { + base = base_add->a; + offset = *offset_ptr; + } + // TODO: We do not yet handle nested vectorization here for + // ramps which have not already collapsed. We could potentially + // handle more interesting types of shuffle than simple flat slices. + if (stride >= 2 && stride < r->lanes && r->stride.type().is_scalar()) { + const IRNode *s = scope; + const Allocate *a = nullptr; + if (allocation_scope.contains(op->name)) { + a = allocation_scope.get(op->name); + } + found_loads[Key{op->name, base, stride, r->lanes, op->type, a, s}][offset].push_back(op); + } + } + } + IRVisitor::visit(op); + } + + void visit(const For *op) override { + if (can_prove(op->extent > 0)) { + // The loop body definitely runs + IRVisitor::visit(op); + } else { + const IRNode *child_scope = op->body.get(); + parent_scope[child_scope] = scope; + ScopedValue bind(scope, child_scope); + IRVisitor::visit(op); + } + } + + void visit(const IfThenElse *op) override { + op->condition.accept(this); + { + const IRNode *child_scope = op->then_case.get(); + parent_scope[child_scope] = scope; + ScopedValue bind(scope, child_scope); + op->then_case.accept(this); + } + if (op->else_case.defined()) { + const IRNode *child_scope = op->else_case.get(); + parent_scope[child_scope] = scope; + ScopedValue bind(scope, child_scope); + op->else_case.accept(this); + } + } + + void visit(const Allocate *op) override { + // Provide a mapping from load nodes to paddable allocations they belong + // to. 
+ ScopedBinding bind(allocation_scope, op->name, op); + IRVisitor::visit(op); + } + + using IRVisitor::visit; +}; + +// Replace a bunch of load expressions in a stmt +class ReplaceStridedLoads : public IRMutator { +public: + std::map, Expr> replacements; + std::map padding; + Scope allocation_scope; + +protected: + Expr visit(const Load *op) override { + const Allocate *alloc = nullptr; + if (allocation_scope.contains(op->name)) { + alloc = allocation_scope.get(op->name); + } + auto it = replacements.find({alloc, op}); + if (it != replacements.end()) { + return mutate(it->second); + } else { + return IRMutator::visit(op); + } + } + + Stmt visit(const Allocate *op) override { + ScopedBinding bind(allocation_scope, op->name, op); + auto it = padding.find(op); + Stmt s = IRMutator::visit(op); + if (it == padding.end()) { + return s; + } else { + op = s.as(); + internal_assert(op); + return Allocate::make(op->name, op->type, op->memory_type, + op->extents, op->condition, + op->body, op->new_expr, op->free_function, + std::max(it->second, op->padding)); + } + } + + using IRMutator::visit; +}; + +} // namespace + +Stmt stage_strided_loads(const Stmt &s) { + FindStridedLoads finder; + ReplaceStridedLoads replacer; + + // Find related clusters of strided loads anywhere in the stmt. While this + // appears to look globally, it requires expressions to match exactly, so + // really it's only going to find things inside the same loops and let + // statements. + s.accept(&finder); + + for (const auto &l : finder.found_loads) { + const FindStridedLoads::Key &k = l.first; + const Allocate *alloc = k.allocation; + const std::map> &v = l.second; + + // Find clusters of strided loads that can share the same dense load. + for (auto load = v.begin(); load != v.end();) { + // If there is any other load at the same base at an offset at least + // stride-1 ahead, it's safe to do a big dense load. Note that we're + // assuming that it's always valid to load addresses between two + // valid addresses, which rules out games involving protected pages + // at the end of scanlines. + const bool can_lift = l.second.lower_bound(load->first + k.stride - 1) != l.second.end(); + + if (!can_lift) { + load++; + continue; + } + + // We have a complete cluster of loads. Make a single dense load + int lanes = k.lanes * k.stride; + int64_t first_offset = load->first; + Expr idx = Ramp::make(k.base + (int)first_offset, make_one(k.base.type()), lanes); + Type t = k.type.with_lanes(lanes); + const Load *op = load->second[0]; + Expr shared_load = Load::make(t, k.buf, idx, op->image, op->param, + const_true(lanes), op->alignment); + shared_load = common_subexpression_elimination(shared_load); + for (; load != v.end() && load->first < first_offset + k.stride; load++) { + Expr shuf = Shuffle::make_slice(shared_load, load->first - first_offset, k.stride, k.lanes); + for (const Load *l : load->second) { + replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + } + } + } + + // Do the same in reverse to pick up any loads that didn't get + // picked up in a cluster, but for whom we know it's safe to do a + // dense load before their start. 
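+        // For instance (schematically, with stride == 2 and 8 lanes): if loads
+        // exist at constant offsets 0 and 3 from the same base, the forward pass
+        // above only clusters the load at offset 0. The offset-0 load is evidence
+        // that reading from base + 2 is safe, so
+        //   buf[ramp(base + 3, 2, 8)]
+        // can instead be taken as the odd lanes of the dense load
+        //   buf[ramp(base + 2, 1, 16)]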
+ for (auto load = v.rbegin(); load != v.rend(); load++) { + if (replacer.replacements.count({alloc, load->second[0]})) { + continue; + } + int64_t delta = k.stride - 1; + const bool can_lift = l.second.upper_bound(load->first - delta) != l.second.begin(); + if (!can_lift) { + continue; + } + int lanes = k.lanes * k.stride; + int64_t first_offset = load->first - delta; + Expr idx = Ramp::make(k.base + (int)first_offset, make_one(k.base.type()), lanes); + Type t = k.type.with_lanes(lanes); + const Load *op = load->second[0]; + Expr dense_load = Load::make(t, k.buf, idx, op->image, op->param, + const_true(lanes), op->alignment - delta); + dense_load = common_subexpression_elimination(dense_load); + Expr shuf = Shuffle::make_slice(dense_load, delta, k.stride, k.lanes); + for (const Load *l : load->second) { + replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + } + } + + // Look for any loads we can densify because an overlapping load occurs + // in any parent scope. + for (auto load = v.rbegin(); load != v.rend(); load++) { + if (replacer.replacements.count({alloc, load->second[0]})) { + continue; + } + int64_t min_offset = load->first; + int64_t max_offset = load->first; + const IRNode *scope = k.scope; + while (scope) { + const IRNode *parent = finder.parent_scope[scope]; + auto parent_key = k; + parent_key.scope = parent; + auto it = finder.found_loads.find(parent_key); + if (it != finder.found_loads.end() && !it->second.empty()) { + min_offset = std::min(it->second.begin()->first, min_offset); + max_offset = std::max(it->second.rbegin()->first, max_offset); + } + scope = parent; + } + + if (max_offset - min_offset < k.stride - 1) { + continue; + } + int64_t offset = std::max(load->first - (k.stride - 1), min_offset); + int lanes = k.lanes * k.stride; + Expr idx = Ramp::make(k.base + (int)offset, make_one(k.base.type()), lanes); + Type t = k.type.with_lanes(lanes); + const Load *op = load->second[0]; + Expr dense_load = Load::make(t, k.buf, idx, op->image, op->param, + const_true(lanes), op->alignment); + dense_load = common_subexpression_elimination(dense_load); + Expr shuf = Shuffle::make_slice(dense_load, load->first - offset, k.stride, k.lanes); + for (const Load *l : load->second) { + replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + } + } + + // Densify any remaining strided loads to internal allocations by + // padding the allocation, and densify any remaining strided loads to + // external allocations by doing a dense load at a trimmed size. We rely + // on codegen to do a good job at loading vectors of a funny size. 
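+        // For instance (schematically, with stride == 2 and 8 lanes, loading from
+        // an external buffer): buf[ramp(base, 2, 8)] becomes two overlapping
+        // 8-wide dense loads, a = buf[ramp(base, 1, 8)] and
+        // b = buf[ramp(base + 7, 1, 8)]. We slice lanes 0, 2, 4, 6 out of a and
+        // lanes 1, 3, 5, 7 out of b and concatenate, so nothing is read before or
+        // past the elements the original strided load touched.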
+ for (auto load = v.begin(); load != v.end(); load++) { // NOLINT + if (replacer.replacements.count({alloc, load->second[0]})) { + continue; + } + + int lanes = k.lanes * k.stride; + + bool may_pad = k.allocation && !k.allocation->new_expr.defined(); + int delta = (int)(k.stride - 1); + + if (may_pad) { + auto p = replacer.padding.insert({k.allocation, delta}); + if (!p.second) { + p.first->second = std::max(p.first->second, delta); + } + + int64_t first_offset = load->first; + Expr idx = Ramp::make(k.base + (int)first_offset, make_one(k.base.type()), lanes); + Type t = k.type.with_lanes(lanes); + const Load *op = load->second[0]; + Expr dense_load = Load::make(t, k.buf, idx, op->image, op->param, + const_true(lanes), op->alignment); + dense_load = common_subexpression_elimination(dense_load); + Expr shuf = Shuffle::make_slice(dense_load, load->first - first_offset, k.stride, k.lanes); + for (const Load *l : load->second) { + replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + } + + } else if (k.lanes % 2 == 0) { + // Do two overlapping half-sized dense loads and mush them together. + int64_t first_offset = load->first; + int half_lanes = lanes / 2; + internal_assert(delta <= half_lanes); + Expr idx1 = Ramp::make(k.base + (int)first_offset, make_one(k.base.type()), half_lanes); + + Expr idx2 = Ramp::make(k.base + (int)first_offset + half_lanes - delta, make_one(k.base.type()), half_lanes); + Type t = k.type.with_lanes(half_lanes); + const Load *op = load->second[0]; + Expr dense_load1 = Load::make(t, k.buf, idx1, op->image, op->param, + const_true(half_lanes), op->alignment); + Expr dense_load2 = Load::make(t, k.buf, idx2, op->image, op->param, + const_true(half_lanes), op->alignment + half_lanes - delta); + dense_load1 = common_subexpression_elimination(dense_load1); + dense_load2 = common_subexpression_elimination(dense_load2); + Expr shuf1 = Shuffle::make_slice(dense_load1, 0, k.stride, k.lanes / 2); + Expr shuf2 = Shuffle::make_slice(dense_load2, delta, k.stride, k.lanes / 2); + Expr shuf = Shuffle::make_concat({shuf1, shuf2}); + for (const Load *l : load->second) { + replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + } + } + } + } + + return replacer.mutate(s); +} + +} // namespace Internal +} // namespace Halide diff --git a/src/StageStridedLoads.h b/src/StageStridedLoads.h new file mode 100644 index 000000000000..a29cef2438f1 --- /dev/null +++ b/src/StageStridedLoads.h @@ -0,0 +1,45 @@ +#ifndef HALIDE_INTERNAL_STAGE_STRIDED_LOADS_H +#define HALIDE_INTERNAL_STAGE_STRIDED_LOADS_H + +/** \file + * + * Defines the compiler pass that converts strided loads into dense loads + * followed by shuffles. + */ + +#include "Expr.h" + +namespace Halide { +namespace Internal { + +/** Convert all unpredicated strided loads in a Stmt into dense loads followed + * by shuffles. + * + * For a stride of two, the trick is to do a dense load of twice the size, and + * then extract either the even or odd lanes. This was previously done in + * codegen, where it was challenging, because it's not easy to know there if + * it's safe to do the double-sized load, as it either loads one element beyond + * or before the original load. We used the alignment of the ramp base to try to + * tell if it was safe to shift backwards, and we added padding to internal + * allocations so that for those at least it was safe to shift + * forwards. 
Unfortunately the alignment of the ramp base is usually unknown if + * you don't know anything about the strides of the input, and adding padding to + * allocations was a serious wart in our memory allocators. + * + * This pass instead actively looks for evidence elsewhere in the Stmt (at some + * location which definitely executes whenever the load being transformed + * executes) that it's safe to read further forwards or backwards in memory. The + * evidence is in the form of a load at the same base address with a different + * constant offset. It also clusters groups of these loads so that they do the + * same dense load and extract the appropriate slice of lanes. If it fails to + * find any evidence, for loads from external buffers it does two overlapping + * half-sized dense loads and shuffles out the desired lanes, and for loads from + * internal allocations it adds padding to the allocation explicitly, by setting + * the padding field on Allocate nodes. + */ +Stmt stage_strided_loads(const Stmt &s); + +} // namespace Internal +} // namespace Halide + +#endif diff --git a/src/StorageFlattening.cpp b/src/StorageFlattening.cpp index 814d5e4c6a83..2916fa91a3ae 100644 --- a/src/StorageFlattening.cpp +++ b/src/StorageFlattening.cpp @@ -446,7 +446,7 @@ class PromoteToMemoryType : public IRMutator { if (t != op->type) { return Allocate::make(op->name, t, op->memory_type, mutate(op->extents), mutate(op->condition), mutate(op->body), - mutate(op->new_expr), op->free_function); + mutate(op->new_expr), op->free_function, op->padding); } else { return IRMutator::visit(op); } diff --git a/src/UnrollLoops.cpp b/src/UnrollLoops.cpp index 332507becf6d..b0deede5b6fe 100644 --- a/src/UnrollLoops.cpp +++ b/src/UnrollLoops.cpp @@ -6,6 +6,7 @@ #include "Simplify.h" #include "SimplifyCorrelatedDifferences.h" #include "Substitute.h" +#include "UniquifyVariableNames.h" using std::pair; using std::vector; @@ -124,7 +125,9 @@ class UnrollLoops : public IRMutator { } // namespace Stmt unroll_loops(const Stmt &s) { - return UnrollLoops().mutate(s); + Stmt stmt = UnrollLoops().mutate(s); + // Unrolling duplicates variable names. Other passes assume variable names are unique. + return uniquify_variable_names(stmt); } } // namespace Internal diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp index 7dcd79d24664..91e958c9e196 100644 --- a/src/VectorizeLoops.cpp +++ b/src/VectorizeLoops.cpp @@ -1062,6 +1062,12 @@ class VectorSubs : public IRMutator { body = substitute(vv.name + ".from_zero", Variable::make(Int(32), vv.name), body); } + // Difficult to tell how the padding should grow when vectorizing an + // allocation. It's not currently an issue, because vectorization + // happens before the only source of padding (lowering strided + // loads). Add an assert to enforce it. 
+ internal_assert(op->padding == 0) << "Vectorization of padded allocations not yet implemented"; + return Allocate::make(op->name, op->type, op->memory_type, new_extents, op->condition, body, new_expr, op->free_function); } diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 0e7415a3b0c8..dca5ac9fb2b1 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -287,6 +287,7 @@ tests(GROUPS correctness split_reuse_inner_name_bug.cpp split_store_compute.cpp stack_allocations.cpp + stage_strided_loads.cpp stencil_chain_in_update_definitions.cpp stmt_to_html.cpp storage_folding.cpp diff --git a/test/correctness/simd_op_check_arm.cpp b/test/correctness/simd_op_check_arm.cpp index 75088816031a..e096bc262b1b 100644 --- a/test/correctness/simd_op_check_arm.cpp +++ b/test/correctness/simd_op_check_arm.cpp @@ -291,35 +291,61 @@ class SimdOpCheckARM : public SimdOpCheckTest { check(arm32 ? "vld1.32" : "ldr", 2 * w, in_f32(x + y)); } - // VLD2 X - Load Two-Element Structures - // These need to be vectorized at least 2 native vectors wide, - // so we get a full vectors' worth that we know is safe to - // access. - check(arm32 ? "vld2.8" : "ld2", 32 * w, in_i8(x * 2) + in_i8(x * 2 + 1)); - check(arm32 ? "vld2.8" : "ld2", 32 * w, in_u8(x * 2) + in_u8(x * 2 + 1)); - check(arm32 ? "vld2.16" : "ld2", 16 * w, in_i16(x * 2) + in_i16(x * 2 + 1)); - check(arm32 ? "vld2.16" : "ld2", 16 * w, in_u16(x * 2) + in_u16(x * 2 + 1)); - check(arm32 ? "vld2.32" : "ld2", 8 * w, in_i32(x * 2) + in_i32(x * 2 + 1)); - check(arm32 ? "vld2.32" : "ld2", 8 * w, in_u32(x * 2) + in_u32(x * 2 + 1)); - check(arm32 ? "vld2.32" : "ld2", 8 * w, in_f32(x * 2) + in_f32(x * 2 + 1)); - - // VLD3 X - Load Three-Element Structures - check(arm32 ? "vld3.8" : "ld3", 32 * w, in_i8(x * 3)); - check(arm32 ? "vld3.8" : "ld3", 32 * w, in_u8(x * 3)); - check(arm32 ? "vld3.16" : "ld3", 16 * w, in_i16(x * 3)); - check(arm32 ? "vld3.16" : "ld3", 16 * w, in_u16(x * 3)); - check(arm32 ? "vld3.32" : "ld3", 8 * w, in_i32(x * 3)); - check(arm32 ? "vld3.32" : "ld3", 8 * w, in_u32(x * 3)); - check(arm32 ? "vld3.32" : "ld3", 8 * w, in_f32(x * 3)); - - // VLD4 X - Load Four-Element Structures - check(arm32 ? "vld4.8" : "ld4", 32 * w, in_i8(x * 4)); - check(arm32 ? "vld4.8" : "ld4", 32 * w, in_u8(x * 4)); - check(arm32 ? "vld4.16" : "ld4", 16 * w, in_i16(x * 4)); - check(arm32 ? "vld4.16" : "ld4", 16 * w, in_u16(x * 4)); - check(arm32 ? "vld4.32" : "ld4", 8 * w, in_i32(x * 4)); - check(arm32 ? "vld4.32" : "ld4", 8 * w, in_u32(x * 4)); - check(arm32 ? "vld4.32" : "ld4", 8 * w, in_f32(x * 4)); + if (target.os != Target::IOS && target.os != Target::OSX) { + // VLD* are not profitable on Apple silicon + + // VLD2 X - Load Two-Element Structures + // These need to be vectorized at least 2 native vectors wide, + // so we get a full vectors' worth that we know is safe to + // access. + check(arm32 ? "vld2.8" : "ld2", 32 * w, in_i8(x * 2)); + check(arm32 ? "vld2.8" : "ld2", 32 * w, in_u8(x * 2)); + check(arm32 ? "vld2.16" : "ld2", 16 * w, in_i16(x * 2)); + check(arm32 ? "vld2.16" : "ld2", 16 * w, in_u16(x * 2)); + check(arm32 ? "vld2.32" : "ld2", 8 * w, in_i32(x * 2)); + check(arm32 ? "vld2.32" : "ld2", 8 * w, in_u32(x * 2)); + check(arm32 ? "vld2.32" : "ld2", 8 * w, in_f32(x * 2)); + + // VLD3 X - Load Three-Element Structures + check(arm32 ? "vld3.8" : "ld3", 32 * w, in_i8(x * 3)); + check(arm32 ? "vld3.8" : "ld3", 32 * w, in_u8(x * 3)); + check(arm32 ? 
"vld3.16" : "ld3", 16 * w, in_i16(x * 3)); + check(arm32 ? "vld3.16" : "ld3", 16 * w, in_u16(x * 3)); + check(arm32 ? "vld3.32" : "ld3", 8 * w, in_i32(x * 3)); + check(arm32 ? "vld3.32" : "ld3", 8 * w, in_u32(x * 3)); + check(arm32 ? "vld3.32" : "ld3", 8 * w, in_f32(x * 3)); + + // VLD4 X - Load Four-Element Structures + check(arm32 ? "vld4.8" : "ld4", 32 * w, in_i8(x * 4)); + check(arm32 ? "vld4.8" : "ld4", 32 * w, in_u8(x * 4)); + check(arm32 ? "vld4.16" : "ld4", 16 * w, in_i16(x * 4)); + check(arm32 ? "vld4.16" : "ld4", 16 * w, in_u16(x * 4)); + check(arm32 ? "vld4.32" : "ld4", 8 * w, in_i32(x * 4)); + check(arm32 ? "vld4.32" : "ld4", 8 * w, in_u32(x * 4)); + check(arm32 ? "vld4.32" : "ld4", 8 * w, in_f32(x * 4)); + } else if (!arm32) { + // On Apple Silicon we expect dense loads followed by shuffles. + check("uzp1.16b", 32 * w, in_i8(x * 2)); + check("uzp1.16b", 32 * w, in_u8(x * 2)); + check("uzp1.8h", 16 * w, in_i16(x * 2)); + check("uzp1.8h", 16 * w, in_u16(x * 2)); + check("uzp1.4s", 8 * w, in_i32(x * 2)); + check("uzp1.4s", 8 * w, in_u32(x * 2)); + check("uzp1.4s", 8 * w, in_f32(x * 2)); + + // VLD3 X - Load Three-Element Structures + check("tbl.16b", 32 * w, in_i8(x * 3)); + check("tbl.16b", 32 * w, in_u8(x * 3)); + check("tbl.16b", 16 * w, in_i16(x * 3)); + check("tbl.16b", 16 * w, in_u16(x * 3)); + // For 32-bit types llvm just scalarizes + + // VLD4 X - Load Four-Element Structures + check("tbl.16b", 32 * w, in_i8(x * 4)); + check("tbl.16b", 32 * w, in_u8(x * 4)); + check("tbl.16b", 16 * w, in_i16(x * 4)); + check("tbl.16b", 16 * w, in_u16(x * 4)); + } // VLDM X F, D Load Multiple Registers // VLDR X F, D Load Single Register diff --git a/test/correctness/stage_strided_loads.cpp b/test/correctness/stage_strided_loads.cpp new file mode 100644 index 000000000000..f791385f7c25 --- /dev/null +++ b/test/correctness/stage_strided_loads.cpp @@ -0,0 +1,264 @@ +#include "Halide.h" + +using namespace Halide; +using namespace Halide::Internal; + +class CheckForStridedLoads : public IRMutator { + using IRMutator::visit; + + Expr visit(const Load *op) override { + if (const Ramp *r = op->index.as()) { + if (op->name == buf_name) { + bool dense = is_const_one(r->stride); + found |= !dense; + dense_loads += dense; + } + } + return IRMutator::visit(op); + } + +public: + bool found = false; + int dense_loads = 0; + std::string buf_name; + + void check(Func f, int desired_dense_loads, std::string name = "buf") { + found = false; + dense_loads = 0; + buf_name = name; + f.add_custom_lowering_pass(this, nullptr); + f.compile_jit(); + assert(!found); + assert(dense_loads == desired_dense_loads); + } + + void check_not(Func f, int desired_dense_loads, std::string name = "buf") { + found = false; + dense_loads = 0; + buf_name = name; + f.add_custom_lowering_pass(this, nullptr); + f.compile_jit(); + assert(found); + assert(dense_loads == desired_dense_loads); + } +} checker; + +int main(int argc, char **argv) { + ImageParam buf(Float(32), 1, "buf"); + + { + // Clusters of strided loads can share the same underlying dense load + Func f; + Var x; + f(x) = buf(2 * x) + buf(2 * x + 1); + f.vectorize(x, 8, TailStrategy::RoundUp); + + // We expect no strided loads, and one dense load + checker.check(f, 1); + } + + { + Func f; + Var x; + f(x) = buf(2 * x) + buf(2 * x + 1) + buf(2 * x + 2); + f.vectorize(x, 8, TailStrategy::RoundUp); + + // We expect two dense loads in this case. One for the first two taps, + // and one for the last tap. 
+        checker.check(f, 2);
+    }
+
+    {
+        // Check four taps
+        Func f;
+        Var x;
+        f(x) = (buf(2 * x) + buf(2 * x + 2)) + (buf(2 * x + 1) + buf(2 * x + 3));
+        f.vectorize(x, 8, TailStrategy::RoundUp);
+
+        checker.check(f, 2);
+    }
+
+    {
+        // Check tuples
+        Func f;
+        Var x;
+        f(x) = {0.f, 0.f};
+        f(x) += {buf(2 * x), buf(2 * x + 1)};
+        f.update().vectorize(x, 8, TailStrategy::RoundUp);
+
+        // In this case, the dense load appears twice across the two store
+        // statements for the two tuple components, but it will get deduped by
+        // llvm.
+        checker.check(f, 2);
+    }
+
+    {
+        // Far-apart constant offsets are still enough evidence that it's safe to
+        // do a dense load.
+        Func f;
+        Var x;
+        f(x) = buf(2 * x - 123) + buf(2 * x + 134);
+        f.vectorize(x, 8, TailStrategy::RoundUp);
+
+        checker.check(f, 2);
+    }
+
+    {
+        // Load partners can be split across multiple Funcs in the same block
+        Func f, g;
+        Var x;
+        f(x) = buf(2 * x);
+        g(x) = f(x) + buf(2 * x + 1);
+
+        g.vectorize(x, 8, TailStrategy::RoundUp);
+        f.compute_at(g, x).vectorize(x);
+
+        checker.check(g, 2);
+    }
+
+    {
+        // Load partners can be split across update definitions
+        Func f, g;
+        Var x;
+        f(x) = buf(2 * x);
+        f(x) += buf(2 * x + 1);
+        g(x) = f(x);
+        g.vectorize(x, 8, TailStrategy::RoundUp);
+
+        checker.check(g, 2);
+    }
+
+    {
+        // Load clusters can be split across an unrolled variable
+        Func f, g;
+        Var x, c;
+        f(x, c) = buf(4 * x + c) + 4 * x;
+        f.vectorize(x, 8, TailStrategy::RoundUp).bound(c, 0, 4).unroll(c).reorder(c, x);
+
+        checker.check(f, 4);
+    }
+
+    {
+        // Load clusters can even be split across unrelated inner loop nests
+        // (provided they are known to have non-zero extent).
+
+        Func f, g, h;
+        Var c, x, y;
+        g(x, y) = buf(2 * x) + y;
+        h(x, y) = buf(2 * x + 1) + y;
+        f(x, y, c) = g(x, y) + h(x, y) + c;
+
+        Var xi, yi;
+        f.tile(x, y, xi, yi, 8, 8, TailStrategy::RoundUp).vectorize(xi).reorder(c, x, y);
+        g.compute_at(f, x).vectorize(x);
+        h.compute_at(f, x).vectorize(x);
+        checker.check(f, 2);
+    }
+
+    // We can always densify strided loads to internal allocations, because we
+    // can just pad the allocation.
+    {
+        Func f, g;
+        Var x;
+
+        f(x) = x;
+        g(x) = f(2 * x);
+        f.compute_at(g, x).vectorize(x);
+        g.vectorize(x, 8, TailStrategy::RoundUp);
+        checker.check(g, 1, f.name());
+    }
+
+    // Strides up to the vector size are worth densifying. After that, it's better to just gather.
+    {
+        Func f;
+        Var x;
+        f(x) = buf(15 * x) + buf(15 * x + 14);
+        f.vectorize(x, 16, TailStrategy::RoundUp);
+
+        checker.check(f, 1);
+    }
+
+    {
+        Func f;
+        Var x;
+        f(x) = buf(16 * x) + buf(16 * x + 15);
+        f.vectorize(x, 16, TailStrategy::RoundUp);
+
+        checker.check_not(f, 0);
+    }
+
+    // Strided loads to external allocations are handled by doing a weird-sized
+    // dense load and then shuffling.
+    {
+        Func f;
+        Var x;
+        f(x) = buf(3 * x);
+        f.vectorize(x, 8, TailStrategy::RoundUp);
+        checker.check(f, 2);
+    }
+
+    // Make a pair of unconditionally-executed loads, and check that a
+    // conditionally-executed load can use it as evidence that a dense load in
+    // one direction or the other is safe to do.
+    {
+        Func f;
+        Var x;
+        f(x) = buf(2 * x) + buf(2 * x + 1);
+        RDom r1(0, 1), r2(0, 1);
+        Param<bool> p1, p2;
+        r1.where(p1);
+        r2.where(p2);
+        f(x) += buf(2 * x + 3) + r1;
+        f(x) += buf(2 * x - 3) + r2;
+
+        Func g;
+        g(x) = f(x);
+        g.vectorize(x, 8, TailStrategy::RoundUp);
+        f.compute_at(g, x).vectorize(x);
+        f.update(0).vectorize(x);
+        f.update(1).vectorize(x);
+
+        checker.check(g, 3);
+    }
+
+    // Make a case that uses nested vectorization.
+    {
+        Func f;
+        Var x, c;
+
+        f(c, x) = buf(2 * (2 * x + c)) + buf(2 * (2 * x + c) + 1);
+        f.vectorize(x, 8, TailStrategy::RoundUp).bound(c, 0, 2).vectorize(c);
+        f.output_buffer().dim(1).set_stride(2);
+        checker.check(f, 1);
+    }
+
+    // Do a variety of weird loads at weird sizes from an external buffer to
+    // test the behaviour that does two half-sized loads.
+    {
+        Buffer<float> data(1024);
+        Buffer<float> out(512);
+        for (int i = 0; i < 1024; i++) {
+            data(i) = i;
+        }
+        buf.set(data);
+        for (int size = 2; size <= 16; size += 2) {
+            for (int stride = 2; stride <= 8; stride++) {
+                Func f;
+                Var x;
+                f(x) = buf(stride * x);
+                f.vectorize(x, size);
+
+                Buffer<float> out = f.realize({1024 / stride});
+                for (int i = 0; i < out.width(); i++) {
+                    if (out(i) != data(stride * i)) {
+                        printf("out(%d) = %f instead of %f\n", i, out(i), data(stride * i));
+                    }
+                }
+            }
+        }
+    }
+
+    printf("Success!\n");
+
+    return 0;
+}
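The "two half-sized loads" behaviour exercised by the last block of the test is easiest to see with a stride of 2. The following sketch spells out the reasoning (illustrative only; the exact offsets the pass picks are an implementation detail, and the ones in the comments are schematic):

#include "Halide.h"

using namespace Halide;

int main() {
    ImageParam buf(Float(32), 1, "buf");
    Func f("f");
    Var x("x");

    // A lone strided load from an external buffer: no second tap provides
    // evidence that reading past the touched region is safe.
    f(x) = buf(2 * x);
    f.vectorize(x, 8, TailStrategy::RoundUp);

    // A single dense 16-wide load of buf[2*x .. 2*x+15] would touch one element
    // (2*x+15) that the strided load never reads. Instead the pass emits two
    // overlapping 8-wide dense loads -- schematically buf[2*x .. 2*x+7] and
    // buf[2*x+7 .. 2*x+14] -- whose concatenation contains every element the
    // strided load needs (offsets 0, 2, ..., 14), and shuffles out those lanes.
    f.compile_jit();
    return 0;
}

Neither of the two overlapping loads reads past the last element the original strided load would have touched, which is the point of the fallback.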
diff --git a/test/performance/stack_vs_heap.cpp b/test/performance/stack_vs_heap.cpp
index d5b42c4df8d1..e5314f77897d 100644
--- a/test/performance/stack_vs_heap.cpp
+++ b/test/performance/stack_vs_heap.cpp
@@ -11,7 +11,9 @@ int main(int argc, char **argv) {
     }
 
     double times[3] = {0.f, 0.f, 0.f};
-    for (int i = 0; i < 3; i++) {
+
+    for (int sz = 1; sz < 32; sz = sz * 2 + 1) {
+        for (int c = 0; c < 3; c++) {
             MemoryType mem_type;
             bool use_bound;
@@ -31,12 +33,19 @@ int main(int argc, char **argv) {
                 use_bound = false;
             }
 
-            Func f;
             Var x, y;
-            f(x, y) = x / 18.3f + y;
+
+            std::vector<Func> fs;
+            Expr e = 0.0f;
+            for (int j = 0; j < 10; j++) {
+                Func f;
+                f(x, y) = x * j + y;
+                e += f(x, y);
+                fs.push_back(f);
+            }
 
             Func g;
-            g(x, y) = f(x, y) + f(x, y + 1);
+            g(x, y) = e;
 
             Var yo, yi;
             // Place the y loop body in its own function with its own
@@ -44,15 +53,16 @@
             // which will be 1 in practice.
             Param<int> task_size;
             g.split(y, yo, yi, task_size).parallel(yi);
-            f.compute_at(g, yi).store_in(mem_type);
-
-            if (use_bound) {
-                f.bound_extent(x, 8);
+            for (auto f : fs) {
+                f.compute_at(g, yi).store_in(mem_type);
+                if (use_bound) {
+                    f.bound_extent(x, sz);
+                }
             }
 
-            Buffer<float> out(8, 1024);
+            Buffer<float> out(sz, 1024);
             task_size.set(1);
-            double t = 1e3 * Tools::benchmark(10, 100, [&]() {
+            double t = 1e3 * Tools::benchmark(10, 1 + 100 / sz, [&]() {
                 g.realize(out);
             });
             times[c] += t;