From 538577a77c34774875f474f093a536afa3a24b54 Mon Sep 17 00:00:00 2001
From: Andrew Adams
Date: Tue, 16 Apr 2024 17:06:16 -0700
Subject: [PATCH 01/13] Rewrite IREquality to use a more compact stack instead
 of deep recursion

Deletes a bunch of code and speeds up lowering time of local laplacian
with 20 pyramid levels by ~2.5%
---
 src/Bounds.cpp            |    4 +-
 src/CSE.cpp               |   18 +-
 src/IREquality.cpp        | 1139 +++++++++++++++++--------------------
 src/IREquality.h          |   89 +--
 src/ParallelRVar.cpp      |    2 +-
 src/RDom.cpp              |    2 +-
 src/ScheduleFunctions.cpp |    2 +-
 7 files changed, 520 insertions(+), 736 deletions(-)

diff --git a/src/Bounds.cpp b/src/Bounds.cpp
index d7d337dacfdf..a8ed2deba0d2 100644
--- a/src/Bounds.cpp
+++ b/src/Bounds.cpp
@@ -79,9 +79,9 @@ int static_sign(const Expr &x) {
         return -1;
     } else {
         Expr zero = make_zero(x.type());
-        if (equal(const_true(), simplify(x > zero))) {
+        if (is_const_one(simplify(x > zero))) {
             return 1;
-        } else if (equal(const_true(), simplify(x < zero))) {
+        } else if (is_const_one(simplify(x < zero))) {
             return -1;
         }
     }
diff --git a/src/CSE.cpp b/src/CSE.cpp
index d8ecd619db81..0905562c4e63 100644
--- a/src/CSE.cpp
+++ b/src/CSE.cpp
@@ -76,7 +76,7 @@ class GVN : public IRMutator {
     Expr expr;
     int use_count = 0;
     // All consumer Exprs for which this is the last child Expr.
-    map<ExprWithCompareCache, int> uses;
+    map<Expr, int, IRDeepCompare> uses;
     Entry(const Expr &e)
         : expr(e) {
     }
@@ -84,25 +84,15 @@ class GVN : public IRMutator {
     vector<std::unique_ptr<Entry>> entries;
     map<Expr, int> shallow_numbering, output_numbering;
-    map<ExprWithCompareCache, int> leaves;
+    map<Expr, int, IRDeepCompare> leaves;
 
-    int number = -1;
-
-    IRCompareCache cache;
-
-    GVN()
-        : number(0), cache(8) {
-    }
+    int number = 0;
 
     Stmt mutate(const Stmt &s) override {
         internal_error << "Can't call GVN on a Stmt: " << s << "\n";
         return Stmt();
     }
 
-    ExprWithCompareCache with_cache(const Expr &e) {
-        return ExprWithCompareCache(e, &cache);
-    }
-
     Expr mutate(const Expr &e) override {
         // Early out if we've already seen this exact Expr.
         {
@@ -123,7 +113,7 @@ class GVN : public IRMutator {
             // that child has an identical parent to this one.
             auto &use_map = number == -1 ? leaves : entries[number]->uses;
-            auto p = use_map.emplace(with_cache(new_e), (int)entries.size());
+            auto p = use_map.emplace(new_e, (int)entries.size());
             auto iter = p.first;
             bool novel = p.second;
             if (novel) {
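A minimal sketch (not part of the patch) of what the new GVN maps rely on: IRDeepCompare, already exposed by IREquality.h, is an ordering functor over Exprs, so two structurally identical expressions with distinct node pointers collapse onto a single map entry, without the old ExprWithCompareCache wrapper:

#include <map>
#include "Halide.h"
using namespace Halide;
using namespace Halide::Internal;

int main() {
    Var x("x");
    Expr a = x + 1;
    Expr b = Variable::make(Int(32), "x") + 1;  // same structure, distinct nodes
    std::map<Expr, int, IRDeepCompare> m;
    m[a] = 1;
    m[b] += 1;  // ordered by structure, so this hits the entry made via `a`
    return m.size() == 1 ? 0 : 1;  // exits 0: one entry, holding 2
}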
diff --git a/src/IREquality.cpp b/src/IREquality.cpp
index 0d21ca1e26b5..90ec38970f5d 100644
--- a/src/IREquality.cpp
+++ b/src/IREquality.cpp
@@ -10,713 +10,590 @@
 using std::vector;
 
 namespace {
 
-/** The class that does the work of comparing two IR nodes. */
-class IRComparer : public IRVisitor {
-public:
-    /** Different possible results of a comparison. Unknown should
-     * only occur internally due to a cache miss. */
-    enum CmpResult { Unknown,
-                     Equal,
-                     LessThan,
-                     GreaterThan };
-
-    /** The result of the comparison. Should be Equal, LessThan, or GreaterThan. */
-    CmpResult result = Equal;
-
-    /** Compare two expressions or statements and return the
-     * result. Returns the result immediately if it is already
-     * non-zero. */
-    // @{
-    CmpResult compare_expr(const Expr &a, const Expr &b);
-    CmpResult compare_stmt(const Stmt &a, const Stmt &b);
-    // @}
-
-    /** If the expressions you're comparing may contain many repeated
-     * subexpressions, it's worth passing in a cache to use.
-     * Currently this is only done in common-subexpression
-     * elimination. */
-    IRComparer(IRCompareCache *c = nullptr)
-        : cache(c) {
-    }
-
-private:
-    Expr expr;
-    Stmt stmt;
-    IRCompareCache *cache;
-
-    CmpResult compare_names(const std::string &a, const std::string &b);
-    CmpResult compare_types(Type a, Type b);
-    CmpResult compare_expr_vector(const std::vector<Expr> &a, const std::vector<Expr> &b);
-
-    // Compare two things that already have a well-defined operator<
-    template<typename T>
-    CmpResult compare_scalar(T a, T b);
-
-    void visit(const IntImm *) override;
-    void visit(const UIntImm *) override;
-    void visit(const FloatImm *) override;
-    void visit(const StringImm *) override;
-    void visit(const Cast *) override;
-    void visit(const Reinterpret *) override;
-    void visit(const Variable *) override;
-    void visit(const Add *) override;
-    void visit(const Sub *) override;
-    void visit(const Mul *) override;
-    void visit(const Div *) override;
-    void visit(const Mod *) override;
-    void visit(const Min *) override;
-    void visit(const Max *) override;
-    void visit(const EQ *) override;
-    void visit(const NE *) override;
-    void visit(const LT *) override;
-    void visit(const LE *) override;
-    void visit(const GT *) override;
-    void visit(const GE *) override;
-    void visit(const And *) override;
-    void visit(const Or *) override;
-    void visit(const Not *) override;
-    void visit(const Select *) override;
-    void visit(const Load *) override;
-    void visit(const Ramp *) override;
-    void visit(const Broadcast *) override;
-    void visit(const Call *) override;
-    void visit(const Let *) override;
-    void visit(const LetStmt *) override;
-    void visit(const AssertStmt *) override;
-    void visit(const ProducerConsumer *) override;
-    void visit(const For *) override;
-    void visit(const Acquire *) override;
-    void visit(const Store *) override;
-    void visit(const Provide *) override;
-    void visit(const Allocate *) override;
-    void visit(const Free *) override;
-    void visit(const Realize *) override;
-    void visit(const Block *) override;
-    void visit(const Fork *) override;
-    void visit(const IfThenElse *) override;
-    void visit(const Evaluate *) override;
-    void visit(const Shuffle *) override;
-    void visit(const Prefetch *) override;
-    void visit(const Atomic *) override;
-    void visit(const VectorReduce *) override;
-    void visit(const HoistedStorage *) override;
-};
+enum CmpResult { Unknown,
+                 Equal,
+                 LessThan,
+                 GreaterThan };
+
+// A helper class for comparing two pieces of IR with the minimum amount of
+// recursion.
+template<int cache_size>
+struct Comparer {
+
+    // Points to any cache in use for comparing Expr graphs. Will be non-null
+    // exactly when cache_size > 0.
+    const IRNode **cache;
+
+    // The compare method below does the actual work, but it needs to call out
+    // to a variety of template helper functions to compare specific types. We
+    // make the syntax in the giant switch statement in the compare method much
+    // simpler if we just give these helper functions access to the state in the
+    // compare method: the stack pointers, the currently-considered piece of
+    // IR, and the result of the comparison so far.
+    const IRNode **stack_end = nullptr, **stack_ptr = nullptr;
+    const IRNode *next_a = nullptr, *next_b = nullptr;
+    CmpResult result = Equal;
+
+    Comparer(const IRNode **cache)
+        : cache(cache) {
+    }
+
+    // Compare the given member variable of next_a and next_b. If it's an Expr
+    // or Stmt, it's guaranteed to be defined.
+    template<typename Node, typename MemberType>
+    HALIDE_ALWAYS_INLINE void cmp(MemberType Node::*member_ptr) {
+        if (result == Equal) {
+            cmp(((const Node *)next_a)->*member_ptr, ((const Node *)next_b)->*member_ptr);
+        }
+    }
+
+    // The same as above, but with no guarantee.
+    template<typename Node, typename MemberType>
+    HALIDE_ALWAYS_INLINE void cmp_if_defined(MemberType Node::*member_ptr) {
+        if (result == Equal) {
+            cmp_if_defined(((const Node *)next_a)->*member_ptr, ((const Node *)next_b)->*member_ptr);
+        }
+    }
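As an aside, here is a standalone illustration (toy types, not Halide's) of the pointer-to-member trick the two helpers above use: the node class is deduced from the member pointer itself, so a single helper can downcast next_a/next_b and compare any field of any IR node class:

#include <iostream>

struct Base { int kind; };
struct Add : Base { int a, b; };

struct Cmp {
    const Base *x, *y;
    // Node is deduced from the member pointer, so the casts below pick the
    // right derived type automatically.
    template<typename Node, typename Member>
    int cmp(Member Node::*m) const {
        const auto &ma = static_cast<const Node *>(x)->*m;
        const auto &mb = static_cast<const Node *>(y)->*m;
        return ma < mb ? -1 : (mb < ma ? 1 : 0);
    }
};

int main() {
    Add p{{0}, 1, 2}, q{{0}, 1, 3};
    Cmp c{&p, &q};
    std::cout << c.cmp(&Add::a) << " " << c.cmp(&Add::b) << "\n";  // prints: 0 -1
    return 0;
}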
+
+    size_t hash(const IRNode *a, const IRNode *b) {
+        uintptr_t pa = (uintptr_t)a;
+        uintptr_t pb = (uintptr_t)b;
+        uintptr_t h = (((pa * 17) ^ (pb * 13)) >> 4);
+        h ^= h >> 8;
+        h = h & (cache_size - 1);
+        return h;
+    }
+
+    // See if we've already processed this pair of IR nodes.
+    bool cache_contains(const IRNode *a, const IRNode *b) {
+        size_t h = hash(a, b);
+        const IRNode **c = cache + h * 2;
+        return (c[0] == a && c[1] == b);
+    }
+
+    // Mark a pair of IR nodes as already processed. We don't do this until
+    // we're done processing their children, because there aren't going to be
+    // any queries to match a node with one of its children, because nodes can't
+    // be their own ancestors. Inserting it into the cache too soon just means
+    // it's going to be evicted before we need it.
+    void cache_insert(const IRNode *a, const IRNode *b) {
+        size_t h = hash(a, b);
+        const IRNode **c = cache + h * 2;
+        c[0] = a;
+        c[1] = b;
+    }
+
+    // Compare two known-to-be-defined IR nodes. Well... don't actually compare
+    // them, because that would be a recursive call. Just push them onto the
+    // pending tasks stack.
+    void cmp(const IRHandle &a, const IRHandle &b) {
+        if (cache_size > 0 && cache_contains(a.get(), b.get())) {
+            return;
+        }
+
+        if (a.get() == b.get()) {
+        } else if (stack_ptr == stack_end) {
+            // Out of stack space. Make a recursive call to buy some more stack.
+            Comparer<cache_size> sub_comparer(cache);
+            result = sub_comparer.compare(a.get(), b.get());
+        } else {
+            *stack_ptr++ = a.get();
+            *stack_ptr++ = b.get();
+        }
+    }
 
-template<typename T>
-IRComparer::CmpResult IRComparer::compare_scalar(T a, T b) {
-    if (result != Equal) {
-        return result;
-    }
-
-    if constexpr (std::is_floating_point_v<T>) {
-        // NaNs are equal to each other and less than non-nans
-        if (std::isnan(a) && std::isnan(b)) {
-            result = Equal;
-            return result;
-        }
-        if (std::isnan(a)) {
-            result = LessThan;
-            return result;
-        }
-        if (std::isnan(b)) {
-            result = GreaterThan;
-            return result;
-        }
-    }
-
-    if (a < b) {
-        result = LessThan;
-    } else if (a > b) {
-        result = GreaterThan;
-    }
-
-    return result;
-}
-
+    // Compare two IR nodes, which may or may not be defined.
+    HALIDE_ALWAYS_INLINE
+    void cmp_if_defined(const IRHandle &a, const IRHandle &b) {
+        if (a.defined() < b.defined()) {
+            result = LessThan;
+        } else if (a.defined() > b.defined()) {
+            result = GreaterThan;
+        } else if (a.defined() && b.defined()) {
+            cmp(a, b);
+        }
+    }
+
+    template<typename T>
+    void cmp(const std::vector<T> &a, const std::vector<T> &b) {
+        if (a.size() < b.size()) {
+            result = LessThan;
+        } else if (a.size() > b.size()) {
+            result = GreaterThan;
+        } else {
+            for (size_t i = 0; i < a.size() && result == Equal; i++) {
+                cmp(a[i], b[i]);
+            }
+        }
+    }
+
+    HALIDE_ALWAYS_INLINE
+    void cmp(const Range &a, const Range &b) {
+        cmp(a.min, b.min);
+        cmp(a.extent, b.extent);
+    }
+
+    HALIDE_ALWAYS_INLINE
+    void cmp(const ModulusRemainder &a, const ModulusRemainder &b) {
+        cmp(a.modulus, b.modulus);
+        cmp(a.remainder, b.remainder);
+    }
+
+    void cmp(const halide_handle_cplusplus_type *ha,
+             const halide_handle_cplusplus_type *hb) {
+        if (ha == hb) {
+            return;
+        } else if (!ha) {
+            result = LessThan;
+        } else if (!hb) {
+            result = GreaterThan;
+        } else {
+            // They're both non-void handle types with distinct type info
+            // structs. We now need to distinguish between different C++
+            // pointer types (e.g. char * vs const float *). It would be nice
+            // if the structs were unique per C++ type. Then comparing the
+            // pointers above would be sufficient. Unfortunately, different
+            // shared libraries in the same process each create a distinct
+            // struct for the same type. We therefore have to do a deep
+            // comparison of the type info fields.
+            cmp(ha->reference_type, hb->reference_type);
+            cmp(ha->inner_name.name, hb->inner_name.name);
+            cmp(ha->inner_name.cpp_type_type, hb->inner_name.cpp_type_type);
+            cmp(ha->namespaces, hb->namespaces);
+            cmp(ha->enclosing_types, hb->enclosing_types);
+            cmp(ha->cpp_type_modifiers, hb->cpp_type_modifiers);
+        }
+    }
+
+    HALIDE_ALWAYS_INLINE
+    void cmp(const Type &a, const Type &b) {
+        uint32_t ta = ((halide_type_t)a).as_u32();
+        uint32_t tb = ((halide_type_t)b).as_u32();
+        if (ta < tb) {
+            result = LessThan;
+        } else if (ta > tb) {
+            result = GreaterThan;
+        } else {
+            if (a.handle_type || b.handle_type) {
+                cmp(a.handle_type, b.handle_type);
+            }
+        }
+    }
+
+    void cmp(const PrefetchDirective &a, const PrefetchDirective &b) {
+        cmp(a.name, b.name);
+        cmp(a.at, b.at);
+        cmp(a.from, b.from);
+        cmp(a.offset, b.offset);
+        cmp(a.strategy, b.strategy);
+    }
+
+    HALIDE_ALWAYS_INLINE
+    void cmp(double a, double b) {
+        // Floating point scalars need special handling, due to NaNs.
+        if (std::isnan(a) && std::isnan(b)) {
+        } else if (std::isnan(a)) {
+            result = LessThan;
+        } else if (std::isnan(b)) {
+            result = GreaterThan;
+        } else if (a < b) {
+            result = LessThan;
+        } else if (b < a) {
+            result = GreaterThan;
+        }
+    }
+
+    HALIDE_ALWAYS_INLINE
+    void cmp(const std::string &a, const std::string &b) {
+        int r = a.compare(b);
+        if (r < 0) {
+            result = LessThan;
+        } else if (r > 0) {
+            result = GreaterThan;
+        }
+    }
+
+    // The method to use whenever we can just use operator< and get a bool.
+    template<typename T,
+             typename = std::enable_if_t<!std::is_floating_point_v<T> &&
+                                         std::is_same_v<decltype(std::declval<T>() < std::declval<T>()), bool>>>
+    HALIDE_NEVER_INLINE void cmp(const T &a, const T &b) {
+        if (a < b) {
+            result = LessThan;
+        } else if (b < a) {
+            result = GreaterThan;
+        }
+    }
 
-IRComparer::CmpResult IRComparer::compare_expr(const Expr &a, const Expr &b) {
-    if (result != Equal) {
-        return result;
-    }
-
-    if (a.same_as(b)) {
-        result = Equal;
-        return result;
-    }
-
-    // Undefined values are equal to each other and less than defined values
-    if (!a.defined() && !b.defined()) {
-        result = Equal;
-        return result;
-    }
-
-    if (!a.defined()) {
-        result = LessThan;
-        return result;
-    }
-
-    if (!b.defined()) {
-        result = GreaterThan;
-        return result;
-    }
-
-    // If in the future we have hashes for Exprs, this is a good place
-    // to compare the hashes:
-    // if (compare_scalar(a.hash(), b.hash()) != Equal) {
-    //     return result;
-    // }
-
-    if (compare_scalar(a->node_type, b->node_type) != Equal) {
-        return result;
-    }
-
-    if (compare_types(a.type(), b.type()) != Equal) {
-        return result;
-    }
-
-    // Check the cache - perhaps these exprs have already been compared and found equal.
-    if (cache && cache->contains(a, b)) {
-        result = Equal;
-        return result;
-    }
-
-    expr = a;
-    b.accept(this);
-
-    if (cache && result == Equal) {
-        cache->insert(a, b);
-    }
-
-    return result;
-}
-
-IRComparer::CmpResult IRComparer::compare_stmt(const Stmt &a, const Stmt &b) {
-    if (result != Equal) {
-        return result;
-    }
-
-    if (a.same_as(b)) {
-        result = Equal;
-        return result;
-    }
-
-    if (!a.defined() && !b.defined()) {
-        result = Equal;
-        return result;
-    }
-
-    if (!a.defined()) {
-        result = LessThan;
-        return result;
-    }
-
-    if (!b.defined()) {
-        result = GreaterThan;
-        return result;
-    }
-
-    if (compare_scalar(a->node_type, b->node_type) != Equal) {
-        return result;
-    }
-
-    stmt = a;
-    b.accept(this);
-
-    return result;
-}
-
-IRComparer::CmpResult IRComparer::compare_types(Type a, Type b) {
-    if (result != Equal) {
-        return result;
-    }
-
-    compare_scalar(a.code(), b.code());
-    compare_scalar(a.bits(), b.bits());
-    compare_scalar(a.lanes(), b.lanes());
-
-    if (result != Equal) {
-        return result;
-    }
-
-    const halide_handle_cplusplus_type *ha = a.handle_type;
-    const halide_handle_cplusplus_type *hb = b.handle_type;
-
-    if (ha == hb) {
-        // Same handle type, or both not handles, or both void *
-        return result;
-    }
-
-    if (ha == nullptr) {
-        // void* < T*
-        result = LessThan;
-        return result;
-    }
-
-    if (hb == nullptr) {
-        // T* > void*
-        result = GreaterThan;
-        return result;
-    }
-
-    // They're both non-void handle types with distinct type info
-    // structs. We now need to distinguish between different C++
-    // pointer types (e.g. char * vs const float *). If would be nice
-    // if the structs were unique per C++ type. Then comparing the
-    // pointers above would be sufficient. Unfortunately, different
-    // shared libraries in the same process each create a distinct
-    // struct for the same type. We therefore have to do a deep
-    // comparison of the type info fields.
-
-    compare_scalar(ha->reference_type, hb->reference_type);
-    compare_names(ha->inner_name.name, hb->inner_name.name);
-    compare_scalar(ha->inner_name.cpp_type_type, hb->inner_name.cpp_type_type);
-    compare_scalar(ha->namespaces.size(), hb->namespaces.size());
-    compare_scalar(ha->enclosing_types.size(), hb->enclosing_types.size());
-    compare_scalar(ha->cpp_type_modifiers.size(), hb->cpp_type_modifiers.size());
-
-    if (result != Equal) {
-        return result;
-    }
-
-    for (size_t i = 0; i < ha->namespaces.size(); i++) {
-        compare_names(ha->namespaces[i], hb->namespaces[i]);
-    }
-
-    if (result != Equal) {
-        return result;
-    }
-
-    for (size_t i = 0; i < ha->enclosing_types.size(); i++) {
-        compare_scalar(ha->enclosing_types[i].cpp_type_type,
-                       hb->enclosing_types[i].cpp_type_type);
-        compare_names(ha->enclosing_types[i].name,
-                      hb->enclosing_types[i].name);
-    }
-
-    if (result != Equal) {
-        return result;
-    }
-
-    for (size_t i = 0; i < ha->cpp_type_modifiers.size(); i++) {
-        compare_scalar(ha->cpp_type_modifiers[i],
-                       hb->cpp_type_modifiers[i]);
-    }
-
-    return result;
-}
-
-IRComparer::CmpResult IRComparer::compare_names(const string &a, const string &b) {
-    if (result != Equal) {
-        return result;
-    }
-
-    int string_cmp = a.compare(b);
-    if (string_cmp < 0) {
-        result = LessThan;
-    } else if (string_cmp > 0) {
-        result = GreaterThan;
-    }
-
-    return result;
-}
-
+    CmpResult compare(const IRNode *root_a, const IRNode *root_b) {
+        constexpr size_t stack_size = 64;  // 1 kb
+        const IRNode *stack_storage[stack_size * 2];  // Intentionally uninitialized
+
+        stack_ptr = stack_storage;
+        stack_end = stack_storage + stack_size * 2;
+        result = Equal;
+
+        *stack_ptr++ = root_a;
+        *stack_ptr++ = root_b;
+
+        while (result == Equal && stack_ptr > stack_storage) {
+            stack_ptr -= 2;
+            next_a = stack_ptr[0];
+            next_b = stack_ptr[1];
+
+            if (next_a == next_b) {
+                continue;
+            }
+
+            if (cache_size > 0 && (((uintptr_t)next_a) & 1)) {
+                // If we are using a cache, we want to keep the nodes on the
+                // stack while processing their children, but mark them with a
+                // tombstone. We'll flip the low bit to 1 for our tombstone. We
+                // want to insert them into the cache when the tombstone is
+                // handled. This if statement triggers if we just hit a
+                // tombstone.
+                cache_insert((const IRNode *)((uintptr_t)next_a ^ 1), next_b);
+                continue;
+            }
+
+            cmp(next_a->node_type, next_b->node_type);
+            if (result != Equal) {
+                break;
+            }
+
+            if (next_a->node_type < IRNodeType::LetStmt) {
+                cmp(&BaseExprNode::type);
+            }
+
+            if (cache_size > 0) {
+                // Keep the parent nodes on the stack, but mark them with a
+                // tombstone bit.
+                stack_ptr[0] = (const IRNode *)(((uintptr_t)next_a) | 1);
+                stack_ptr += 2;
+            }
+
+            switch (next_a->node_type) {
+            case IRNodeType::IntImm:
+                cmp(&IntImm::value);
+                break;
+            case IRNodeType::UIntImm:
+                cmp(&UIntImm::value);
+                break;
+            case IRNodeType::FloatImm:
+                cmp(&FloatImm::value);
+                break;
+            case IRNodeType::StringImm:
+                cmp(&StringImm::value);
+                break;
+            case IRNodeType::Broadcast:
+                cmp(&Broadcast::value);
+                break;
+            case IRNodeType::Cast:
+                cmp(&Cast::value);
+                break;
+            case IRNodeType::Reinterpret:
+                cmp(&Reinterpret::value);
+                break;
+            case IRNodeType::Variable:
+                cmp(&Variable::name);
+                break;
+            case IRNodeType::Add:
+                cmp(&Add::a);
+                cmp(&Add::b);
+                break;
+            case IRNodeType::Sub:
+                cmp(&Sub::a);
+                cmp(&Sub::b);
+                break;
+            case IRNodeType::Mod:
+                cmp(&Mod::a);
+                cmp(&Mod::b);
+                break;
+            case IRNodeType::Mul:
+                cmp(&Mul::a);
+                cmp(&Mul::b);
+                break;
+            case IRNodeType::Div:
+                cmp(&Div::a);
+                cmp(&Div::b);
+                break;
+            case IRNodeType::Min:
+                cmp(&Min::a);
+                cmp(&Min::b);
+                break;
+            case IRNodeType::Max:
+                cmp(&Max::a);
+                cmp(&Max::b);
+                break;
+            case IRNodeType::EQ:
+                cmp(&EQ::a);
+                cmp(&EQ::b);
+                break;
+            case IRNodeType::NE:
+                cmp(&NE::a);
+                cmp(&NE::b);
+                break;
+            case IRNodeType::LT:
+                cmp(&LT::a);
+                cmp(&LT::b);
+                break;
+            case IRNodeType::LE:
+                cmp(&LE::a);
+                cmp(&LE::b);
+                break;
+            case IRNodeType::GT:
+                cmp(&GT::a);
+                cmp(&GT::b);
+                break;
+            case IRNodeType::GE:
+                cmp(&GE::a);
+                cmp(&GE::b);
+                break;
+            case IRNodeType::And:
+                cmp(&And::a);
+                cmp(&And::b);
+                break;
+            case IRNodeType::Or:
+                cmp(&Or::a);
+                cmp(&Or::b);
+                break;
+            case IRNodeType::Not:
+                cmp(&Not::a);
+                break;
+            case IRNodeType::Select:
+                cmp(&Select::condition);
+                cmp(&Select::true_value);
+                cmp(&Select::false_value);
+                break;
+            case IRNodeType::Load:
+                cmp(&Load::name);
+                cmp(&Load::alignment);
+                cmp(&Load::index);
+                cmp(&Load::predicate);
+                break;
+            case IRNodeType::Ramp:
+                cmp(&Ramp::stride);
+                cmp(&Ramp::base);
+                break;
+            case IRNodeType::Call:
+                cmp(&Call::name);
+                cmp(&Call::call_type);
+                cmp(&Call::value_index);
+                cmp(&Call::args);
+                break;
+            case IRNodeType::Let:
+                cmp(&Let::name);
+                cmp(&Let::value);
+                cmp(&Let::body);
+                break;
+            case IRNodeType::Shuffle:
+                cmp(&Shuffle::indices);
+                cmp(&Shuffle::vectors);
+                break;
+            case IRNodeType::VectorReduce:
+                cmp(&VectorReduce::op);
+                cmp(&VectorReduce::value);
+                break;
+            case IRNodeType::LetStmt:
+                cmp(&LetStmt::name);
+                cmp(&LetStmt::value);
+                cmp(&LetStmt::body);
+                break;
+            case IRNodeType::AssertStmt:
+                cmp(&AssertStmt::condition);
+                cmp(&AssertStmt::message);
+                break;
+            case IRNodeType::ProducerConsumer:
+                cmp(&ProducerConsumer::name);
+                cmp(&ProducerConsumer::is_producer);
+                cmp(&ProducerConsumer::body);
+                break;
+            case IRNodeType::For:
+                cmp(&For::name);
+                cmp(&For::for_type);
+                cmp(&For::device_api);
+                cmp(&For::partition_policy);
+                cmp(&For::min);
+                cmp(&For::extent);
+                cmp(&For::body);
+                break;
+            case IRNodeType::Acquire:
+                cmp(&Acquire::semaphore);
+                cmp(&Acquire::count);
+                cmp(&Acquire::body);
+                break;
+            case IRNodeType::Store:
+                cmp(&Store::name);
+                cmp(&Store::alignment);
+                cmp(&Store::predicate);
+                cmp(&Store::value);
+                cmp(&Store::index);
+                break;
+            case IRNodeType::Provide:
+                cmp(&Provide::name);
+                cmp(&Provide::args);
+                cmp(&Provide::values);
+                break;
+            case IRNodeType::Allocate:
+                cmp(&Allocate::name);
+                cmp(&Allocate::type);
+                cmp(&Allocate::free_function);
+                cmp_if_defined(&Allocate::new_expr);
+                cmp(&Allocate::condition);
+                cmp(&Allocate::extents);
+                cmp(&Allocate::body);
+                break;
+            case IRNodeType::Free:
+                cmp(&Free::name);
+                break;
+            case IRNodeType::Realize:
+                cmp(&Realize::name);
+                cmp(&Realize::types);
+                cmp(&Realize::bounds);
+                cmp(&Realize::body);
+                cmp(&Realize::condition);
+                break;
+            case IRNodeType::Block:
+                cmp(&Block::first);
+                cmp(&Block::rest);
+                break;
+            case IRNodeType::Fork:
+                cmp(&Fork::first);
+                cmp(&Fork::rest);
+                break;
+            case IRNodeType::IfThenElse:
+                cmp(&IfThenElse::condition);
+                cmp(&IfThenElse::then_case);
+                cmp_if_defined(&IfThenElse::else_case);
+                break;
+            case IRNodeType::Evaluate:
+                cmp(&Evaluate::value);
+                break;
+            case IRNodeType::Prefetch:
+                cmp(&Prefetch::name);
+                cmp(&Prefetch::types);
+                cmp(&Prefetch::prefetch);
+                cmp(&Prefetch::bounds);
+                cmp(&Prefetch::condition);
+                cmp(&Prefetch::body);
+                break;
+            case IRNodeType::Atomic:
+                cmp(&Atomic::producer_name);
+                cmp(&Atomic::mutex_name);
+                cmp(&Atomic::body);
+                break;
+            case IRNodeType::HoistedStorage:
+                cmp(&HoistedStorage::name);
+                cmp(&HoistedStorage::body);
+                break;
+            }
+        }
+
+        // Don't hold onto pointers to this stack frame.
+        stack_ptr = stack_end = nullptr;
+        return result;
+    }
+};
 
-IRComparer::CmpResult IRComparer::compare_expr_vector(const vector<Expr> &a, const vector<Expr> &b) {
-    if (result != Equal) {
-        return result;
-    }
-
-    compare_scalar(a.size(), b.size());
-    for (size_t i = 0; (i < a.size()) && result == Equal; i++) {
-        compare_expr(a[i], b[i]);
-    }
-
-    return result;
-}
-
-void IRComparer::visit(const IntImm *op) {
-    const IntImm *e = expr.as<IntImm>();
-    compare_scalar(e->value, op->value);
-}
-
-void IRComparer::visit(const UIntImm *op) {
-    const UIntImm *e = expr.as<UIntImm>();
-    compare_scalar(e->value, op->value);
-}
-
-void IRComparer::visit(const FloatImm *op) {
-    const FloatImm *e = expr.as<FloatImm>();
-    compare_scalar(e->value, op->value);
-}
-
-void IRComparer::visit(const StringImm *op) {
-    const StringImm *e = expr.as<StringImm>();
-    compare_names(e->value, op->value);
-}
-
-void IRComparer::visit(const Cast *op) {
-    compare_expr(expr.as<Cast>()->value, op->value);
-}
-
-void IRComparer::visit(const Reinterpret *op) {
-    compare_expr(expr.as<Reinterpret>()->value, op->value);
-}
-
-void IRComparer::visit(const Variable *op) {
-    const Variable *e = expr.as<Variable>();
-    compare_names(e->name, op->name);
-}
-
-namespace {
-template<typename T>
-void visit_binary_operator(IRComparer *cmp, const T *op, Expr expr) {
-    const T *e = expr.as<T>();
-    cmp->compare_expr(e->a, op->a);
-    cmp->compare_expr(e->b, op->b);
-}
-}  // namespace
-
-void IRComparer::visit(const Add *op) {
-    visit_binary_operator(this, op, expr);
-}
-void IRComparer::visit(const Sub *op) {
-    visit_binary_operator(this, op, expr);
-}
-void IRComparer::visit(const Mul *op) {
-    visit_binary_operator(this, op, expr);
-}
-void IRComparer::visit(const Div *op) {
-    visit_binary_operator(this, op, expr);
-}
-void IRComparer::visit(const Mod *op) {
-    visit_binary_operator(this, op, expr);
-}
-void IRComparer::visit(const Min *op) {
-    visit_binary_operator(this, op, expr);
-}
-void IRComparer::visit(const Max *op) {
-    visit_binary_operator(this, op, expr);
-}
-void IRComparer::visit(const EQ *op) {
-    visit_binary_operator(this, op, expr);
-}
-void IRComparer::visit(const NE *op) {
-    visit_binary_operator(this, op, expr);
-}
-void IRComparer::visit(const LT *op) {
-    visit_binary_operator(this, op, expr);
-}
-void IRComparer::visit(const LE *op) {
-    visit_binary_operator(this, op, expr);
-}
-void IRComparer::visit(const GT *op) {
-    visit_binary_operator(this, op, expr);
-}
-void IRComparer::visit(const GE *op) {
-    visit_binary_operator(this, op, expr);
-}
-void IRComparer::visit(const And *op) {
-    visit_binary_operator(this, op, expr);
-}
-void IRComparer::visit(const Or *op) {
-    visit_binary_operator(this, op, expr);
-}
-
-void IRComparer::visit(const Not *op) {
-    const Not *e = expr.as<Not>();
-    compare_expr(e->a, op->a);
-}
-
-void IRComparer::visit(const Select *op) {
-    const Select *e = expr.as<Select>();
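For intuition, here is a toy version (not Halide code) of the traversal pattern the new Comparer uses in place of the visitor above: child pairs go onto an explicit stack instead of the C++ call stack, so deeply nested expressions cannot overflow it:

#include <vector>

struct Node {
    int value;
    const Node *left = nullptr, *right = nullptr;
};

int compare(const Node *a, const Node *b) {
    std::vector<const Node *> stack = {a, b};
    while (!stack.empty()) {
        const Node *y = stack.back();
        stack.pop_back();
        const Node *x = stack.back();
        stack.pop_back();
        if (x == y) continue;              // same node, or both null
        if (!x || !y) return !x ? -1 : 1;  // null sorts first
        if (x->value != y->value) return x->value < y->value ? -1 : 1;
        stack.push_back(x->left);          // defer children instead of recursing
        stack.push_back(y->left);
        stack.push_back(x->right);
        stack.push_back(y->right);
    }
    return 0;
}

And a sketch of the user-facing behavior the rewritten file still has to provide (equal is the public helper declared in IREquality.h):

#include <cassert>
#include "Halide.h"
using namespace Halide;
using namespace Halide::Internal;

int main() {
    Var x("x");
    Expr a = (x + 1) * 2;
    Expr b = (x + 1) * 2;   // freshly built, structurally identical tree
    assert(!a.same_as(b));  // different nodes...
    assert(equal(a, b));    // ...but structurally equal
    return 0;
}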