halide · abadams · Nov 24, 2025 · Nov 18, 2025 · Nov 19, 2025 · Nov 19, 2025
diff --git a/python_bindings/src/halide/halide_/PyCallable.cpp b/python_bindings/src/halide/halide_/PyCallable.cpp
@@ -109,13 +109,11 @@ class PyCallable {
             } else {
                 argv[slot] = &scalar_storage[slot];
 
-                // clang-format off
-
-                #define HALIDE_HANDLE_TYPE_DISPATCH(CODE, BITS, TYPE, FIELD)                \
-                    case halide_type_t(CODE, BITS).as_u32():                                \
-                        scalar_storage[slot].u.FIELD = cast_to<TYPE>(value);                \
-                        cci[slot] = Callable::make_scalar_qcci(halide_type_t(CODE, BITS));   \
-                        break;
+#define HALIDE_HANDLE_TYPE_DISPATCH(CODE, BITS, TYPE, FIELD)               \
+    case halide_type_t(CODE, BITS).as_u32():                               \
+        scalar_storage[slot].u.FIELD = cast_to<TYPE>(value);               \
+        cci[slot] = Callable::make_scalar_qcci(halide_type_t(CODE, BITS)); \
+        break;
 
                 switch (((halide_type_t)c_arg.type).element_of().as_u32()) {
                     HALIDE_HANDLE_TYPE_DISPATCH(halide_type_float, 32, float, f32)
@@ -134,9 +132,7 @@ class PyCallable {
                     _halide_user_assert(0) << "Unsupported type in Callable argument list: " << c_arg.type << "\n";
                 }
 
-                #undef HALIDE_HANDLE_TYPE_DISPATCH
-
-                // clang-format on
+#undef HALIDE_HANDLE_TYPE_DISPATCH
             }
         };
 

diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp
@@ -363,7 +363,6 @@ CodeGen_ARM::CodeGen_ARM(const Target &target)
     negations.emplace_back("saturating_negate", -max(wild_i8x_, -127));
     negations.emplace_back("saturating_negate", -max(wild_i16x_, -32767));
     negations.emplace_back("saturating_negate", -max(wild_i32x_, -(0x7fffffff)));
-    // clang-format on
 }
 
 constexpr int max_intrinsic_args = 4;
@@ -393,7 +392,6 @@ struct ArmIntrinsic {
     };
 };
 
-// clang-format off
 const ArmIntrinsic intrinsic_defs[] = {
     // TODO(https://github.com/halide/Halide/issues/8093):
     // Some of the Arm intrinsic have the same name between Neon and SVE2 but with different behavior. For example,
@@ -406,7 +404,7 @@ const ArmIntrinsic intrinsic_defs[] = {
     {"vabs", "abs", UInt(32, 2), "abs", {Int(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveInactiveArg},
     {"llvm.fabs", "llvm.fabs", Float(16, 4), "abs", {Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16 | ArmIntrinsic::SveNoPredicate},
     {"llvm.fabs", "llvm.fabs", Float(32, 2), "abs", {Float(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveNoPredicate},
-    {"llvm.fabs", "llvm.fabs", Float(64, 2), "abs", {Float(64, 2)},  ArmIntrinsic::SveNoPredicate},
+    {"llvm.fabs", "llvm.fabs", Float(64, 2), "abs", {Float(64, 2)}, ArmIntrinsic::SveNoPredicate},
     {"llvm.fabs.f16", "llvm.fabs.f16", Float(16), "abs", {Float(16)}, ArmIntrinsic::RequireFp16 | ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate},
     {"llvm.fabs.f32", "llvm.fabs.f32", Float(32), "abs", {Float(32)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate},
     {"llvm.fabs.f64", "llvm.fabs.f64", Float(64), "abs", {Float(64)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate},
@@ -870,7 +868,6 @@ const std::map<string, string> float16_transcendental_remapping = {
     {"tan_f16", "tan_f32"},
     {"tanh_f16", "tanh_f32"},
 };
-// clang-format on
 
 llvm::Type *CodeGen_ARM::llvm_type_with_constraint(const Type &t, bool scalars_are_vectors,
                                                    VectorTypeConstraint constraint) {
@@ -2170,7 +2167,7 @@ bool CodeGen_ARM::codegen_dot_product_vector_reduce(const VectorReduce *op, cons
         Target::Feature required_feature;
         std::vector<int> extra_operands;
     };
-    // clang-format off
+
     static const Pattern patterns[] = {
         {VectorReduce::Add, 4, i32(widening_mul(wild_i8x_, wild_i8x_)), "dot_product", Target::ARMDotProd},
         {VectorReduce::Add, 4, i32(widening_mul(wild_u8x_, wild_u8x_)), "dot_product", Target::ARMDotProd},
@@ -2193,7 +2190,6 @@ bool CodeGen_ARM::codegen_dot_product_vector_reduce(const VectorReduce *op, cons
         {VectorReduce::Add, 4, i64(wild_u16x_), "dot_product", Target::SVE2, {1}},
         {VectorReduce::Add, 4, u64(wild_u16x_), "dot_product", Target::SVE2, {1}},
     };
-    // clang-format on
 
     int factor = op->value.type().lanes() / op->type.lanes();
     vector<Expr> matches;

diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp
@@ -581,7 +581,6 @@ halide_type_t u8v2 = u8v1.with_lanes(u8v1.lanes * 2);
 halide_type_t u16v2 = u16v1.with_lanes(u16v1.lanes * 2);
 halide_type_t u32v2 = u32v1.with_lanes(u32v1.lanes * 2);
 
-// clang-format off
 #define INTRINSIC_128B(id) llvm::Intrinsic::hexagon_V6_##id##_128B
 const HvxIntrinsic intrinsic_wrappers[] = {
     // Zero/sign extension:
@@ -689,7 +688,7 @@ const HvxIntrinsic intrinsic_wrappers[] = {
     {INTRINSIC_128B(vavghrnd), i16v1, "avg_rnd.vh.vh", {i16v1, i16v1}},
     {INTRINSIC_128B(vavgwrnd), i32v1, "avg_rnd.vw.vw", {i32v1, i32v1}},
 
-     // This one is weird: i8_sat((u8 - u8)/2). It both saturates and averages.
+    // This one is weird: i8_sat((u8 - u8)/2). It both saturates and averages.
     {INTRINSIC_128B(vnavgub), i8v1, "navg.vub.vub", {u8v1, u8v1}},
     {INTRINSIC_128B(vnavgb), i8v1, "navg.vb.vb", {i8v1, i8v1}, HvxIntrinsic::v65OrLater},
     {INTRINSIC_128B(vnavgh), i16v1, "navg.vh.vh", {i16v1, i16v1}},
@@ -841,7 +840,6 @@ const HvxIntrinsic intrinsic_wrappers[] = {
     {INTRINSIC_128B(vnormamth), u16v1, "cls.vh", {u16v1}},
     {INTRINSIC_128B(vnormamtw), u32v1, "cls.vw", {u32v1}},
 };
-// clang-format on
 
 // TODO: Many variants of the above functions are missing. They
 // need to be implemented in the runtime module, or via

diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp
@@ -465,7 +465,6 @@ void CodeGen_PTX_Dev::codegen_vector_reduce(const VectorReduce *op, const Expr &
     // TODO: Support rewriting to arbitrary calls in IRMatch and use that instead
     // of expr_match here. That would probably allow avoiding the redundant swapping
     // operands logic.
-    // clang-format off
     static const Pattern patterns[] = {
         {VectorReduce::Add, 4, i32(widening_mul(wild_i8x, wild_i8x)), "dp4a"},
         {VectorReduce::Add, 4, i32(widening_mul(wild_i8x, wild_u8x)), "dp4a"},
@@ -480,7 +479,6 @@ void CodeGen_PTX_Dev::codegen_vector_reduce(const VectorReduce *op, const Expr &
         {VectorReduce::Add, 4, widening_mul(wild_i16x, wild_u16x), "dp2a", Pattern::SwapOps | Pattern::NarrowOp1},
         {VectorReduce::Add, 4, widening_mul(wild_u16x, wild_u16x), "dp2a", Pattern::SwapOps | Pattern::NarrowOp1},
     };
-    // clang-format on
 
     const int input_lanes = op->value.type().lanes();
     const int factor = input_lanes / op->type.lanes();

diff --git a/src/CodeGen_PowerPC.cpp b/src/CodeGen_PowerPC.cpp
@@ -51,7 +51,6 @@ struct PowerPCIntrinsic {
     Target::Feature feature = Target::FeatureEnd;
 };
 
-// clang-format off
 const PowerPCIntrinsic intrinsic_defs[] = {
     {"llvm.ppc.altivec.vminsb", Int(8, 16), "min", {Int(8, 16), Int(8, 16)}},
     {"llvm.ppc.altivec.vminub", UInt(8, 16), "min", {UInt(8, 16), UInt(8, 16)}},
@@ -96,7 +95,6 @@ const PowerPCIntrinsic intrinsic_defs[] = {
     {"llvm.ppc.altivec.vavgsw", Int(32, 4), "rounding_halving_add", {Int(32, 4), Int(32, 4)}},
     {"llvm.ppc.altivec.vavguw", UInt(32, 4), "rounding_halving_add", {UInt(32, 4), UInt(32, 4)}},
 };
-// clang-format on
 
 void CodeGen_PowerPC::init_module() {
     CodeGen_Posix::init_module();

diff --git a/src/CodeGen_WebAssembly.cpp b/src/CodeGen_WebAssembly.cpp
@@ -58,7 +58,6 @@ struct WasmIntrinsic {
     Target::Feature feature = Target::FeatureEnd;
 };
 
-// clang-format off
 const WasmIntrinsic intrinsic_defs[] = {
     {"llvm.sadd.sat.v8i16", Int(16, 8), "saturating_add", {Int(16, 8), Int(16, 8)}, Target::WasmSimd128},
     {"llvm.uadd.sat.v8i16", UInt(16, 8), "saturating_add", {UInt(16, 8), UInt(16, 8)}, Target::WasmSimd128},
@@ -111,7 +110,6 @@ const WasmIntrinsic intrinsic_defs[] = {
     {"llvm.nearbyint.f32", Float(32), "nearbyint", {Float(32)}},
     {"llvm.nearbyint.f64", Float(64), "nearbyint", {Float(64)}},
 };
-// clang-format on
 
 void CodeGen_WebAssembly::init_module() {
     CodeGen_Posix::init_module();
@@ -144,7 +142,6 @@ void CodeGen_WebAssembly::visit(const Cast *op) {
         Target::Feature required_feature;
     };
 
-    // clang-format off
     static const Pattern patterns[] = {
         {"int_to_double", f64(wild_i32x_), Target::WasmSimd128},
         {"int_to_double", f64(wild_u32x_), Target::WasmSimd128},
@@ -155,7 +152,6 @@ void CodeGen_WebAssembly::visit(const Cast *op) {
         {"widen_integer", i64(wild_i32x_), Target::WasmSimd128},
         {"widen_integer", u64(wild_u32x_), Target::WasmSimd128},
     };
-    // clang-format on
 
     if (op->type.is_vector()) {
         std::vector<Expr> matches;
@@ -193,7 +189,6 @@ void CodeGen_WebAssembly::visit(const Call *op) {
         Target::Feature required_feature;
     };
 
-    // clang-format off
     static const Pattern patterns[] = {
         {"q15mulr_sat_s", rounding_mul_shift_right(wild_i16x_, wild_i16x_, 15), Target::WasmSimd128},
         {"saturating_narrow", i8_sat(wild_i16x_), Target::WasmSimd128},
@@ -213,7 +208,6 @@ void CodeGen_WebAssembly::visit(const Call *op) {
         {u8_sat(wild_i32x_), u8_sat(i16_sat(wild_i32x_))},
         {i8_sat(wild_i32x_), i8_sat(i16_sat(wild_i32x_))},
     };
-    // clang-format on
 
     if (op->type.is_vector()) {
         std::vector<Expr> matches;
@@ -287,7 +281,7 @@ void CodeGen_WebAssembly::codegen_vector_reduce(const VectorReduce *op, const Ex
         const char *intrin;
         Target::Feature required_feature;
     };
-    // clang-format off
+
     static const Pattern patterns[] = {
         {VectorReduce::Add, 2, i16(wild_i8x_), "pairwise_widening_add", Target::WasmSimd128},
         {VectorReduce::Add, 2, u16(wild_u8x_), "pairwise_widening_add", Target::WasmSimd128},
@@ -299,7 +293,6 @@ void CodeGen_WebAssembly::codegen_vector_reduce(const VectorReduce *op, const Ex
 
         {VectorReduce::Add, 2, i32(widening_mul(wild_i16x_, wild_i16x_)), "dot_product", Target::WasmSimd128},
     };
-    // clang-format on
 
     // Other values will be added soon, so this switch isn't actually pointless
     using ValuePtr = llvm::Value *;

diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp
@@ -133,7 +133,6 @@ struct x86Intrinsic {
     };
 };
 
-// clang-format off
 const x86Intrinsic intrinsic_defs[] = {
     // AVX2/SSSE3 LLVM intrinsics for pabs fail in JIT. The integer wrappers
     // just call `llvm.abs` (which requires a second argument).
@@ -295,17 +294,16 @@ const x86Intrinsic intrinsic_defs[] = {
     {"tileloadd64_i8", Int(8, 1024), "tile_load", {Int(16), Int(16), Handle(), Int(64), Int(64)}, Target::AVX512_SapphireRapids, x86Intrinsic::AccessesMemory},
     {"tileloadd64_i8", UInt(8, 1024), "tile_load", {Int(16), Int(16), Handle(), Int(64), Int(64)}, Target::AVX512_SapphireRapids, x86Intrinsic::AccessesMemory},
     {"tileloadd64_bf16", BFloat(16, 512), "tile_load", {Int(16), Int(16), Handle(), Int(64), Int(64)}, Target::AVX512_SapphireRapids, x86Intrinsic::AccessesMemory},
-    {"tdpbssd", Int(32, 256), "tile_matmul", {Int(16), Int(16), Int(16), Int(32, 256), Int(8, 1024), Int(8, 1024)},  Target::AVX512_SapphireRapids},
+    {"tdpbssd", Int(32, 256), "tile_matmul", {Int(16), Int(16), Int(16), Int(32, 256), Int(8, 1024), Int(8, 1024)}, Target::AVX512_SapphireRapids},
     {"tdpbsud", Int(32, 256), "tile_matmul", {Int(16), Int(16), Int(16), Int(32, 256), Int(8, 1024), UInt(8, 1024)}, Target::AVX512_SapphireRapids},
     {"tdpbusd", Int(32, 256), "tile_matmul", {Int(16), Int(16), Int(16), Int(32, 256), UInt(8, 1024), Int(8, 1024)}, Target::AVX512_SapphireRapids},
     {"tdpbuud", Int(32, 256), "tile_matmul", {Int(16), Int(16), Int(16), Int(32, 256), UInt(8, 1024), UInt(8, 1024)}, Target::AVX512_SapphireRapids},
     {"tdpbf16ps", Float(32, 256), "tile_matmul", {Int(16), Int(16), Int(16), Float(32, 256), BFloat(16, 512), BFloat(16, 512)}, Target::AVX512_SapphireRapids},
-    {"tilezero_i32", Int(32, 256), "tile_zero", {Int(16), Int(16)},  Target::AVX512_SapphireRapids},
+    {"tilezero_i32", Int(32, 256), "tile_zero", {Int(16), Int(16)}, Target::AVX512_SapphireRapids},
     {"tilezero_f32", Float(32, 256), "tile_zero", {Int(16), Int(16)}, Target::AVX512_SapphireRapids},
     {"tilestored64_i32", Int(32), "tile_store", {Int(16), Int(16), Handle(), Int(64), Int(64), Int(32, 256)}, Target::AVX512_SapphireRapids, x86Intrinsic::AccessesMemory},
     {"tilestored64_f32", Int(32), "tile_store", {Int(16), Int(16), Handle(), Int(64), Int(64), Float(32, 256)}, Target::AVX512_SapphireRapids, x86Intrinsic::AccessesMemory},
 };
-// clang-format on
 
 void CodeGen_X86::init_module() {
     CodeGen_Posix::init_module();
@@ -549,15 +547,13 @@ void CodeGen_X86::visit(const Cast *op) {
         Expr pattern;
     };
 
-    // clang-format off
     static Pattern patterns[] = {
         // This isn't rounding_mul_shift_right(i16, i16, 15) because it doesn't
         // saturate the result.
         {"pmulhrs", i16(rounding_shift_right(widening_mul(wild_i16x_, wild_i16x_), 15))},
 
         {"f32_to_bf16", bf16(wild_f32x_)},
     };
-    // clang-format on
 
     vector<Expr> matches;
     for (const Pattern &p : patterns) {
@@ -783,7 +779,7 @@ void CodeGen_X86::codegen_vector_reduce(const VectorReduce *op, const Expr &init
             SingleArg = 1 << 2,
         };
     };
-    // clang-format off
+
     // These patterns are roughly sorted "best to worst", in case there are two
     // patterns that match the expression.
     static const Pattern patterns[] = {
@@ -819,7 +815,6 @@ void CodeGen_X86::codegen_vector_reduce(const VectorReduce *op, const Expr &init
         {VectorReduce::Add, 8, u64(absd(wild_u8x_, wild_u8x_)), "sum_of_absolute_differences", {}},
 
     };
-    // clang-format on
 
     std::vector<Expr> matches;
     for (const Pattern &p : patterns) {
@@ -1151,11 +1146,11 @@ int CodeGen_X86::vector_lanes_for_slice(const Type &t) const {
     // type if we can.
     int vec_bits = t.lanes() * t.bits();
     int natural_vec_bits = target.natural_vector_size(t) * t.bits();
-    // clang-format off
+
     int slice_bits = ((vec_bits > 256 && natural_vec_bits > 256) ? 512 :
                       (vec_bits > 128 && natural_vec_bits > 128) ? 256 :
                                                                    128);
-    // clang-format on
+
     return slice_bits / t.bits();
 }
 

diff --git a/src/Debug.h b/src/Debug.h
@@ -48,11 +48,9 @@ bool debug_is_active_impl(int verbosity, const char *file, const char *function,
  * is determined by the value of the environment variable
  * HL_DEBUG_CODEGEN
  */
-// clang-format off
 #define debug(n)                                     \
     /* NOLINTNEXTLINE(bugprone-macro-parentheses) */ \
     if (debug_is_active((n))) std::cerr
-// clang-format on
 
 /** Allow easily printing the contents of containers, or std::vector-like containers,
  *  in debug output. Used like so:

diff --git a/src/FindIntrinsics.cpp b/src/FindIntrinsics.cpp
@@ -734,7 +734,6 @@ class FindIntrinsics : public IRMutator {
             // We can't do everything we want here with rewrite rules alone. So, we rewrite them
             // to rounding_shifts with the widening still in place, and narrow it after the rewrite
             // succeeds.
-            // clang-format off
             if (rewrite(max(min(rounding_shift_right(x, y), upper), lower), rounding_shift_right(x, y), is_x_wide_int_or_uint) ||
                 rewrite(rounding_shift_right(x, y), rounding_shift_right(x, y), is_x_wide_int_or_uint) ||
                 rewrite(rounding_shift_left(x, y), rounding_shift_left(x, y), is_x_wide_int_or_uint) ||
@@ -759,7 +758,6 @@ class FindIntrinsics : public IRMutator {
                     }
                 }
             }
-            // clang-format on
         }
 
         if (value.same_as(op->value)) {
@@ -892,7 +890,6 @@ class FindIntrinsics : public IRMutator {
         }
 
         if (no_overflow(op->type)) {
-            // clang-format off
             if (rewrite(halving_add(x + y, 1), rounding_halving_add(x, y)) ||
                 rewrite(halving_add(x, y + 1), rounding_halving_add(x, y)) ||
                 rewrite(halving_add(x + 1, y), rounding_halving_add(x, y)) ||
@@ -903,7 +900,6 @@ class FindIntrinsics : public IRMutator {
                 false) {
                 return mutate(rewrite.result);
             }
-            // clang-format on
         }
 
         // Move widening casts inside widening arithmetic outside the arithmetic,

diff --git a/src/HexagonOptimize.cpp b/src/HexagonOptimize.cpp
@@ -1184,7 +1184,7 @@ class VectorReducePatterns : public IRMutator {
         // Map of instruction signatures
         static const vector<Signature> sigs = ([&]() HALIDE_NEVER_INLINE {
             return vector<Signature>{
-                // clang-format off
+
                 // --------- vrmpy ---------
                 // Sliding window
                 {4, 32, widening_mul(wild_u8x, wild_u8x), Signature::SlidingWindow | Signature::ScalarB},
@@ -1239,7 +1239,6 @@ class VectorReducePatterns : public IRMutator {
                 {2, 16, wild_u8x},
                 {2, 32, wild_i16x},
             };
-            // clang-format on
         })();
 
         std::vector<Expr> matches;