Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 6 additions & 10 deletions python_bindings/src/halide/halide_/PyCallable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,13 +109,11 @@ class PyCallable {
} else {
argv[slot] = &scalar_storage[slot];

// clang-format off

#define HALIDE_HANDLE_TYPE_DISPATCH(CODE, BITS, TYPE, FIELD) \
case halide_type_t(CODE, BITS).as_u32(): \
scalar_storage[slot].u.FIELD = cast_to<TYPE>(value); \
cci[slot] = Callable::make_scalar_qcci(halide_type_t(CODE, BITS)); \
break;
#define HALIDE_HANDLE_TYPE_DISPATCH(CODE, BITS, TYPE, FIELD) \
case halide_type_t(CODE, BITS).as_u32(): \
scalar_storage[slot].u.FIELD = cast_to<TYPE>(value); \
cci[slot] = Callable::make_scalar_qcci(halide_type_t(CODE, BITS)); \
break;

switch (((halide_type_t)c_arg.type).element_of().as_u32()) {
HALIDE_HANDLE_TYPE_DISPATCH(halide_type_float, 32, float, f32)
Expand All @@ -134,9 +132,7 @@ class PyCallable {
_halide_user_assert(0) << "Unsupported type in Callable argument list: " << c_arg.type << "\n";
}

#undef HALIDE_HANDLE_TYPE_DISPATCH

// clang-format on
#undef HALIDE_HANDLE_TYPE_DISPATCH
}
};

Expand Down
8 changes: 2 additions & 6 deletions src/CodeGen_ARM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -363,7 +363,6 @@ CodeGen_ARM::CodeGen_ARM(const Target &target)
negations.emplace_back("saturating_negate", -max(wild_i8x_, -127));
negations.emplace_back("saturating_negate", -max(wild_i16x_, -32767));
negations.emplace_back("saturating_negate", -max(wild_i32x_, -(0x7fffffff)));
// clang-format on
}

constexpr int max_intrinsic_args = 4;
Expand Down Expand Up @@ -393,7 +392,6 @@ struct ArmIntrinsic {
};
};

// clang-format off
const ArmIntrinsic intrinsic_defs[] = {
// TODO(https://github.com/halide/Halide/issues/8093):
// Some of the Arm intrinsic have the same name between Neon and SVE2 but with different behavior. For example,
Expand All @@ -406,7 +404,7 @@ const ArmIntrinsic intrinsic_defs[] = {
{"vabs", "abs", UInt(32, 2), "abs", {Int(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveInactiveArg},
{"llvm.fabs", "llvm.fabs", Float(16, 4), "abs", {Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16 | ArmIntrinsic::SveNoPredicate},
{"llvm.fabs", "llvm.fabs", Float(32, 2), "abs", {Float(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveNoPredicate},
{"llvm.fabs", "llvm.fabs", Float(64, 2), "abs", {Float(64, 2)}, ArmIntrinsic::SveNoPredicate},
{"llvm.fabs", "llvm.fabs", Float(64, 2), "abs", {Float(64, 2)}, ArmIntrinsic::SveNoPredicate},
{"llvm.fabs.f16", "llvm.fabs.f16", Float(16), "abs", {Float(16)}, ArmIntrinsic::RequireFp16 | ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate},
{"llvm.fabs.f32", "llvm.fabs.f32", Float(32), "abs", {Float(32)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate},
{"llvm.fabs.f64", "llvm.fabs.f64", Float(64), "abs", {Float(64)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate},
Expand Down Expand Up @@ -870,7 +868,6 @@ const std::map<string, string> float16_transcendental_remapping = {
{"tan_f16", "tan_f32"},
{"tanh_f16", "tanh_f32"},
};
// clang-format on

llvm::Type *CodeGen_ARM::llvm_type_with_constraint(const Type &t, bool scalars_are_vectors,
VectorTypeConstraint constraint) {
Expand Down Expand Up @@ -2170,7 +2167,7 @@ bool CodeGen_ARM::codegen_dot_product_vector_reduce(const VectorReduce *op, cons
Target::Feature required_feature;
std::vector<int> extra_operands;
};
// clang-format off

static const Pattern patterns[] = {
{VectorReduce::Add, 4, i32(widening_mul(wild_i8x_, wild_i8x_)), "dot_product", Target::ARMDotProd},
{VectorReduce::Add, 4, i32(widening_mul(wild_u8x_, wild_u8x_)), "dot_product", Target::ARMDotProd},
Expand All @@ -2193,7 +2190,6 @@ bool CodeGen_ARM::codegen_dot_product_vector_reduce(const VectorReduce *op, cons
{VectorReduce::Add, 4, i64(wild_u16x_), "dot_product", Target::SVE2, {1}},
{VectorReduce::Add, 4, u64(wild_u16x_), "dot_product", Target::SVE2, {1}},
};
// clang-format on

int factor = op->value.type().lanes() / op->type.lanes();
vector<Expr> matches;
Expand Down
4 changes: 1 addition & 3 deletions src/CodeGen_Hexagon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -581,7 +581,6 @@ halide_type_t u8v2 = u8v1.with_lanes(u8v1.lanes * 2);
halide_type_t u16v2 = u16v1.with_lanes(u16v1.lanes * 2);
halide_type_t u32v2 = u32v1.with_lanes(u32v1.lanes * 2);

// clang-format off
#define INTRINSIC_128B(id) llvm::Intrinsic::hexagon_V6_##id##_128B
const HvxIntrinsic intrinsic_wrappers[] = {
// Zero/sign extension:
Expand Down Expand Up @@ -689,7 +688,7 @@ const HvxIntrinsic intrinsic_wrappers[] = {
{INTRINSIC_128B(vavghrnd), i16v1, "avg_rnd.vh.vh", {i16v1, i16v1}},
{INTRINSIC_128B(vavgwrnd), i32v1, "avg_rnd.vw.vw", {i32v1, i32v1}},

// This one is weird: i8_sat((u8 - u8)/2). It both saturates and averages.
// This one is weird: i8_sat((u8 - u8)/2). It both saturates and averages.
{INTRINSIC_128B(vnavgub), i8v1, "navg.vub.vub", {u8v1, u8v1}},
{INTRINSIC_128B(vnavgb), i8v1, "navg.vb.vb", {i8v1, i8v1}, HvxIntrinsic::v65OrLater},
{INTRINSIC_128B(vnavgh), i16v1, "navg.vh.vh", {i16v1, i16v1}},
Expand Down Expand Up @@ -841,7 +840,6 @@ const HvxIntrinsic intrinsic_wrappers[] = {
{INTRINSIC_128B(vnormamth), u16v1, "cls.vh", {u16v1}},
{INTRINSIC_128B(vnormamtw), u32v1, "cls.vw", {u32v1}},
};
// clang-format on

// TODO: Many variants of the above functions are missing. They
// need to be implemented in the runtime module, or via
Expand Down
2 changes: 0 additions & 2 deletions src/CodeGen_PTX_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -465,7 +465,6 @@ void CodeGen_PTX_Dev::codegen_vector_reduce(const VectorReduce *op, const Expr &
// TODO: Support rewriting to arbitrary calls in IRMatch and use that instead
// of expr_match here. That would probably allow avoiding the redundant swapping
// operands logic.
// clang-format off
static const Pattern patterns[] = {
{VectorReduce::Add, 4, i32(widening_mul(wild_i8x, wild_i8x)), "dp4a"},
{VectorReduce::Add, 4, i32(widening_mul(wild_i8x, wild_u8x)), "dp4a"},
Expand All @@ -480,7 +479,6 @@ void CodeGen_PTX_Dev::codegen_vector_reduce(const VectorReduce *op, const Expr &
{VectorReduce::Add, 4, widening_mul(wild_i16x, wild_u16x), "dp2a", Pattern::SwapOps | Pattern::NarrowOp1},
{VectorReduce::Add, 4, widening_mul(wild_u16x, wild_u16x), "dp2a", Pattern::SwapOps | Pattern::NarrowOp1},
};
// clang-format on

const int input_lanes = op->value.type().lanes();
const int factor = input_lanes / op->type.lanes();
Expand Down
2 changes: 0 additions & 2 deletions src/CodeGen_PowerPC.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ struct PowerPCIntrinsic {
Target::Feature feature = Target::FeatureEnd;
};

// clang-format off
const PowerPCIntrinsic intrinsic_defs[] = {
{"llvm.ppc.altivec.vminsb", Int(8, 16), "min", {Int(8, 16), Int(8, 16)}},
{"llvm.ppc.altivec.vminub", UInt(8, 16), "min", {UInt(8, 16), UInt(8, 16)}},
Expand Down Expand Up @@ -96,7 +95,6 @@ const PowerPCIntrinsic intrinsic_defs[] = {
{"llvm.ppc.altivec.vavgsw", Int(32, 4), "rounding_halving_add", {Int(32, 4), Int(32, 4)}},
{"llvm.ppc.altivec.vavguw", UInt(32, 4), "rounding_halving_add", {UInt(32, 4), UInt(32, 4)}},
};
// clang-format on

void CodeGen_PowerPC::init_module() {
CodeGen_Posix::init_module();
Expand Down
9 changes: 1 addition & 8 deletions src/CodeGen_WebAssembly.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ struct WasmIntrinsic {
Target::Feature feature = Target::FeatureEnd;
};

// clang-format off
const WasmIntrinsic intrinsic_defs[] = {
{"llvm.sadd.sat.v8i16", Int(16, 8), "saturating_add", {Int(16, 8), Int(16, 8)}, Target::WasmSimd128},
{"llvm.uadd.sat.v8i16", UInt(16, 8), "saturating_add", {UInt(16, 8), UInt(16, 8)}, Target::WasmSimd128},
Expand Down Expand Up @@ -111,7 +110,6 @@ const WasmIntrinsic intrinsic_defs[] = {
{"llvm.nearbyint.f32", Float(32), "nearbyint", {Float(32)}},
{"llvm.nearbyint.f64", Float(64), "nearbyint", {Float(64)}},
};
// clang-format on

void CodeGen_WebAssembly::init_module() {
CodeGen_Posix::init_module();
Expand Down Expand Up @@ -144,7 +142,6 @@ void CodeGen_WebAssembly::visit(const Cast *op) {
Target::Feature required_feature;
};

// clang-format off
static const Pattern patterns[] = {
{"int_to_double", f64(wild_i32x_), Target::WasmSimd128},
{"int_to_double", f64(wild_u32x_), Target::WasmSimd128},
Expand All @@ -155,7 +152,6 @@ void CodeGen_WebAssembly::visit(const Cast *op) {
{"widen_integer", i64(wild_i32x_), Target::WasmSimd128},
{"widen_integer", u64(wild_u32x_), Target::WasmSimd128},
};
// clang-format on

if (op->type.is_vector()) {
std::vector<Expr> matches;
Expand Down Expand Up @@ -193,7 +189,6 @@ void CodeGen_WebAssembly::visit(const Call *op) {
Target::Feature required_feature;
};

// clang-format off
static const Pattern patterns[] = {
{"q15mulr_sat_s", rounding_mul_shift_right(wild_i16x_, wild_i16x_, 15), Target::WasmSimd128},
{"saturating_narrow", i8_sat(wild_i16x_), Target::WasmSimd128},
Expand All @@ -213,7 +208,6 @@ void CodeGen_WebAssembly::visit(const Call *op) {
{u8_sat(wild_i32x_), u8_sat(i16_sat(wild_i32x_))},
{i8_sat(wild_i32x_), i8_sat(i16_sat(wild_i32x_))},
};
// clang-format on

if (op->type.is_vector()) {
std::vector<Expr> matches;
Expand Down Expand Up @@ -287,7 +281,7 @@ void CodeGen_WebAssembly::codegen_vector_reduce(const VectorReduce *op, const Ex
const char *intrin;
Target::Feature required_feature;
};
// clang-format off

static const Pattern patterns[] = {
{VectorReduce::Add, 2, i16(wild_i8x_), "pairwise_widening_add", Target::WasmSimd128},
{VectorReduce::Add, 2, u16(wild_u8x_), "pairwise_widening_add", Target::WasmSimd128},
Expand All @@ -299,7 +293,6 @@ void CodeGen_WebAssembly::codegen_vector_reduce(const VectorReduce *op, const Ex

{VectorReduce::Add, 2, i32(widening_mul(wild_i16x_, wild_i16x_)), "dot_product", Target::WasmSimd128},
};
// clang-format on

// Other values will be added soon, so this switch isn't actually pointless
using ValuePtr = llvm::Value *;
Expand Down
15 changes: 5 additions & 10 deletions src/CodeGen_X86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,6 @@ struct x86Intrinsic {
};
};

// clang-format off
const x86Intrinsic intrinsic_defs[] = {
// AVX2/SSSE3 LLVM intrinsics for pabs fail in JIT. The integer wrappers
// just call `llvm.abs` (which requires a second argument).
Expand Down Expand Up @@ -295,17 +294,16 @@ const x86Intrinsic intrinsic_defs[] = {
{"tileloadd64_i8", Int(8, 1024), "tile_load", {Int(16), Int(16), Handle(), Int(64), Int(64)}, Target::AVX512_SapphireRapids, x86Intrinsic::AccessesMemory},
{"tileloadd64_i8", UInt(8, 1024), "tile_load", {Int(16), Int(16), Handle(), Int(64), Int(64)}, Target::AVX512_SapphireRapids, x86Intrinsic::AccessesMemory},
{"tileloadd64_bf16", BFloat(16, 512), "tile_load", {Int(16), Int(16), Handle(), Int(64), Int(64)}, Target::AVX512_SapphireRapids, x86Intrinsic::AccessesMemory},
{"tdpbssd", Int(32, 256), "tile_matmul", {Int(16), Int(16), Int(16), Int(32, 256), Int(8, 1024), Int(8, 1024)}, Target::AVX512_SapphireRapids},
{"tdpbssd", Int(32, 256), "tile_matmul", {Int(16), Int(16), Int(16), Int(32, 256), Int(8, 1024), Int(8, 1024)}, Target::AVX512_SapphireRapids},
{"tdpbsud", Int(32, 256), "tile_matmul", {Int(16), Int(16), Int(16), Int(32, 256), Int(8, 1024), UInt(8, 1024)}, Target::AVX512_SapphireRapids},
{"tdpbusd", Int(32, 256), "tile_matmul", {Int(16), Int(16), Int(16), Int(32, 256), UInt(8, 1024), Int(8, 1024)}, Target::AVX512_SapphireRapids},
{"tdpbuud", Int(32, 256), "tile_matmul", {Int(16), Int(16), Int(16), Int(32, 256), UInt(8, 1024), UInt(8, 1024)}, Target::AVX512_SapphireRapids},
{"tdpbf16ps", Float(32, 256), "tile_matmul", {Int(16), Int(16), Int(16), Float(32, 256), BFloat(16, 512), BFloat(16, 512)}, Target::AVX512_SapphireRapids},
{"tilezero_i32", Int(32, 256), "tile_zero", {Int(16), Int(16)}, Target::AVX512_SapphireRapids},
{"tilezero_i32", Int(32, 256), "tile_zero", {Int(16), Int(16)}, Target::AVX512_SapphireRapids},
{"tilezero_f32", Float(32, 256), "tile_zero", {Int(16), Int(16)}, Target::AVX512_SapphireRapids},
{"tilestored64_i32", Int(32), "tile_store", {Int(16), Int(16), Handle(), Int(64), Int(64), Int(32, 256)}, Target::AVX512_SapphireRapids, x86Intrinsic::AccessesMemory},
{"tilestored64_f32", Int(32), "tile_store", {Int(16), Int(16), Handle(), Int(64), Int(64), Float(32, 256)}, Target::AVX512_SapphireRapids, x86Intrinsic::AccessesMemory},
};
// clang-format on

void CodeGen_X86::init_module() {
CodeGen_Posix::init_module();
Expand Down Expand Up @@ -549,15 +547,13 @@ void CodeGen_X86::visit(const Cast *op) {
Expr pattern;
};

// clang-format off
static Pattern patterns[] = {
// This isn't rounding_mul_shift_right(i16, i16, 15) because it doesn't
// saturate the result.
{"pmulhrs", i16(rounding_shift_right(widening_mul(wild_i16x_, wild_i16x_), 15))},

{"f32_to_bf16", bf16(wild_f32x_)},
};
// clang-format on

vector<Expr> matches;
for (const Pattern &p : patterns) {
Expand Down Expand Up @@ -783,7 +779,7 @@ void CodeGen_X86::codegen_vector_reduce(const VectorReduce *op, const Expr &init
SingleArg = 1 << 2,
};
};
// clang-format off

// These patterns are roughly sorted "best to worst", in case there are two
// patterns that match the expression.
static const Pattern patterns[] = {
Expand Down Expand Up @@ -819,7 +815,6 @@ void CodeGen_X86::codegen_vector_reduce(const VectorReduce *op, const Expr &init
{VectorReduce::Add, 8, u64(absd(wild_u8x_, wild_u8x_)), "sum_of_absolute_differences", {}},

};
// clang-format on

std::vector<Expr> matches;
for (const Pattern &p : patterns) {
Expand Down Expand Up @@ -1151,11 +1146,11 @@ int CodeGen_X86::vector_lanes_for_slice(const Type &t) const {
// type if we can.
int vec_bits = t.lanes() * t.bits();
int natural_vec_bits = target.natural_vector_size(t) * t.bits();
// clang-format off

int slice_bits = ((vec_bits > 256 && natural_vec_bits > 256) ? 512 :
(vec_bits > 128 && natural_vec_bits > 128) ? 256 :
128);
// clang-format on

return slice_bits / t.bits();
}

Expand Down
3 changes: 1 addition & 2 deletions src/Debug.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,10 @@ bool debug_is_active_impl(int verbosity, const char *file, const char *function,
* is determined by the value of the environment variable
* HL_DEBUG_CODEGEN
*/
// clang-format off

#define debug(n) \
/* NOLINTNEXTLINE(bugprone-macro-parentheses) */ \
if (debug_is_active((n))) std::cerr
// clang-format on

/** Allow easily printing the contents of containers, or std::vector-like containers,
* in debug output. Used like so:
Expand Down
6 changes: 2 additions & 4 deletions src/FindIntrinsics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -734,7 +734,7 @@ class FindIntrinsics : public IRMutator {
// We can't do everything we want here with rewrite rules alone. So, we rewrite them
// to rounding_shifts with the widening still in place, and narrow it after the rewrite
// succeeds.
// clang-format off

if (rewrite(max(min(rounding_shift_right(x, y), upper), lower), rounding_shift_right(x, y), is_x_wide_int_or_uint) ||
rewrite(rounding_shift_right(x, y), rounding_shift_right(x, y), is_x_wide_int_or_uint) ||
rewrite(rounding_shift_left(x, y), rounding_shift_left(x, y), is_x_wide_int_or_uint) ||
Expand All @@ -759,7 +759,6 @@ class FindIntrinsics : public IRMutator {
}
}
}
// clang-format on
}

if (value.same_as(op->value)) {
Expand Down Expand Up @@ -892,7 +891,7 @@ class FindIntrinsics : public IRMutator {
}

if (no_overflow(op->type)) {
// clang-format off

if (rewrite(halving_add(x + y, 1), rounding_halving_add(x, y)) ||
rewrite(halving_add(x, y + 1), rounding_halving_add(x, y)) ||
rewrite(halving_add(x + 1, y), rounding_halving_add(x, y)) ||
Expand All @@ -903,7 +902,6 @@ class FindIntrinsics : public IRMutator {
false) {
return mutate(rewrite.result);
}
// clang-format on
}

// Move widening casts inside widening arithmetic outside the arithmetic,
Expand Down
3 changes: 1 addition & 2 deletions src/HexagonOptimize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1184,7 +1184,7 @@ class VectorReducePatterns : public IRMutator {
// Map of instruction signatures
static const vector<Signature> sigs = ([&]() HALIDE_NEVER_INLINE {
return vector<Signature>{
// clang-format off

// --------- vrmpy ---------
// Sliding window
{4, 32, widening_mul(wild_u8x, wild_u8x), Signature::SlidingWindow | Signature::ScalarB},
Expand Down Expand Up @@ -1239,7 +1239,6 @@ class VectorReducePatterns : public IRMutator {
{2, 16, wild_u8x},
{2, 32, wild_i16x},
};
// clang-format on
})();

std::vector<Expr> matches;
Expand Down
Loading
Loading