From 0887efbff264e5bcce4ba7a345f8fe26f100a79c Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 26 Oct 2022 15:39:09 -0700 Subject: [PATCH 01/13] clean-up abs and saturating_pmulhrs, fix AVX512 saturating_ ops --- src/CodeGen_X86.cpp | 53 +++++++++++++++++++++++++++++++++------- src/runtime/x86_avx2.ll | 31 ----------------------- src/runtime/x86_sse41.ll | 31 ----------------------- 3 files changed, 44 insertions(+), 71 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index a2393de008ce..d29e27c2b3c7 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -111,13 +111,13 @@ struct x86Intrinsic { // clang-format off const x86Intrinsic intrinsic_defs[] = { - {"abs_i8x32", UInt(8, 32), "abs", {Int(8, 32)}, Target::AVX2}, - {"abs_i16x16", UInt(16, 16), "abs", {Int(16, 16)}, Target::AVX2}, - {"abs_i32x8", UInt(32, 8), "abs", {Int(32, 8)}, Target::AVX2}, + {"llvm.x86.avx2.pabs.b", UInt(8, 32), "abs", {Int(8, 32)}, Target::AVX2}, + {"llvm.x86.avx2.pabs.d", UInt(16, 16), "abs", {Int(16, 16)}, Target::AVX2}, + {"llvm.x86.avx2.pabs.w", UInt(32, 8), "abs", {Int(32, 8)}, Target::AVX2}, {"abs_f32x8", Float(32, 8), "abs", {Float(32, 8)}, Target::AVX2}, - {"abs_i8x16", UInt(8, 16), "abs", {Int(8, 16)}, Target::SSE41}, - {"abs_i16x8", UInt(16, 8), "abs", {Int(16, 8)}, Target::SSE41}, - {"abs_i32x4", UInt(32, 4), "abs", {Int(32, 4)}, Target::SSE41}, + {"llvm.x86.ssse3.pabs.b.128", UInt(8, 16), "abs", {Int(8, 16)}, Target::SSE41}, + {"llvm.x86.ssse3.pabs.d.128", UInt(16, 8), "abs", {Int(16, 8)}, Target::SSE41}, + {"llvm.x86.ssse3.pabs.w.128", UInt(32, 4), "abs", {Int(32, 4)}, Target::SSE41}, {"abs_f32x4", Float(32, 4), "abs", {Float(32, 4)}}, {"round_f32x4", Float(32, 4), "round", {Float(32, 4)}, Target::SSE41}, @@ -125,15 +125,23 @@ const x86Intrinsic intrinsic_defs[] = { {"round_f32x8", Float(32, 8), "round", {Float(32, 8)}, Target::AVX}, {"round_f64x4", Float(64, 4), "round", {Float(64, 4)}, Target::AVX}, + {"llvm.sadd.sat.v64i8", Int(8, 64), "saturating_add", {Int(8, 64), Int(8, 64)}, Target::AVX512_Cannonlake}, + {"llvm.sadd.sat.v64i8", Int(8, 64), "saturating_add", {Int(8, 64), Int(8, 64)}, Target::AVX512_Skylake}, {"llvm.sadd.sat.v32i8", Int(8, 32), "saturating_add", {Int(8, 32), Int(8, 32)}, Target::AVX2}, {"llvm.sadd.sat.v16i8", Int(8, 16), "saturating_add", {Int(8, 16), Int(8, 16)}}, {"llvm.sadd.sat.v8i8", Int(8, 8), "saturating_add", {Int(8, 8), Int(8, 8)}}, + {"llvm.ssub.sat.v64i8", Int(8, 64), "saturating_sub", {Int(8, 64), Int(8, 64)}, Target::AVX512_Cannonlake}, + {"llvm.ssub.sat.v64i8", Int(8, 64), "saturating_sub", {Int(8, 64), Int(8, 64)}, Target::AVX512_Skylake}, {"llvm.ssub.sat.v32i8", Int(8, 32), "saturating_sub", {Int(8, 32), Int(8, 32)}, Target::AVX2}, {"llvm.ssub.sat.v16i8", Int(8, 16), "saturating_sub", {Int(8, 16), Int(8, 16)}}, {"llvm.ssub.sat.v8i8", Int(8, 8), "saturating_sub", {Int(8, 8), Int(8, 8)}}, + {"llvm.sadd.sat.v32i16", Int(16, 32), "saturating_add", {Int(16, 32), Int(16, 32)}, Target::AVX512_Cannonlake}, + {"llvm.sadd.sat.v32i16", Int(16, 32), "saturating_add", {Int(16, 32), Int(16, 32)}, Target::AVX512_Skylake}, {"llvm.sadd.sat.v16i16", Int(16, 16), "saturating_add", {Int(16, 16), Int(16, 16)}, Target::AVX2}, {"llvm.sadd.sat.v8i16", Int(16, 8), "saturating_add", {Int(16, 8), Int(16, 8)}}, + {"llvm.ssub.sat.v32i16", Int(16, 32), "saturating_sub", {Int(16, 32), Int(16, 32)}, Target::AVX512_Cannonlake}, + {"llvm.ssub.sat.v32i16", Int(16, 32), "saturating_sub", {Int(16, 32), Int(16, 32)}, Target::AVX512_Skylake}, {"llvm.ssub.sat.v16i16", 
Int(16, 16), "saturating_sub", {Int(16, 16), Int(16, 16)}, Target::AVX2}, {"llvm.ssub.sat.v8i16", Int(16, 8), "saturating_sub", {Int(16, 8), Int(16, 8)}}, @@ -149,13 +157,21 @@ const x86Intrinsic intrinsic_defs[] = { // Target::AVX instead of Target::AVX2 as the feature flag // requirement. // TODO: Just use llvm.*add/*sub.sat, and verify the above comment? + {"llvm.uadd.sat.v64i8", UInt(8, 64), "saturating_add", {UInt(8, 64), UInt(8, 64)}, Target::AVX512_Cannonlake}, + {"llvm.uadd.sat.v64i8", UInt(8, 64), "saturating_add", {UInt(8, 64), UInt(8, 64)}, Target::AVX512_Skylake}, {"paddusbx32", UInt(8, 32), "saturating_add", {UInt(8, 32), UInt(8, 32)}, Target::AVX}, {"paddusbx16", UInt(8, 16), "saturating_add", {UInt(8, 16), UInt(8, 16)}}, + {"llvm.usub.sat.v64i8", UInt(8, 64), "saturating_sub", {UInt(8, 64), UInt(8, 64)}, Target::AVX512_Cannonlake}, + {"llvm.usub.sat.v64i8", UInt(8, 64), "saturating_sub", {UInt(8, 64), UInt(8, 64)}, Target::AVX512_Skylake}, {"psubusbx32", UInt(8, 32), "saturating_sub", {UInt(8, 32), UInt(8, 32)}, Target::AVX}, {"psubusbx16", UInt(8, 16), "saturating_sub", {UInt(8, 16), UInt(8, 16)}}, + {"llvm.uadd.sat.v32i16", UInt(16, 32), "saturating_add", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Cannonlake}, + {"llvm.uadd.sat.v32i16", UInt(16, 32), "saturating_add", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Skylake}, {"padduswx16", UInt(16, 16), "saturating_add", {UInt(16, 16), UInt(16, 16)}, Target::AVX}, {"padduswx8", UInt(16, 8), "saturating_add", {UInt(16, 8), UInt(16, 8)}}, + {"llvm.usub.sat.v32i16", UInt(16, 32), "saturating_sub", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Cannonlake}, + {"llvm.usub.sat.v32i16", UInt(16, 32), "saturating_sub", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Skylake}, {"psubuswx16", UInt(16, 16), "saturating_sub", {UInt(16, 16), UInt(16, 16)}, Target::AVX}, {"psubuswx8", UInt(16, 8), "saturating_sub", {UInt(16, 8), UInt(16, 8)}}, @@ -180,14 +196,18 @@ const x86Intrinsic intrinsic_defs[] = { {"wmul_pmaddwd_sse2", Int(32, 4), "widening_mul", {Int(16, 4), Int(16, 4)}}, // Multiply keep high half + {"llvm.x86.avx512.pmulh.w.512", Int(16, 32), "pmulh", {Int(16, 32), Int(16, 32)}, Target::AVX512_Cannonlake}, + {"llvm.x86.avx512.pmulh.w.512", Int(16, 32), "pmulh", {Int(16, 32), Int(16, 32)}, Target::AVX512_Skylake}, {"llvm.x86.avx2.pmulh.w", Int(16, 16), "pmulh", {Int(16, 16), Int(16, 16)}, Target::AVX2}, + {"llvm.x86.avx512.pmulhu.w.512", UInt(16, 32), "pmulh", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Cannonlake}, + {"llvm.x86.avx512.pmulhu.w.512", UInt(16, 32), "pmulh", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Skylake}, {"llvm.x86.avx2.pmulhu.w", UInt(16, 16), "pmulh", {UInt(16, 16), UInt(16, 16)}, Target::AVX2}, + {"llvm.x86.avx512.pmul.hr.sw.512", Int(16, 32), "pmulhrs", {Int(16, 32), Int(16, 32)}, Target::AVX512_Cannonlake}, + {"llvm.x86.avx512.pmul.hr.sw.512", Int(16, 32), "pmulhrs", {Int(16, 32), Int(16, 32)}, Target::AVX512_Skylake}, {"llvm.x86.avx2.pmul.hr.sw", Int(16, 16), "pmulhrs", {Int(16, 16), Int(16, 16)}, Target::AVX2}, - {"saturating_pmulhrswx16", Int(16, 16), "saturating_pmulhrs", {Int(16, 16), Int(16, 16)}, Target::AVX2}, {"llvm.x86.sse2.pmulh.w", Int(16, 8), "pmulh", {Int(16, 8), Int(16, 8)}}, {"llvm.x86.sse2.pmulhu.w", UInt(16, 8), "pmulh", {UInt(16, 8), UInt(16, 8)}}, {"llvm.x86.ssse3.pmul.hr.sw.128", Int(16, 8), "pmulhrs", {Int(16, 8), Int(16, 8)}, Target::SSE41}, - {"saturating_pmulhrswx8", Int(16, 8), "saturating_pmulhrs", {Int(16, 8), Int(16, 8)}, Target::SSE41}, // Convert FP32 to BF16 
{"vcvtne2ps2bf16x32", BFloat(16, 32), "f32_to_bf16", {Float(32, 32)}, Target::AVX512_SapphireRapids}, @@ -582,7 +602,6 @@ void CodeGen_X86::visit(const Call *op) { static Pattern patterns[] = { {"pmulh", mul_shift_right(wild_i16x_, wild_i16x_, 16)}, {"pmulh", mul_shift_right(wild_u16x_, wild_u16x_, 16)}, - {"saturating_pmulhrs", rounding_mul_shift_right(wild_i16x_, wild_i16x_, 15)}, {"saturating_narrow", i16_sat(wild_i32x_)}, {"saturating_narrow", u16_sat(wild_i32x_)}, {"saturating_narrow", i8_sat(wild_i16x_)}, @@ -600,6 +619,22 @@ void CodeGen_X86::visit(const Call *op) { } } + // Special case of saturating_pmulhrs. + static Expr saturating_pmulhrs = rounding_mul_shift_right(wild_i16x_, wild_i16x_, 15); + if (expr_match(saturating_pmulhrs, op, matches)) { + // Rewrite so that we can take advantage of pmulhrs. + internal_assert(matches.size() == 2); + const Expr &a = matches[0]; + const Expr &b = matches[1]; + Expr pmulhrs = i16(rounding_shift_right(widening_mul(a, b), 15)); + // Handle edge case of possible overflow. + Expr i16_min = op->type.min(); + Expr i16_max = op->type.max(); + Expr expr = select((a == i16_min) && (b == i16_min), i16_max, pmulhrs); + expr.accept(this); + return; + } + CodeGen_Posix::visit(op); } diff --git a/src/runtime/x86_avx2.ll b/src/runtime/x86_avx2.ll index d4d88be839c6..2de5d160d9be 100644 --- a/src/runtime/x86_avx2.ll +++ b/src/runtime/x86_avx2.ll @@ -31,37 +31,6 @@ define weak_odr <16 x i16> @packusdwx16(<16 x i32> %arg) nounwind alwaysinline } declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone -define weak_odr <32 x i8> @abs_i8x32(<32 x i8> %arg) { - %1 = sub <32 x i8> zeroinitializer, %arg - %2 = icmp sgt <32 x i8> %arg, zeroinitializer - %3 = select <32 x i1> %2, <32 x i8> %arg, <32 x i8> %1 - ret <32 x i8> %3 -} - -define weak_odr <16 x i16> @abs_i16x16(<16 x i16> %arg) { - %1 = sub <16 x i16> zeroinitializer, %arg - %2 = icmp sgt <16 x i16> %arg, zeroinitializer - %3 = select <16 x i1> %2, <16 x i16> %arg, <16 x i16> %1 - ret <16 x i16> %3 -} - -define weak_odr <8 x i32> @abs_i32x8(<8 x i32> %arg) { - %1 = sub <8 x i32> zeroinitializer, %arg - %2 = icmp sgt <8 x i32> %arg, zeroinitializer - %3 = select <8 x i1> %2, <8 x i32> %arg, <8 x i32> %1 - ret <8 x i32> %3 -} - -define weak_odr <16 x i16> @saturating_pmulhrswx16(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone alwaysinline { - %1 = tail call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %a, <16 x i16> %b) - %2 = icmp eq <16 x i16> %a, - %3 = icmp eq <16 x i16> %b, - %4 = and <16 x i1> %2, %3 - %5 = select <16 x i1> %4, <16 x i16> , <16 x i16> %1 - ret <16 x i16> %5 -} -declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone - define weak_odr <16 x i16> @hadd_pmadd_u8_avx2(<32 x i8> %a) nounwind alwaysinline { %1 = tail call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a, <32 x i8> ) ret <16 x i16> %1 diff --git a/src/runtime/x86_sse41.ll b/src/runtime/x86_sse41.ll index d181de3d67e8..4b8bfb73aea9 100644 --- a/src/runtime/x86_sse41.ll +++ b/src/runtime/x86_sse41.ll @@ -51,37 +51,6 @@ define weak_odr <2 x double> @trunc_f64x2(<2 x double> %x) nounwind uwtable read ret <2 x double> %1 } -define weak_odr <16 x i8> @abs_i8x16(<16 x i8> %x) nounwind uwtable readnone alwaysinline { - %1 = sub <16 x i8> zeroinitializer, %x - %2 = icmp sgt <16 x i8> %x, zeroinitializer - %3 = select <16 x i1> %2, <16 x i8> %x, <16 x i8> %1 - ret <16 x i8> %3 -} - -define weak_odr <8 x i16> @abs_i16x8(<8 x i16> %x) nounwind uwtable readnone 
alwaysinline { - %1 = sub <8 x i16> zeroinitializer, %x - %2 = icmp sgt <8 x i16> %x, zeroinitializer - %3 = select <8 x i1> %2, <8 x i16> %x, <8 x i16> %1 - ret <8 x i16> %3 -} - -define weak_odr <4 x i32> @abs_i32x4(<4 x i32> %x) nounwind uwtable readnone alwaysinline { - %1 = sub <4 x i32> zeroinitializer, %x - %2 = icmp sgt <4 x i32> %x, zeroinitializer - %3 = select <4 x i1> %2, <4 x i32> %x, <4 x i32> %1 - ret <4 x i32> %3 -} - -define weak_odr <8 x i16> @saturating_pmulhrswx8(<8 x i16> %a, <8 x i16> %b) nounwind uwtable readnone alwaysinline { - %1 = tail call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a, <8 x i16> %b) - %2 = icmp eq <8 x i16> %a, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768> - %3 = icmp eq <8 x i16> %b, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768> - %4 = and <8 x i1> %2, %3 - %5 = select <8 x i1> %4, <8 x i16> <i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767>, <8 x i16> %1 - ret <8 x i16> %5 -} -declare <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16>, <8 x i16>) nounwind readnone - define weak_odr <8 x i16> @hadd_pmadd_u8_sse3(<16 x i8> %a) nounwind alwaysinline { %1 = tail call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>) ret <8 x i16> %1 From 5e25f9376920b50ab5697a0b06a999a960efe116 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 26 Oct 2022 15:39:53 -0700 Subject: [PATCH 02/13] add test coverage for AVX512 fp ops --- test/correctness/simd_op_check.cpp | 80 ++++++++++++++++-------------- 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/test/correctness/simd_op_check.cpp b/test/correctness/simd_op_check.cpp index e6c59b45f98f..3993db7de1ed 100644 --- a/test/correctness/simd_op_check.cpp +++ b/test/correctness/simd_op_check.cpp @@ -448,42 +448,50 @@ class SimdOpCheck : public SimdOpCheckTest { // AVX 2 if (use_avx2) { - check("vpaddb*ymm", 32, u8_1 + u8_2); - check("vpsubb*ymm", 32, u8_1 - u8_2); - check("vpaddsb*ymm", 32, i8_sat(i16(i8_1) + i16(i8_2))); - check("vpsubsb*ymm", 32, i8_sat(i16(i8_1) - i16(i8_2))); - check("vpaddusb*ymm", 32, u8(min(u16(u8_1) + u16(u8_2), max_u8))); - check("vpsubusb*ymm", 32, u8(max(i16(u8_1) - i16(u8_2), 0))); - check("vpaddw*ymm", 16, u16_1 + u16_2); - check("vpsubw*ymm", 16, u16_1 - u16_2); - check("vpaddsw*ymm", 16, i16_sat(i32(i16_1) + i32(i16_2))); - check("vpsubsw*ymm", 16, i16_sat(i32(i16_1) - i32(i16_2))); - check("vpaddusw*ymm", 16, u16(min(u32(u16_1) + u32(u16_2), max_u16))); - check("vpsubusw*ymm", 16, u16(max(i32(u16_1) - i32(u16_2), 0))); - check("vpaddd*ymm", 8, i32_1 + i32_2); - check("vpsubd*ymm", 8, i32_1 - i32_2); - check("vpmulhw*ymm", 16, i16((i32(i16_1) * i32(i16_2)) / (256 * 256))); - check("vpmulhw*ymm", 16, i16((i32(i16_1) * i32(i16_2)) >> cast<unsigned>(16))); - check("vpmulhw*ymm", 16, i16((i32(i16_1) * i32(i16_2)) >> cast<int>(16))); - check("vpmulhw*ymm", 16, i16((i32(i16_1) * i32(i16_2)) << cast<int>(-16))); - check("vpmullw*ymm", 16, i16_1 * i16_2); - - check("vpmulhrsw*ymm", 16, i16((((i32(i16_1) * i32(i16_2)) + 16384)) / 32768)); - check("vpmulhrsw*ymm", 16, i16_sat((((i32(i16_1) * i32(i16_2)) + 16384)) / 32768)); - - check("vpcmp*b*ymm", 32, select(u8_1 == u8_2, u8(1), u8(2))); - check("vpcmp*b*ymm", 32, select(u8_1 > u8_2, u8(1), u8(2))); - check("vpcmp*w*ymm", 16, select(u16_1 == u16_2, u16(1), u16(2))); - check("vpcmp*w*ymm", 16, select(u16_1 > u16_2, u16(1), u16(2))); - check("vpcmp*d*ymm", 8, select(u32_1 == u32_2, u32(1), u32(2))); - check("vpcmp*d*ymm", 8, select(u32_1 > u32_2, u32(1), u32(2))); - - check("vpavgb*ymm", 32, u8((u16(u8_1) + u16(u8_2) + 1) / 2)); - check("vpavgw*ymm", 16, u16((u32(u16_1) + u32(u16_2) + 1) / 2)); - check("vpmaxsw*ymm", 16, max(i16_1, i16_2)); -
check("vpminsw*ymm", 16, min(i16_1, i16_2)); - check("vpmaxub*ymm", 32, max(u8_1, u8_2)); - check("vpminub*ymm", 32, min(u8_1, u8_2)); + auto check_x86_fixed_point = [&](const std::string &suffix, const int m) { + check("vpaddb*" + suffix, 32 * m,u8_1 + u8_2); + check("vpsubb*" + suffix, 32 * m,u8_1 - u8_2); + check("vpaddsb*" + suffix, 32 * m,i8_sat(i16(i8_1) + i16(i8_2))); + check("vpsubsb*" + suffix, 32 * m,i8_sat(i16(i8_1) - i16(i8_2))); + check("vpaddusb*" + suffix, 32 * m,u8(min(u16(u8_1) + u16(u8_2), max_u8))); + check("vpsubusb*" + suffix, 32 * m,u8(max(i16(u8_1) - i16(u8_2), 0))); + check("vpaddw*" + suffix, 16 * m,u16_1 + u16_2); + check("vpsubw*" + suffix, 16 * m,u16_1 - u16_2); + check("vpaddsw*" + suffix, 16 * m,i16_sat(i32(i16_1) + i32(i16_2))); + check("vpsubsw*" + suffix, 16 * m,i16_sat(i32(i16_1) - i32(i16_2))); + check("vpaddusw*" + suffix, 16 * m,u16(min(u32(u16_1) + u32(u16_2), max_u16))); + check("vpsubusw*" + suffix, 16 * m,u16(max(i32(u16_1) - i32(u16_2), 0))); + check("vpaddd*" + suffix, 8 * m,i32_1 + i32_2); + check("vpsubd*" + suffix, 8 * m,i32_1 - i32_2); + check("vpmulhw*" + suffix, 16 * m,i16((i32(i16_1) * i32(i16_2)) / (256 * 256))); + check("vpmulhw*" + suffix, 16 * m,i16((i32(i16_1) * i32(i16_2)) >> cast(16))); + check("vpmulhw*" + suffix, 16 * m,i16((i32(i16_1) * i32(i16_2)) >> cast(16))); + check("vpmulhw*" + suffix, 16 * m,i16((i32(i16_1) * i32(i16_2)) << cast(-16))); + check("vpmullw*" + suffix, 16 * m,i16_1 * i16_2); + + check("vpmulhrsw*" + suffix, 16 * m,i16((((i32(i16_1) * i32(i16_2)) + 16384)) / 32768)); + check("vpmulhrsw*" + suffix, 16 * m,i16_sat((((i32(i16_1) * i32(i16_2)) + 16384)) / 32768)); + + check("vpcmp*b*" + suffix, 32 * m,select(u8_1 == u8_2, u8(1), u8(2))); + check("vpcmp*b*" + suffix, 32 * m,select(u8_1 > u8_2, u8(1), u8(2))); + check("vpcmp*w*" + suffix, 16 * m,select(u16_1 == u16_2, u16(1), u16(2))); + check("vpcmp*w*" + suffix, 16 * m,select(u16_1 > u16_2, u16(1), u16(2))); + check("vpcmp*d*" + suffix, 8 * m,select(u32_1 == u32_2, u32(1), u32(2))); + check("vpcmp*d*" + suffix, 8 * m,select(u32_1 > u32_2, u32(1), u32(2))); + + check("vpavgb*" + suffix, 32 * m,u8((u16(u8_1) + u16(u8_2) + 1) / 2)); + check("vpavgw*" + suffix, 16 * m,u16((u32(u16_1) + u32(u16_2) + 1) / 2)); + check("vpmaxsw*" + suffix, 16 * m,max(i16_1, i16_2)); + check("vpminsw*" + suffix, 16 * m,min(i16_1, i16_2)); + check("vpmaxub*" + suffix, 32 * m,max(u8_1, u8_2)); + check("vpminub*" + suffix, 32 * m,min(u8_1, u8_2)); + }; + + check_x86_fixed_point("ymm", 1); + + if (use_avx512) { + check_x86_fixed_point("zmm", 2); + } check(use_avx512 ? "vpaddq*zmm" : "vpaddq*ymm", 8, i64_1 + i64_2); check(use_avx512 ? 
"vpsubq*zmm" : "vpsubq*ymm", 8, i64_1 - i64_2); From 5d10a4a74a542b2ea84c9d2539639968b464c9d8 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 26 Oct 2022 15:52:55 -0700 Subject: [PATCH 03/13] clang format --- test/correctness/simd_op_check.cpp | 72 +++++++++++++++--------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/test/correctness/simd_op_check.cpp b/test/correctness/simd_op_check.cpp index 3993db7de1ed..5e2fc4b449f8 100644 --- a/test/correctness/simd_op_check.cpp +++ b/test/correctness/simd_op_check.cpp @@ -449,42 +449,42 @@ class SimdOpCheck : public SimdOpCheckTest { if (use_avx2) { auto check_x86_fixed_point = [&](const std::string &suffix, const int m) { - check("vpaddb*" + suffix, 32 * m,u8_1 + u8_2); - check("vpsubb*" + suffix, 32 * m,u8_1 - u8_2); - check("vpaddsb*" + suffix, 32 * m,i8_sat(i16(i8_1) + i16(i8_2))); - check("vpsubsb*" + suffix, 32 * m,i8_sat(i16(i8_1) - i16(i8_2))); - check("vpaddusb*" + suffix, 32 * m,u8(min(u16(u8_1) + u16(u8_2), max_u8))); - check("vpsubusb*" + suffix, 32 * m,u8(max(i16(u8_1) - i16(u8_2), 0))); - check("vpaddw*" + suffix, 16 * m,u16_1 + u16_2); - check("vpsubw*" + suffix, 16 * m,u16_1 - u16_2); - check("vpaddsw*" + suffix, 16 * m,i16_sat(i32(i16_1) + i32(i16_2))); - check("vpsubsw*" + suffix, 16 * m,i16_sat(i32(i16_1) - i32(i16_2))); - check("vpaddusw*" + suffix, 16 * m,u16(min(u32(u16_1) + u32(u16_2), max_u16))); - check("vpsubusw*" + suffix, 16 * m,u16(max(i32(u16_1) - i32(u16_2), 0))); - check("vpaddd*" + suffix, 8 * m,i32_1 + i32_2); - check("vpsubd*" + suffix, 8 * m,i32_1 - i32_2); - check("vpmulhw*" + suffix, 16 * m,i16((i32(i16_1) * i32(i16_2)) / (256 * 256))); - check("vpmulhw*" + suffix, 16 * m,i16((i32(i16_1) * i32(i16_2)) >> cast(16))); - check("vpmulhw*" + suffix, 16 * m,i16((i32(i16_1) * i32(i16_2)) >> cast(16))); - check("vpmulhw*" + suffix, 16 * m,i16((i32(i16_1) * i32(i16_2)) << cast(-16))); - check("vpmullw*" + suffix, 16 * m,i16_1 * i16_2); - - check("vpmulhrsw*" + suffix, 16 * m,i16((((i32(i16_1) * i32(i16_2)) + 16384)) / 32768)); - check("vpmulhrsw*" + suffix, 16 * m,i16_sat((((i32(i16_1) * i32(i16_2)) + 16384)) / 32768)); - - check("vpcmp*b*" + suffix, 32 * m,select(u8_1 == u8_2, u8(1), u8(2))); - check("vpcmp*b*" + suffix, 32 * m,select(u8_1 > u8_2, u8(1), u8(2))); - check("vpcmp*w*" + suffix, 16 * m,select(u16_1 == u16_2, u16(1), u16(2))); - check("vpcmp*w*" + suffix, 16 * m,select(u16_1 > u16_2, u16(1), u16(2))); - check("vpcmp*d*" + suffix, 8 * m,select(u32_1 == u32_2, u32(1), u32(2))); - check("vpcmp*d*" + suffix, 8 * m,select(u32_1 > u32_2, u32(1), u32(2))); - - check("vpavgb*" + suffix, 32 * m,u8((u16(u8_1) + u16(u8_2) + 1) / 2)); - check("vpavgw*" + suffix, 16 * m,u16((u32(u16_1) + u32(u16_2) + 1) / 2)); - check("vpmaxsw*" + suffix, 16 * m,max(i16_1, i16_2)); - check("vpminsw*" + suffix, 16 * m,min(i16_1, i16_2)); - check("vpmaxub*" + suffix, 32 * m,max(u8_1, u8_2)); - check("vpminub*" + suffix, 32 * m,min(u8_1, u8_2)); + check("vpaddb*" + suffix, 32 * m, u8_1 + u8_2); + check("vpsubb*" + suffix, 32 * m, u8_1 - u8_2); + check("vpaddsb*" + suffix, 32 * m, i8_sat(i16(i8_1) + i16(i8_2))); + check("vpsubsb*" + suffix, 32 * m, i8_sat(i16(i8_1) - i16(i8_2))); + check("vpaddusb*" + suffix, 32 * m, u8(min(u16(u8_1) + u16(u8_2), max_u8))); + check("vpsubusb*" + suffix, 32 * m, u8(max(i16(u8_1) - i16(u8_2), 0))); + check("vpaddw*" + suffix, 16 * m, u16_1 + u16_2); + check("vpsubw*" + suffix, 16 * m, u16_1 - u16_2); + check("vpaddsw*" + suffix, 16 * m, i16_sat(i32(i16_1) + i32(i16_2))); + 
check("vpsubsw*" + suffix, 16 * m, i16_sat(i32(i16_1) - i32(i16_2))); + check("vpaddusw*" + suffix, 16 * m, u16(min(u32(u16_1) + u32(u16_2), max_u16))); + check("vpsubusw*" + suffix, 16 * m, u16(max(i32(u16_1) - i32(u16_2), 0))); + check("vpaddd*" + suffix, 8 * m, i32_1 + i32_2); + check("vpsubd*" + suffix, 8 * m, i32_1 - i32_2); + check("vpmulhw*" + suffix, 16 * m, i16((i32(i16_1) * i32(i16_2)) / (256 * 256))); + check("vpmulhw*" + suffix, 16 * m, i16((i32(i16_1) * i32(i16_2)) >> cast(16))); + check("vpmulhw*" + suffix, 16 * m, i16((i32(i16_1) * i32(i16_2)) >> cast(16))); + check("vpmulhw*" + suffix, 16 * m, i16((i32(i16_1) * i32(i16_2)) << cast(-16))); + check("vpmullw*" + suffix, 16 * m, i16_1 * i16_2); + + check("vpmulhrsw*" + suffix, 16 * m, i16((((i32(i16_1) * i32(i16_2)) + 16384)) / 32768)); + check("vpmulhrsw*" + suffix, 16 * m, i16_sat((((i32(i16_1) * i32(i16_2)) + 16384)) / 32768)); + + check("vpcmp*b*" + suffix, 32 * m, select(u8_1 == u8_2, u8(1), u8(2))); + check("vpcmp*b*" + suffix, 32 * m, select(u8_1 > u8_2, u8(1), u8(2))); + check("vpcmp*w*" + suffix, 16 * m, select(u16_1 == u16_2, u16(1), u16(2))); + check("vpcmp*w*" + suffix, 16 * m, select(u16_1 > u16_2, u16(1), u16(2))); + check("vpcmp*d*" + suffix, 8 * m, select(u32_1 == u32_2, u32(1), u32(2))); + check("vpcmp*d*" + suffix, 8 * m, select(u32_1 > u32_2, u32(1), u32(2))); + + check("vpavgb*" + suffix, 32 * m, u8((u16(u8_1) + u16(u8_2) + 1) / 2)); + check("vpavgw*" + suffix, 16 * m, u16((u32(u16_1) + u32(u16_2) + 1) / 2)); + check("vpmaxsw*" + suffix, 16 * m, max(i16_1, i16_2)); + check("vpminsw*" + suffix, 16 * m, min(i16_1, i16_2)); + check("vpmaxub*" + suffix, 32 * m, max(u8_1, u8_2)); + check("vpminub*" + suffix, 32 * m, min(u8_1, u8_2)); }; check_x86_fixed_point("ymm", 1); From 060ec46c7251364e282758a7db8a4d164b17e7e7 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 26 Oct 2022 17:16:01 -0700 Subject: [PATCH 04/13] update comment on saturating_pmulhrs --- src/CodeGen_X86.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index d29e27c2b3c7..e38a82f556f9 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -619,7 +619,9 @@ void CodeGen_X86::visit(const Call *op) { } } - // Special case of saturating_pmulhrs. + // Check for saturating_pmulhrs. On x86, pmulhrs is truncating, but it's still faster + // to use pmulhrs then to lower (producing widening multiplication), and have a check + // for the singular overflow case. static Expr saturating_pmulhrs = rounding_mul_shift_right(wild_i16x_, wild_i16x_, 15); if (expr_match(saturating_pmulhrs, op, matches)) { // Rewrite so that we can take advantage of pmulhrs. From 4ed4216fcf247c4395df9f13604be7db3d276953 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 26 Oct 2022 17:17:19 -0700 Subject: [PATCH 05/13] comment typo --- src/CodeGen_X86.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index e38a82f556f9..ca8c3154eaec 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -620,7 +620,7 @@ void CodeGen_X86::visit(const Call *op) { } // Check for saturating_pmulhrs. On x86, pmulhrs is truncating, but it's still faster - // to use pmulhrs then to lower (producing widening multiplication), and have a check + // to use pmulhrs than to lower (producing widening multiplication), and have a check // for the singular overflow case. 
static Expr saturating_pmulhrs = rounding_mul_shift_right(wild_i16x_, wild_i16x_, 15); if (expr_match(saturating_pmulhrs, op, matches)) { From 6f12b544aa2e93e0341c58e369e6353fa87ad17f Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 26 Oct 2022 17:31:46 -0700 Subject: [PATCH 06/13] add comments explaining duplicate Skylake/Cannonlake intrinsics --- src/CodeGen_X86.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index ca8c3154eaec..d36412534289 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -125,21 +125,25 @@ const x86Intrinsic intrinsic_defs[] = { {"round_f32x8", Float(32, 8), "round", {Float(32, 8)}, Target::AVX}, {"round_f64x4", Float(64, 4), "round", {Float(64, 4)}, Target::AVX}, + // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_add. {"llvm.sadd.sat.v64i8", Int(8, 64), "saturating_add", {Int(8, 64), Int(8, 64)}, Target::AVX512_Cannonlake}, {"llvm.sadd.sat.v64i8", Int(8, 64), "saturating_add", {Int(8, 64), Int(8, 64)}, Target::AVX512_Skylake}, {"llvm.sadd.sat.v32i8", Int(8, 32), "saturating_add", {Int(8, 32), Int(8, 32)}, Target::AVX2}, {"llvm.sadd.sat.v16i8", Int(8, 16), "saturating_add", {Int(8, 16), Int(8, 16)}}, {"llvm.sadd.sat.v8i8", Int(8, 8), "saturating_add", {Int(8, 8), Int(8, 8)}}, + // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_sub. {"llvm.ssub.sat.v64i8", Int(8, 64), "saturating_sub", {Int(8, 64), Int(8, 64)}, Target::AVX512_Cannonlake}, {"llvm.ssub.sat.v64i8", Int(8, 64), "saturating_sub", {Int(8, 64), Int(8, 64)}, Target::AVX512_Skylake}, {"llvm.ssub.sat.v32i8", Int(8, 32), "saturating_sub", {Int(8, 32), Int(8, 32)}, Target::AVX2}, {"llvm.ssub.sat.v16i8", Int(8, 16), "saturating_sub", {Int(8, 16), Int(8, 16)}}, {"llvm.ssub.sat.v8i8", Int(8, 8), "saturating_sub", {Int(8, 8), Int(8, 8)}}, + // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_add. {"llvm.sadd.sat.v32i16", Int(16, 32), "saturating_add", {Int(16, 32), Int(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.sadd.sat.v32i16", Int(16, 32), "saturating_add", {Int(16, 32), Int(16, 32)}, Target::AVX512_Skylake}, {"llvm.sadd.sat.v16i16", Int(16, 16), "saturating_add", {Int(16, 16), Int(16, 16)}, Target::AVX2}, {"llvm.sadd.sat.v8i16", Int(16, 8), "saturating_add", {Int(16, 8), Int(16, 8)}}, + // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_sub. {"llvm.ssub.sat.v32i16", Int(16, 32), "saturating_sub", {Int(16, 32), Int(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.ssub.sat.v32i16", Int(16, 32), "saturating_sub", {Int(16, 32), Int(16, 32)}, Target::AVX512_Skylake}, {"llvm.ssub.sat.v16i16", Int(16, 16), "saturating_sub", {Int(16, 16), Int(16, 16)}, Target::AVX2}, @@ -157,19 +161,23 @@ const x86Intrinsic intrinsic_defs[] = { // Target::AVX instead of Target::AVX2 as the feature flag // requirement. // TODO: Just use llvm.*add/*sub.sat, and verify the above comment? + // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_add. {"llvm.uadd.sat.v64i8", UInt(8, 64), "saturating_add", {UInt(8, 64), UInt(8, 64)}, Target::AVX512_Cannonlake}, {"llvm.uadd.sat.v64i8", UInt(8, 64), "saturating_add", {UInt(8, 64), UInt(8, 64)}, Target::AVX512_Skylake}, {"paddusbx32", UInt(8, 32), "saturating_add", {UInt(8, 32), UInt(8, 32)}, Target::AVX}, {"paddusbx16", UInt(8, 16), "saturating_add", {UInt(8, 16), UInt(8, 16)}}, + // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_sub. 
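// (AVX512_KNL is not listed here: Knights Landing has AVX512F but lacks AVX512BW,
// and these 512-bit byte/word operations require AVX512BW.)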
{"llvm.usub.sat.v64i8", UInt(8, 64), "saturating_sub", {UInt(8, 64), UInt(8, 64)}, Target::AVX512_Cannonlake}, {"llvm.usub.sat.v64i8", UInt(8, 64), "saturating_sub", {UInt(8, 64), UInt(8, 64)}, Target::AVX512_Skylake}, {"psubusbx32", UInt(8, 32), "saturating_sub", {UInt(8, 32), UInt(8, 32)}, Target::AVX}, {"psubusbx16", UInt(8, 16), "saturating_sub", {UInt(8, 16), UInt(8, 16)}}, + // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_add. {"llvm.uadd.sat.v32i16", UInt(16, 32), "saturating_add", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.uadd.sat.v32i16", UInt(16, 32), "saturating_add", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Skylake}, {"padduswx16", UInt(16, 16), "saturating_add", {UInt(16, 16), UInt(16, 16)}, Target::AVX}, {"padduswx8", UInt(16, 8), "saturating_add", {UInt(16, 8), UInt(16, 8)}}, + // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_sub. {"llvm.usub.sat.v32i16", UInt(16, 32), "saturating_sub", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.usub.sat.v32i16", UInt(16, 32), "saturating_sub", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Skylake}, {"psubuswx16", UInt(16, 16), "saturating_sub", {UInt(16, 16), UInt(16, 16)}, Target::AVX}, @@ -196,12 +204,15 @@ const x86Intrinsic intrinsic_defs[] = { {"wmul_pmaddwd_sse2", Int(32, 4), "widening_mul", {Int(16, 4), Int(16, 4)}}, // Multiply keep high half + // Cannonlake and Skylake support AVX512BW, allowing for 512-bit pmulh. {"llvm.x86.avx512.pmulh.w.512", Int(16, 32), "pmulh", {Int(16, 32), Int(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.x86.avx512.pmulh.w.512", Int(16, 32), "pmulh", {Int(16, 32), Int(16, 32)}, Target::AVX512_Skylake}, {"llvm.x86.avx2.pmulh.w", Int(16, 16), "pmulh", {Int(16, 16), Int(16, 16)}, Target::AVX2}, + // Cannonlake and Skylake support AVX512BW, allowing for 512-bit pmulh. {"llvm.x86.avx512.pmulhu.w.512", UInt(16, 32), "pmulh", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.x86.avx512.pmulhu.w.512", UInt(16, 32), "pmulh", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Skylake}, {"llvm.x86.avx2.pmulhu.w", UInt(16, 16), "pmulh", {UInt(16, 16), UInt(16, 16)}, Target::AVX2}, + // Cannonlake and Skylake support AVX512BW, allowing for 512-bit pmulhrs. {"llvm.x86.avx512.pmul.hr.sw.512", Int(16, 32), "pmulhrs", {Int(16, 32), Int(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.x86.avx512.pmul.hr.sw.512", Int(16, 32), "pmulhrs", {Int(16, 32), Int(16, 32)}, Target::AVX512_Skylake}, {"llvm.x86.avx2.pmul.hr.sw", Int(16, 16), "pmulhrs", {Int(16, 16), Int(16, 16)}, Target::AVX2}, From f6eb8aad4d6fbfebf2c077bd7cab71677dff8346 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 26 Oct 2022 17:34:58 -0700 Subject: [PATCH 07/13] remove Cannonlake duplicates --- src/CodeGen_X86.cpp | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index d36412534289..95473452b985 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -125,26 +125,18 @@ const x86Intrinsic intrinsic_defs[] = { {"round_f32x8", Float(32, 8), "round", {Float(32, 8)}, Target::AVX}, {"round_f64x4", Float(64, 4), "round", {Float(64, 4)}, Target::AVX}, - // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_add. 
- {"llvm.sadd.sat.v64i8", Int(8, 64), "saturating_add", {Int(8, 64), Int(8, 64)}, Target::AVX512_Cannonlake}, {"llvm.sadd.sat.v64i8", Int(8, 64), "saturating_add", {Int(8, 64), Int(8, 64)}, Target::AVX512_Skylake}, {"llvm.sadd.sat.v32i8", Int(8, 32), "saturating_add", {Int(8, 32), Int(8, 32)}, Target::AVX2}, {"llvm.sadd.sat.v16i8", Int(8, 16), "saturating_add", {Int(8, 16), Int(8, 16)}}, {"llvm.sadd.sat.v8i8", Int(8, 8), "saturating_add", {Int(8, 8), Int(8, 8)}}, - // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_sub. - {"llvm.ssub.sat.v64i8", Int(8, 64), "saturating_sub", {Int(8, 64), Int(8, 64)}, Target::AVX512_Cannonlake}, {"llvm.ssub.sat.v64i8", Int(8, 64), "saturating_sub", {Int(8, 64), Int(8, 64)}, Target::AVX512_Skylake}, {"llvm.ssub.sat.v32i8", Int(8, 32), "saturating_sub", {Int(8, 32), Int(8, 32)}, Target::AVX2}, {"llvm.ssub.sat.v16i8", Int(8, 16), "saturating_sub", {Int(8, 16), Int(8, 16)}}, {"llvm.ssub.sat.v8i8", Int(8, 8), "saturating_sub", {Int(8, 8), Int(8, 8)}}, - // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_add. - {"llvm.sadd.sat.v32i16", Int(16, 32), "saturating_add", {Int(16, 32), Int(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.sadd.sat.v32i16", Int(16, 32), "saturating_add", {Int(16, 32), Int(16, 32)}, Target::AVX512_Skylake}, {"llvm.sadd.sat.v16i16", Int(16, 16), "saturating_add", {Int(16, 16), Int(16, 16)}, Target::AVX2}, {"llvm.sadd.sat.v8i16", Int(16, 8), "saturating_add", {Int(16, 8), Int(16, 8)}}, - // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_sub. - {"llvm.ssub.sat.v32i16", Int(16, 32), "saturating_sub", {Int(16, 32), Int(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.ssub.sat.v32i16", Int(16, 32), "saturating_sub", {Int(16, 32), Int(16, 32)}, Target::AVX512_Skylake}, {"llvm.ssub.sat.v16i16", Int(16, 16), "saturating_sub", {Int(16, 16), Int(16, 16)}, Target::AVX2}, {"llvm.ssub.sat.v8i16", Int(16, 8), "saturating_sub", {Int(16, 8), Int(16, 8)}}, @@ -161,24 +153,16 @@ const x86Intrinsic intrinsic_defs[] = { // Target::AVX instead of Target::AVX2 as the feature flag // requirement. // TODO: Just use llvm.*add/*sub.sat, and verify the above comment? - // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_add. - {"llvm.uadd.sat.v64i8", UInt(8, 64), "saturating_add", {UInt(8, 64), UInt(8, 64)}, Target::AVX512_Cannonlake}, {"llvm.uadd.sat.v64i8", UInt(8, 64), "saturating_add", {UInt(8, 64), UInt(8, 64)}, Target::AVX512_Skylake}, {"paddusbx32", UInt(8, 32), "saturating_add", {UInt(8, 32), UInt(8, 32)}, Target::AVX}, {"paddusbx16", UInt(8, 16), "saturating_add", {UInt(8, 16), UInt(8, 16)}}, - // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_sub. - {"llvm.usub.sat.v64i8", UInt(8, 64), "saturating_sub", {UInt(8, 64), UInt(8, 64)}, Target::AVX512_Cannonlake}, {"llvm.usub.sat.v64i8", UInt(8, 64), "saturating_sub", {UInt(8, 64), UInt(8, 64)}, Target::AVX512_Skylake}, {"psubusbx32", UInt(8, 32), "saturating_sub", {UInt(8, 32), UInt(8, 32)}, Target::AVX}, {"psubusbx16", UInt(8, 16), "saturating_sub", {UInt(8, 16), UInt(8, 16)}}, - // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_add. 
- {"llvm.uadd.sat.v32i16", UInt(16, 32), "saturating_add", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.uadd.sat.v32i16", UInt(16, 32), "saturating_add", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Skylake}, {"padduswx16", UInt(16, 16), "saturating_add", {UInt(16, 16), UInt(16, 16)}, Target::AVX}, {"padduswx8", UInt(16, 8), "saturating_add", {UInt(16, 8), UInt(16, 8)}}, - // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_sub. - {"llvm.usub.sat.v32i16", UInt(16, 32), "saturating_sub", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.usub.sat.v32i16", UInt(16, 32), "saturating_sub", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Skylake}, {"psubuswx16", UInt(16, 16), "saturating_sub", {UInt(16, 16), UInt(16, 16)}, Target::AVX}, {"psubuswx8", UInt(16, 8), "saturating_sub", {UInt(16, 8), UInt(16, 8)}}, @@ -204,16 +188,10 @@ const x86Intrinsic intrinsic_defs[] = { {"wmul_pmaddwd_sse2", Int(32, 4), "widening_mul", {Int(16, 4), Int(16, 4)}}, // Multiply keep high half - // Cannonlake and Skylake support AVX512BW, allowing for 512-bit pmulh. - {"llvm.x86.avx512.pmulh.w.512", Int(16, 32), "pmulh", {Int(16, 32), Int(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.x86.avx512.pmulh.w.512", Int(16, 32), "pmulh", {Int(16, 32), Int(16, 32)}, Target::AVX512_Skylake}, {"llvm.x86.avx2.pmulh.w", Int(16, 16), "pmulh", {Int(16, 16), Int(16, 16)}, Target::AVX2}, - // Cannonlake and Skylake support AVX512BW, allowing for 512-bit pmulh. - {"llvm.x86.avx512.pmulhu.w.512", UInt(16, 32), "pmulh", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.x86.avx512.pmulhu.w.512", UInt(16, 32), "pmulh", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Skylake}, {"llvm.x86.avx2.pmulhu.w", UInt(16, 16), "pmulh", {UInt(16, 16), UInt(16, 16)}, Target::AVX2}, - // Cannonlake and Skylake support AVX512BW, allowing for 512-bit pmulhrs. 
- {"llvm.x86.avx512.pmul.hr.sw.512", Int(16, 32), "pmulhrs", {Int(16, 32), Int(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.x86.avx512.pmul.hr.sw.512", Int(16, 32), "pmulhrs", {Int(16, 32), Int(16, 32)}, Target::AVX512_Skylake}, {"llvm.x86.avx2.pmul.hr.sw", Int(16, 16), "pmulhrs", {Int(16, 16), Int(16, 16)}, Target::AVX2}, {"llvm.x86.sse2.pmulh.w", Int(16, 8), "pmulh", {Int(16, 8), Int(16, 8)}}, From 4645dd746238c184d06988b6433432ed2b3a0a6b Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Thu, 27 Oct 2022 10:52:11 -0700 Subject: [PATCH 08/13] fix d <-> w typo --- src/CodeGen_X86.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 95473452b985..cf7b7cb4f5d1 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -112,12 +112,12 @@ struct x86Intrinsic { // clang-format off const x86Intrinsic intrinsic_defs[] = { {"llvm.x86.avx2.pabs.b", UInt(8, 32), "abs", {Int(8, 32)}, Target::AVX2}, - {"llvm.x86.avx2.pabs.d", UInt(16, 16), "abs", {Int(16, 16)}, Target::AVX2}, - {"llvm.x86.avx2.pabs.w", UInt(32, 8), "abs", {Int(32, 8)}, Target::AVX2}, + {"llvm.x86.avx2.pabs.w", UInt(16, 16), "abs", {Int(16, 16)}, Target::AVX2}, + {"llvm.x86.avx2.pabs.d", UInt(32, 8), "abs", {Int(32, 8)}, Target::AVX2}, {"abs_f32x8", Float(32, 8), "abs", {Float(32, 8)}, Target::AVX2}, {"llvm.x86.ssse3.pabs.b.128", UInt(8, 16), "abs", {Int(8, 16)}, Target::SSE41}, - {"llvm.x86.ssse3.pabs.d.128", UInt(16, 8), "abs", {Int(16, 8)}, Target::SSE41}, - {"llvm.x86.ssse3.pabs.w.128", UInt(32, 4), "abs", {Int(32, 4)}, Target::SSE41}, + {"llvm.x86.ssse3.pabs.w.128", UInt(16, 8), "abs", {Int(16, 8)}, Target::SSE41}, + {"llvm.x86.ssse3.pabs.d.128", UInt(32, 4), "abs", {Int(32, 4)}, Target::SSE41}, {"abs_f32x4", Float(32, 4), "abs", {Float(32, 4)}}, {"round_f32x4", Float(32, 4), "round", {Float(32, 4)}, Target::SSE41}, From cddce67a4c0515878382693bc4d3d2f5650f3342 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Thu, 27 Oct 2022 11:49:25 -0700 Subject: [PATCH 09/13] on ssse3 use llvm.abs --- src/CodeGen_X86.cpp | 6 +++--- src/runtime/x86_sse41.ll | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index cf7b7cb4f5d1..595c62e7d513 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -115,9 +115,9 @@ const x86Intrinsic intrinsic_defs[] = { {"llvm.x86.avx2.pabs.w", UInt(16, 16), "abs", {Int(16, 16)}, Target::AVX2}, {"llvm.x86.avx2.pabs.d", UInt(32, 8), "abs", {Int(32, 8)}, Target::AVX2}, {"abs_f32x8", Float(32, 8), "abs", {Float(32, 8)}, Target::AVX2}, - {"llvm.x86.ssse3.pabs.b.128", UInt(8, 16), "abs", {Int(8, 16)}, Target::SSE41}, - {"llvm.x86.ssse3.pabs.w.128", UInt(16, 8), "abs", {Int(16, 8)}, Target::SSE41}, - {"llvm.x86.ssse3.pabs.d.128", UInt(32, 4), "abs", {Int(32, 4)}, Target::SSE41}, + {"abs_i8x16", UInt(8, 16), "abs", {Int(8, 16)}, Target::SSE41}, + {"abs_i16x8", UInt(16, 8), "abs", {Int(16, 8)}, Target::SSE41}, + {"abs_i32x4", UInt(32, 4), "abs", {Int(32, 4)}, Target::SSE41}, {"abs_f32x4", Float(32, 4), "abs", {Float(32, 4)}}, {"round_f32x4", Float(32, 4), "round", {Float(32, 4)}, Target::SSE41}, diff --git a/src/runtime/x86_sse41.ll b/src/runtime/x86_sse41.ll index 4b8bfb73aea9..41302eb967e8 100644 --- a/src/runtime/x86_sse41.ll +++ b/src/runtime/x86_sse41.ll @@ -51,6 +51,24 @@ define weak_odr <2 x double> @trunc_f64x2(<2 x double> %x) nounwind uwtable read ret <2 x double> %1 } +define weak_odr <16 x i8> @abs_i8x16(<16 x i8> %x) nounwind uwtable readnone 
alwaysinline { + %1 = tail call <16 x i8> @llvm.abs.v16i8(<16 x i8> %x, i1 false) + ret <16 x i8> %1 +} +declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1) nounwind readnone + +define weak_odr <8 x i16> @abs_i16x8(<8 x i16> %x) nounwind uwtable readnone alwaysinline { + %1 = tail call <8 x i16> @llvm.abs.v8i16(<8 x i16> %x, i1 false) + ret <8 x i16> %1 +} +declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1) nounwind readnone + +define weak_odr <4 x i32> @abs_i32x4(<4 x i32> %x) nounwind uwtable readnone alwaysinline { + %1 = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> %x, i1 false) + ret <4 x i32> %1 +} +declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1) nounwind readnone + define weak_odr <8 x i16> @hadd_pmadd_u8_sse3(<16 x i8> %a) nounwind alwaysinline { %1 = tail call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>) ret <8 x i16> %1 From 9d0d7eccaab3dab8b75b4f87f487cdf041947c3a Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Thu, 27 Oct 2022 13:58:14 -0700 Subject: [PATCH 10/13] don't use llvm.avx2.pabs intrinsics --- src/CodeGen_X86.cpp | 8 +++++--- src/runtime/x86_avx2.ll | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 595c62e7d513..0f60b2b72b14 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -111,9 +111,11 @@ struct x86Intrinsic { // clang-format off const x86Intrinsic intrinsic_defs[] = { - {"llvm.x86.avx2.pabs.b", UInt(8, 32), "abs", {Int(8, 32)}, Target::AVX2}, - {"llvm.x86.avx2.pabs.w", UInt(16, 16), "abs", {Int(16, 16)}, Target::AVX2}, - {"llvm.x86.avx2.pabs.d", UInt(32, 8), "abs", {Int(32, 8)}, Target::AVX2}, + // AVX2/SSSE3 LLVM intrinsics for pabs fail in JIT. The integer wrappers + // just call `llvm.abs` (which requires a second argument).
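// For context: the trailing i1 argument to llvm.abs is LLVM's "is_int_min_poison"
// flag. Passing "i1 false", as the wrappers here do, makes abs of the minimum value
// (e.g. abs(-128) for i8) return the input bits unchanged rather than poison, which
// matches Halide's semantics: the result is reinterpreted as the unsigned type, so
// abs(-128) == 128.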
+ {"abs_i8x32", UInt(8, 32), "abs", {Int(8, 32)}, Target::AVX2}, + {"abs_i16x16", UInt(16, 16), "abs", {Int(16, 16)}, Target::AVX2}, + {"abs_i32x8", UInt(32, 8), "abs", {Int(32, 8)}, Target::AVX2}, {"abs_f32x8", Float(32, 8), "abs", {Float(32, 8)}, Target::AVX2}, {"abs_i8x16", UInt(8, 16), "abs", {Int(8, 16)}, Target::SSE41}, {"abs_i16x8", UInt(16, 8), "abs", {Int(16, 8)}, Target::SSE41}, diff --git a/src/runtime/x86_avx2.ll b/src/runtime/x86_avx2.ll index 2de5d160d9be..f89f6d502e30 100644 --- a/src/runtime/x86_avx2.ll +++ b/src/runtime/x86_avx2.ll @@ -31,6 +31,24 @@ define weak_odr <16 x i16> @packusdwx16(<16 x i32> %arg) nounwind alwaysinline } declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone +define weak_odr <32 x i8> @abs_i8x32(<32 x i8> %arg) { + %1 = tail call <32 x i8> @llvm.abs.v32i8(<32 x i8> %arg, i1 false) + ret <32 x i8> %1 +} +declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1) nounwind readnone + +define weak_odr <16 x i16> @abs_i16x16(<16 x i16> %arg) { + %1 = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> %arg, i1 false) + ret <16 x i16> %1 +} +declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1) nounwind readnone + +define weak_odr <8 x i32> @abs_i32x8(<8 x i32> %arg) { + %1 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %arg, i1 false) + ret <8 x i32> %1 +} +declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1) nounwind readnone + define weak_odr <16 x i16> @hadd_pmadd_u8_avx2(<32 x i8> %a) nounwind alwaysinline { %1 = tail call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a, <32 x i8> ) ret <16 x i16> %1 From 4d2733df91ec28d344411358ea4734c29cb552bf Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Thu, 27 Oct 2022 14:22:56 -0700 Subject: [PATCH 11/13] generate vpabs on AVX512 --- src/CodeGen_X86.cpp | 7 +++++++ src/runtime/x86_avx512.ll | 18 ++++++++++++++++++ test/correctness/simd_op_check.cpp | 9 ++++++--- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 0f60b2b72b14..3b95f2ac84a5 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -35,6 +35,9 @@ Target complete_x86_target(Target t) { if (t.has_feature(Target::AVX512_Cannonlake) || t.has_feature(Target::AVX512_Skylake) || t.has_feature(Target::AVX512_KNL)) { + t.set_feature(Target::AVX512); + } + if (t.has_feature(Target::AVX512)) { t.set_feature(Target::AVX2); } if (t.has_feature(Target::AVX2)) { @@ -113,6 +116,10 @@ struct x86Intrinsic { const x86Intrinsic intrinsic_defs[] = { // AVX2/SSSE3 LLVM intrinsics for pabs fail in JIT. The integer wrappers // just call `llvm.abs` (which requires a second argument). + // AVX512BW's pabs instructions aren't directly exposed by LLVM. 
+ {"abs_i8x64", UInt(8, 64), "abs", {Int(8, 64)}, Target::AVX512_Skylake}, + {"abs_i16x32", UInt(16, 32), "abs", {Int(16, 32)}, Target::AVX512_Skylake}, + {"abs_i32x16", UInt(32, 16), "abs", {Int(32, 16)}, Target::AVX512_Skylake}, {"abs_i8x32", UInt(8, 32), "abs", {Int(8, 32)}, Target::AVX2}, {"abs_i16x16", UInt(16, 16), "abs", {Int(16, 16)}, Target::AVX2}, {"abs_i32x8", UInt(32, 8), "abs", {Int(32, 8)}, Target::AVX2}, diff --git a/src/runtime/x86_avx512.ll b/src/runtime/x86_avx512.ll index 8cbc8abb9c5d..22401897eee2 100644 --- a/src/runtime/x86_avx512.ll +++ b/src/runtime/x86_avx512.ll @@ -138,3 +138,21 @@ define weak_odr <4 x i32> @dpwssdsx4(<4 x i32> %init, <8 x i16> %a, <8 x i16> % ret <4 x i32> %3 } declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define weak_odr <64 x i8> @abs_i8x64(<64 x i8> %arg) { + %1 = tail call <64 x i8> @llvm.abs.v64i8(<64 x i8> %arg, i1 false) + ret <64 x i8> %1 +} +declare <64 x i8> @llvm.abs.v64i8(<64 x i8>, i1) nounwind readnone + +define weak_odr <32 x i16> @abs_i16x32(<32 x i16> %arg) { + %1 = tail call <32 x i16> @llvm.abs.v32i16(<32 x i16> %arg, i1 false) + ret <32 x i16> %1 +} +declare <32 x i16> @llvm.abs.v32i16(<32 x i16>, i1) nounwind readnone + +define weak_odr <16 x i32> @abs_i32x16(<16 x i32> %arg) { + %1 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %arg, i1 false) + ret <16 x i32> %1 +} +declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1) nounwind readnone diff --git a/test/correctness/simd_op_check.cpp b/test/correctness/simd_op_check.cpp index 5e2fc4b449f8..de5385cbf31c 100644 --- a/test/correctness/simd_op_check.cpp +++ b/test/correctness/simd_op_check.cpp @@ -485,6 +485,12 @@ class SimdOpCheck : public SimdOpCheckTest { check("vpminsw*" + suffix, 16 * m, min(i16_1, i16_2)); check("vpmaxub*" + suffix, 32 * m, max(u8_1, u8_2)); check("vpminub*" + suffix, 32 * m, min(u8_1, u8_2)); + + + check("vpabsb*" + suffix, 32 * m, abs(i8_1)); + check("vpabsw*" + suffix, 16 * m, abs(i16_1)); + check("vpabsd*" + suffix, 8 * m, abs(i32_1)); + }; check_x86_fixed_point("ymm", 1); @@ -497,9 +503,6 @@ class SimdOpCheck : public SimdOpCheckTest { check(use_avx512 ? "vpsubq*zmm" : "vpsubq*ymm", 8, i64_1 - i64_2); check(use_avx512 ? "vpmullq" : "vpmuludq*ymm", 8, u64_1 * u64_2); - check("vpabsb*ymm", 32, abs(i8_1)); - check("vpabsw*ymm", 16, abs(i16_1)); - check("vpabsd*ymm", 8, abs(i32_1)); // llvm doesn't distinguish between signed and unsigned multiplies // check("vpmuldq", 8, i64(i32_1) * i64(i32_2)); From dd0a98c0ced383f3555b8878bc83b0553691500d Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Thu, 27 Oct 2022 14:24:35 -0700 Subject: [PATCH 12/13] clang format --- test/correctness/simd_op_check.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/test/correctness/simd_op_check.cpp b/test/correctness/simd_op_check.cpp index de5385cbf31c..946f342c1a16 100644 --- a/test/correctness/simd_op_check.cpp +++ b/test/correctness/simd_op_check.cpp @@ -486,11 +486,9 @@ class SimdOpCheck : public SimdOpCheckTest { check("vpmaxub*" + suffix, 32 * m, max(u8_1, u8_2)); check("vpminub*" + suffix, 32 * m, min(u8_1, u8_2)); - check("vpabsb*" + suffix, 32 * m, abs(i8_1)); check("vpabsw*" + suffix, 16 * m, abs(i16_1)); check("vpabsd*" + suffix, 8 * m, abs(i32_1)); - }; check_x86_fixed_point("ymm", 1); @@ -503,7 +501,6 @@ class SimdOpCheck : public SimdOpCheckTest { check(use_avx512 ? "vpsubq*zmm" : "vpsubq*ymm", 8, i64_1 - i64_2); check(use_avx512 ? 
"vpmullq" : "vpmuludq*ymm", 8, u64_1 * u64_2); - // llvm doesn't distinguish between signed and unsigned multiplies // check("vpmuldq", 8, i64(i32_1) * i64(i32_2)); if (!use_avx512) { From 41f09f83f7e108aa4856401db09a24d6ec559a70 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Fri, 28 Oct 2022 13:44:15 -0700 Subject: [PATCH 13/13] faster AVX2 lowering of saturating_pmulhrs --- src/CodeGen_X86.cpp | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 3b95f2ac84a5..551798dca649 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -624,14 +624,27 @@ void CodeGen_X86::visit(const Call *op) { if (expr_match(saturating_pmulhrs, op, matches)) { // Rewrite so that we can take advantage of pmulhrs. internal_assert(matches.size() == 2); + internal_assert(op->type.element_of() == Int(16)); const Expr &a = matches[0]; const Expr &b = matches[1]; + Expr pmulhrs = i16(rounding_shift_right(widening_mul(a, b), 15)); - // Handle edge case of possible overflow. + Expr i16_min = op->type.min(); Expr i16_max = op->type.max(); - Expr expr = select((a == i16_min) && (b == i16_min), i16_max, pmulhrs); - expr.accept(this); + + // Handle edge case of possible overflow. + // See https://github.com/halide/Halide/pull/7129/files#r1008331426 + // On AVX512 (and with enough lanes) we can use a mask register. + if (target.has_feature(Target::AVX512) && op->type.lanes() >= 32) { + Expr expr = select((a == i16_min) && (b == i16_min), i16_max, pmulhrs); + expr.accept(this); + } else { + Expr mask = select(max(a, b) == i16_min, cast(op->type, -1), cast(op->type, 0)); + Expr expr = mask ^ pmulhrs; + expr.accept(this); + } + return; }