From 0887efbff264e5bcce4ba7a345f8fe26f100a79c Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 26 Oct 2022 15:39:09 -0700 Subject: [PATCH 01/13] clean-up abs and saturating_pmulhrs, fix AVX512 saturating_ ops --- src/CodeGen_X86.cpp | 53 +++++++++++++++++++++++++++++++++------- src/runtime/x86_avx2.ll | 31 ----------------------- src/runtime/x86_sse41.ll | 31 ----------------------- 3 files changed, 44 insertions(+), 71 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index a2393de008ce..d29e27c2b3c7 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -111,13 +111,13 @@ struct x86Intrinsic { // clang-format off const x86Intrinsic intrinsic_defs[] = { - {"abs_i8x32", UInt(8, 32), "abs", {Int(8, 32)}, Target::AVX2}, - {"abs_i16x16", UInt(16, 16), "abs", {Int(16, 16)}, Target::AVX2}, - {"abs_i32x8", UInt(32, 8), "abs", {Int(32, 8)}, Target::AVX2}, + {"llvm.x86.avx2.pabs.b", UInt(8, 32), "abs", {Int(8, 32)}, Target::AVX2}, + {"llvm.x86.avx2.pabs.d", UInt(16, 16), "abs", {Int(16, 16)}, Target::AVX2}, + {"llvm.x86.avx2.pabs.w", UInt(32, 8), "abs", {Int(32, 8)}, Target::AVX2}, {"abs_f32x8", Float(32, 8), "abs", {Float(32, 8)}, Target::AVX2}, - {"abs_i8x16", UInt(8, 16), "abs", {Int(8, 16)}, Target::SSE41}, - {"abs_i16x8", UInt(16, 8), "abs", {Int(16, 8)}, Target::SSE41}, - {"abs_i32x4", UInt(32, 4), "abs", {Int(32, 4)}, Target::SSE41}, + {"llvm.x86.ssse3.pabs.b.128", UInt(8, 16), "abs", {Int(8, 16)}, Target::SSE41}, + {"llvm.x86.ssse3.pabs.d.128", UInt(16, 8), "abs", {Int(16, 8)}, Target::SSE41}, + {"llvm.x86.ssse3.pabs.w.128", UInt(32, 4), "abs", {Int(32, 4)}, Target::SSE41}, {"abs_f32x4", Float(32, 4), "abs", {Float(32, 4)}}, {"round_f32x4", Float(32, 4), "round", {Float(32, 4)}, Target::SSE41}, @@ -125,15 +125,23 @@ const x86Intrinsic intrinsic_defs[] = { {"round_f32x8", Float(32, 8), "round", {Float(32, 8)}, Target::AVX}, {"round_f64x4", Float(64, 4), "round", {Float(64, 4)}, Target::AVX}, + {"llvm.sadd.sat.v64i8", Int(8, 64), "saturating_add", {Int(8, 64), Int(8, 64)}, Target::AVX512_Cannonlake}, + {"llvm.sadd.sat.v64i8", Int(8, 64), "saturating_add", {Int(8, 64), Int(8, 64)}, Target::AVX512_Skylake}, {"llvm.sadd.sat.v32i8", Int(8, 32), "saturating_add", {Int(8, 32), Int(8, 32)}, Target::AVX2}, {"llvm.sadd.sat.v16i8", Int(8, 16), "saturating_add", {Int(8, 16), Int(8, 16)}}, {"llvm.sadd.sat.v8i8", Int(8, 8), "saturating_add", {Int(8, 8), Int(8, 8)}}, + {"llvm.ssub.sat.v64i8", Int(8, 64), "saturating_sub", {Int(8, 64), Int(8, 64)}, Target::AVX512_Cannonlake}, + {"llvm.ssub.sat.v64i8", Int(8, 64), "saturating_sub", {Int(8, 64), Int(8, 64)}, Target::AVX512_Skylake}, {"llvm.ssub.sat.v32i8", Int(8, 32), "saturating_sub", {Int(8, 32), Int(8, 32)}, Target::AVX2}, {"llvm.ssub.sat.v16i8", Int(8, 16), "saturating_sub", {Int(8, 16), Int(8, 16)}}, {"llvm.ssub.sat.v8i8", Int(8, 8), "saturating_sub", {Int(8, 8), Int(8, 8)}}, + {"llvm.sadd.sat.v32i16", Int(16, 32), "saturating_add", {Int(16, 32), Int(16, 32)}, Target::AVX512_Cannonlake}, + {"llvm.sadd.sat.v32i16", Int(16, 32), "saturating_add", {Int(16, 32), Int(16, 32)}, Target::AVX512_Skylake}, {"llvm.sadd.sat.v16i16", Int(16, 16), "saturating_add", {Int(16, 16), Int(16, 16)}, Target::AVX2}, {"llvm.sadd.sat.v8i16", Int(16, 8), "saturating_add", {Int(16, 8), Int(16, 8)}}, + {"llvm.ssub.sat.v32i16", Int(16, 32), "saturating_sub", {Int(16, 32), Int(16, 32)}, Target::AVX512_Cannonlake}, + {"llvm.ssub.sat.v32i16", Int(16, 32), "saturating_sub", {Int(16, 32), Int(16, 32)}, Target::AVX512_Skylake}, {"llvm.ssub.sat.v16i16", 
Int(16, 16), "saturating_sub", {Int(16, 16), Int(16, 16)}, Target::AVX2}, {"llvm.ssub.sat.v8i16", Int(16, 8), "saturating_sub", {Int(16, 8), Int(16, 8)}}, @@ -149,13 +157,21 @@ const x86Intrinsic intrinsic_defs[] = { // Target::AVX instead of Target::AVX2 as the feature flag // requirement. // TODO: Just use llvm.*add/*sub.sat, and verify the above comment? + {"llvm.uadd.sat.v64i8", UInt(8, 64), "saturating_add", {UInt(8, 64), UInt(8, 64)}, Target::AVX512_Cannonlake}, + {"llvm.uadd.sat.v64i8", UInt(8, 64), "saturating_add", {UInt(8, 64), UInt(8, 64)}, Target::AVX512_Skylake}, {"paddusbx32", UInt(8, 32), "saturating_add", {UInt(8, 32), UInt(8, 32)}, Target::AVX}, {"paddusbx16", UInt(8, 16), "saturating_add", {UInt(8, 16), UInt(8, 16)}}, + {"llvm.usub.sat.v64i8", UInt(8, 64), "saturating_sub", {UInt(8, 64), UInt(8, 64)}, Target::AVX512_Cannonlake}, + {"llvm.usub.sat.v64i8", UInt(8, 64), "saturating_sub", {UInt(8, 64), UInt(8, 64)}, Target::AVX512_Skylake}, {"psubusbx32", UInt(8, 32), "saturating_sub", {UInt(8, 32), UInt(8, 32)}, Target::AVX}, {"psubusbx16", UInt(8, 16), "saturating_sub", {UInt(8, 16), UInt(8, 16)}}, + {"llvm.uadd.sat.v32i16", UInt(16, 32), "saturating_add", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Cannonlake}, + {"llvm.uadd.sat.v32i16", UInt(16, 32), "saturating_add", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Skylake}, {"padduswx16", UInt(16, 16), "saturating_add", {UInt(16, 16), UInt(16, 16)}, Target::AVX}, {"padduswx8", UInt(16, 8), "saturating_add", {UInt(16, 8), UInt(16, 8)}}, + {"llvm.usub.sat.v32i16", UInt(16, 32), "saturating_sub", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Cannonlake}, + {"llvm.usub.sat.v32i16", UInt(16, 32), "saturating_sub", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Skylake}, {"psubuswx16", UInt(16, 16), "saturating_sub", {UInt(16, 16), UInt(16, 16)}, Target::AVX}, {"psubuswx8", UInt(16, 8), "saturating_sub", {UInt(16, 8), UInt(16, 8)}}, @@ -180,14 +196,18 @@ const x86Intrinsic intrinsic_defs[] = { {"wmul_pmaddwd_sse2", Int(32, 4), "widening_mul", {Int(16, 4), Int(16, 4)}}, // Multiply keep high half + {"llvm.x86.avx512.pmulh.w.512", Int(16, 32), "pmulh", {Int(16, 32), Int(16, 32)}, Target::AVX512_Cannonlake}, + {"llvm.x86.avx512.pmulh.w.512", Int(16, 32), "pmulh", {Int(16, 32), Int(16, 32)}, Target::AVX512_Skylake}, {"llvm.x86.avx2.pmulh.w", Int(16, 16), "pmulh", {Int(16, 16), Int(16, 16)}, Target::AVX2}, + {"llvm.x86.avx512.pmulhu.w.512", UInt(16, 32), "pmulh", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Cannonlake}, + {"llvm.x86.avx512.pmulhu.w.512", UInt(16, 32), "pmulh", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Skylake}, {"llvm.x86.avx2.pmulhu.w", UInt(16, 16), "pmulh", {UInt(16, 16), UInt(16, 16)}, Target::AVX2}, + {"llvm.x86.avx512.pmul.hr.sw.512", Int(16, 32), "pmulhrs", {Int(16, 32), Int(16, 32)}, Target::AVX512_Cannonlake}, + {"llvm.x86.avx512.pmul.hr.sw.512", Int(16, 32), "pmulhrs", {Int(16, 32), Int(16, 32)}, Target::AVX512_Skylake}, {"llvm.x86.avx2.pmul.hr.sw", Int(16, 16), "pmulhrs", {Int(16, 16), Int(16, 16)}, Target::AVX2}, - {"saturating_pmulhrswx16", Int(16, 16), "saturating_pmulhrs", {Int(16, 16), Int(16, 16)}, Target::AVX2}, {"llvm.x86.sse2.pmulh.w", Int(16, 8), "pmulh", {Int(16, 8), Int(16, 8)}}, {"llvm.x86.sse2.pmulhu.w", UInt(16, 8), "pmulh", {UInt(16, 8), UInt(16, 8)}}, {"llvm.x86.ssse3.pmul.hr.sw.128", Int(16, 8), "pmulhrs", {Int(16, 8), Int(16, 8)}, Target::SSE41}, - {"saturating_pmulhrswx8", Int(16, 8), "saturating_pmulhrs", {Int(16, 8), Int(16, 8)}, Target::SSE41}, // Convert FP32 to BF16 
{"vcvtne2ps2bf16x32", BFloat(16, 32), "f32_to_bf16", {Float(32, 32)}, Target::AVX512_SapphireRapids}, @@ -582,7 +602,6 @@ void CodeGen_X86::visit(const Call *op) { static Pattern patterns[] = { {"pmulh", mul_shift_right(wild_i16x_, wild_i16x_, 16)}, {"pmulh", mul_shift_right(wild_u16x_, wild_u16x_, 16)}, - {"saturating_pmulhrs", rounding_mul_shift_right(wild_i16x_, wild_i16x_, 15)}, {"saturating_narrow", i16_sat(wild_i32x_)}, {"saturating_narrow", u16_sat(wild_i32x_)}, {"saturating_narrow", i8_sat(wild_i16x_)}, @@ -600,6 +619,22 @@ void CodeGen_X86::visit(const Call *op) { } } + // Special case of saturating_pmulhrs. + static Expr saturating_pmulhrs = rounding_mul_shift_right(wild_i16x_, wild_i16x_, 15); + if (expr_match(saturating_pmulhrs, op, matches)) { + // Rewrite so that we can take advantage of pmulhrs. + internal_assert(matches.size() == 2); + const Expr &a = matches[0]; + const Expr &b = matches[1]; + Expr pmulhrs = i16(rounding_shift_right(widening_mul(a, b), 15)); + // Handle edge case of possible overflow. + Expr i16_min = op->type.min(); + Expr i16_max = op->type.max(); + Expr expr = select((a == i16_min) && (b == i16_min), i16_max, pmulhrs); + expr.accept(this); + return; + } + CodeGen_Posix::visit(op); } diff --git a/src/runtime/x86_avx2.ll b/src/runtime/x86_avx2.ll index d4d88be839c6..2de5d160d9be 100644 --- a/src/runtime/x86_avx2.ll +++ b/src/runtime/x86_avx2.ll @@ -31,37 +31,6 @@ define weak_odr <16 x i16> @packusdwx16(<16 x i32> %arg) nounwind alwaysinline } declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone -define weak_odr <32 x i8> @abs_i8x32(<32 x i8> %arg) { - %1 = sub <32 x i8> zeroinitializer, %arg - %2 = icmp sgt <32 x i8> %arg, zeroinitializer - %3 = select <32 x i1> %2, <32 x i8> %arg, <32 x i8> %1 - ret <32 x i8> %3 -} - -define weak_odr <16 x i16> @abs_i16x16(<16 x i16> %arg) { - %1 = sub <16 x i16> zeroinitializer, %arg - %2 = icmp sgt <16 x i16> %arg, zeroinitializer - %3 = select <16 x i1> %2, <16 x i16> %arg, <16 x i16> %1 - ret <16 x i16> %3 -} - -define weak_odr <8 x i32> @abs_i32x8(<8 x i32> %arg) { - %1 = sub <8 x i32> zeroinitializer, %arg - %2 = icmp sgt <8 x i32> %arg, zeroinitializer - %3 = select <8 x i1> %2, <8 x i32> %arg, <8 x i32> %1 - ret <8 x i32> %3 -} - -define weak_odr <16 x i16> @saturating_pmulhrswx16(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone alwaysinline { - %1 = tail call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %a, <16 x i16> %b) - %2 = icmp eq <16 x i16> %a, - %3 = icmp eq <16 x i16> %b, - %4 = and <16 x i1> %2, %3 - %5 = select <16 x i1> %4, <16 x i16> , <16 x i16> %1 - ret <16 x i16> %5 -} -declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone - define weak_odr <16 x i16> @hadd_pmadd_u8_avx2(<32 x i8> %a) nounwind alwaysinline { %1 = tail call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a, <32 x i8> ) ret <16 x i16> %1 diff --git a/src/runtime/x86_sse41.ll b/src/runtime/x86_sse41.ll index d181de3d67e8..4b8bfb73aea9 100644 --- a/src/runtime/x86_sse41.ll +++ b/src/runtime/x86_sse41.ll @@ -51,37 +51,6 @@ define weak_odr <2 x double> @trunc_f64x2(<2 x double> %x) nounwind uwtable read ret <2 x double> %1 } -define weak_odr <16 x i8> @abs_i8x16(<16 x i8> %x) nounwind uwtable readnone alwaysinline { - %1 = sub <16 x i8> zeroinitializer, %x - %2 = icmp sgt <16 x i8> %x, zeroinitializer - %3 = select <16 x i1> %2, <16 x i8> %x, <16 x i8> %1 - ret <16 x i8> %3 -} - -define weak_odr <8 x i16> @abs_i16x8(<8 x i16> %x) nounwind uwtable readnone 
alwaysinline { - %1 = sub <8 x i16> zeroinitializer, %x - %2 = icmp sgt <8 x i16> %x, zeroinitializer - %3 = select <8 x i1> %2, <8 x i16> %x, <8 x i16> %1 - ret <8 x i16> %3 -} - -define weak_odr <4 x i32> @abs_i32x4(<4 x i32> %x) nounwind uwtable readnone alwaysinline { - %1 = sub <4 x i32> zeroinitializer, %x - %2 = icmp sgt <4 x i32> %x, zeroinitializer - %3 = select <4 x i1> %2, <4 x i32> %x, <4 x i32> %1 - ret <4 x i32> %3 -} - -define weak_odr <8 x i16> @saturating_pmulhrswx8(<8 x i16> %a, <8 x i16> %b) nounwind uwtable readnone alwaysinline { - %1 = tail call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a, <8 x i16> %b) - %2 = icmp eq <8 x i16> %a, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768> - %3 = icmp eq <8 x i16> %b, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768> - %4 = and <8 x i1> %2, %3 - %5 = select <8 x i1> %4, <8 x i16> <i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767>, <8 x i16> %1 - ret <8 x i16> %5 -} -declare <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16>, <8 x i16>) nounwind readnone - define weak_odr <8 x i16> @hadd_pmadd_u8_sse3(<16 x i8> %a) nounwind alwaysinline { %1 = tail call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>) ret <8 x i16> %1 From 5e25f9376920b50ab5697a0b06a999a960efe116 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 26 Oct 2022 15:39:53 -0700 Subject: [PATCH 02/13] add test coverage for AVX512 fp ops --- test/correctness/simd_op_check.cpp | 80 ++++++++++++++++-------------- 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/test/correctness/simd_op_check.cpp b/test/correctness/simd_op_check.cpp index e6c59b45f98f..3993db7de1ed 100644 --- a/test/correctness/simd_op_check.cpp +++ b/test/correctness/simd_op_check.cpp @@ -448,42 +448,50 @@ class SimdOpCheck : public SimdOpCheckTest { // AVX 2 if (use_avx2) { - check("vpaddb*ymm", 32, u8_1 + u8_2); - check("vpsubb*ymm", 32, u8_1 - u8_2); - check("vpaddsb*ymm", 32, i8_sat(i16(i8_1) + i16(i8_2))); - check("vpsubsb*ymm", 32, i8_sat(i16(i8_1) - i16(i8_2))); - check("vpaddusb*ymm", 32, u8(min(u16(u8_1) + u16(u8_2), max_u8))); - check("vpsubusb*ymm", 32, u8(max(i16(u8_1) - i16(u8_2), 0))); - check("vpaddw*ymm", 16, u16_1 + u16_2); - check("vpsubw*ymm", 16, u16_1 - u16_2); - check("vpaddsw*ymm", 16, i16_sat(i32(i16_1) + i32(i16_2))); - check("vpsubsw*ymm", 16, i16_sat(i32(i16_1) - i32(i16_2))); - check("vpaddusw*ymm", 16, u16(min(u32(u16_1) + u32(u16_2), max_u16))); - check("vpsubusw*ymm", 16, u16(max(i32(u16_1) - i32(u16_2), 0))); - check("vpaddd*ymm", 8, i32_1 + i32_2); - check("vpsubd*ymm", 8, i32_1 - i32_2); - check("vpmulhw*ymm", 16, i16((i32(i16_1) * i32(i16_2)) / (256 * 256))); - check("vpmulhw*ymm", 16, i16((i32(i16_1) * i32(i16_2)) >> cast<unsigned>(16))); - check("vpmulhw*ymm", 16, i16((i32(i16_1) * i32(i16_2)) >> cast<int>(16))); - check("vpmulhw*ymm", 16, i16((i32(i16_1) * i32(i16_2)) << cast<int>(-16))); - check("vpmullw*ymm", 16, i16_1 * i16_2); - - check("vpmulhrsw*ymm", 16, i16((((i32(i16_1) * i32(i16_2)) + 16384)) / 32768)); - check("vpmulhrsw*ymm", 16, i16_sat((((i32(i16_1) * i32(i16_2)) + 16384)) / 32768)); - - check("vpcmp*b*ymm", 32, select(u8_1 == u8_2, u8(1), u8(2))); - check("vpcmp*b*ymm", 32, select(u8_1 > u8_2, u8(1), u8(2))); - check("vpcmp*w*ymm", 16, select(u16_1 == u16_2, u16(1), u16(2))); - check("vpcmp*w*ymm", 16, select(u16_1 > u16_2, u16(1), u16(2))); - check("vpcmp*d*ymm", 8, select(u32_1 == u32_2, u32(1), u32(2))); - check("vpcmp*d*ymm", 8, select(u32_1 > u32_2, u32(1), u32(2))); - - check("vpavgb*ymm", 32, u8((u16(u8_1) + u16(u8_2) + 1) / 2)); - check("vpavgw*ymm", 16, u16((u32(u16_1) + u32(u16_2) + 1) / 2)); - check("vpmaxsw*ymm", 16, max(i16_1, i16_2)); -
check("vpminsw*ymm", 16, min(i16_1, i16_2)); - check("vpmaxub*ymm", 32, max(u8_1, u8_2)); - check("vpminub*ymm", 32, min(u8_1, u8_2)); + auto check_x86_fixed_point = [&](const std::string &suffix, const int m) { + check("vpaddb*" + suffix, 32 * m,u8_1 + u8_2); + check("vpsubb*" + suffix, 32 * m,u8_1 - u8_2); + check("vpaddsb*" + suffix, 32 * m,i8_sat(i16(i8_1) + i16(i8_2))); + check("vpsubsb*" + suffix, 32 * m,i8_sat(i16(i8_1) - i16(i8_2))); + check("vpaddusb*" + suffix, 32 * m,u8(min(u16(u8_1) + u16(u8_2), max_u8))); + check("vpsubusb*" + suffix, 32 * m,u8(max(i16(u8_1) - i16(u8_2), 0))); + check("vpaddw*" + suffix, 16 * m,u16_1 + u16_2); + check("vpsubw*" + suffix, 16 * m,u16_1 - u16_2); + check("vpaddsw*" + suffix, 16 * m,i16_sat(i32(i16_1) + i32(i16_2))); + check("vpsubsw*" + suffix, 16 * m,i16_sat(i32(i16_1) - i32(i16_2))); + check("vpaddusw*" + suffix, 16 * m,u16(min(u32(u16_1) + u32(u16_2), max_u16))); + check("vpsubusw*" + suffix, 16 * m,u16(max(i32(u16_1) - i32(u16_2), 0))); + check("vpaddd*" + suffix, 8 * m,i32_1 + i32_2); + check("vpsubd*" + suffix, 8 * m,i32_1 - i32_2); + check("vpmulhw*" + suffix, 16 * m,i16((i32(i16_1) * i32(i16_2)) / (256 * 256))); + check("vpmulhw*" + suffix, 16 * m,i16((i32(i16_1) * i32(i16_2)) >> cast(16))); + check("vpmulhw*" + suffix, 16 * m,i16((i32(i16_1) * i32(i16_2)) >> cast(16))); + check("vpmulhw*" + suffix, 16 * m,i16((i32(i16_1) * i32(i16_2)) << cast(-16))); + check("vpmullw*" + suffix, 16 * m,i16_1 * i16_2); + + check("vpmulhrsw*" + suffix, 16 * m,i16((((i32(i16_1) * i32(i16_2)) + 16384)) / 32768)); + check("vpmulhrsw*" + suffix, 16 * m,i16_sat((((i32(i16_1) * i32(i16_2)) + 16384)) / 32768)); + + check("vpcmp*b*" + suffix, 32 * m,select(u8_1 == u8_2, u8(1), u8(2))); + check("vpcmp*b*" + suffix, 32 * m,select(u8_1 > u8_2, u8(1), u8(2))); + check("vpcmp*w*" + suffix, 16 * m,select(u16_1 == u16_2, u16(1), u16(2))); + check("vpcmp*w*" + suffix, 16 * m,select(u16_1 > u16_2, u16(1), u16(2))); + check("vpcmp*d*" + suffix, 8 * m,select(u32_1 == u32_2, u32(1), u32(2))); + check("vpcmp*d*" + suffix, 8 * m,select(u32_1 > u32_2, u32(1), u32(2))); + + check("vpavgb*" + suffix, 32 * m,u8((u16(u8_1) + u16(u8_2) + 1) / 2)); + check("vpavgw*" + suffix, 16 * m,u16((u32(u16_1) + u32(u16_2) + 1) / 2)); + check("vpmaxsw*" + suffix, 16 * m,max(i16_1, i16_2)); + check("vpminsw*" + suffix, 16 * m,min(i16_1, i16_2)); + check("vpmaxub*" + suffix, 32 * m,max(u8_1, u8_2)); + check("vpminub*" + suffix, 32 * m,min(u8_1, u8_2)); + }; + + check_x86_fixed_point("ymm", 1); + + if (use_avx512) { + check_x86_fixed_point("zmm", 2); + } check(use_avx512 ? "vpaddq*zmm" : "vpaddq*ymm", 8, i64_1 + i64_2); check(use_avx512 ? 
"vpsubq*zmm" : "vpsubq*ymm", 8, i64_1 - i64_2); From 5d10a4a74a542b2ea84c9d2539639968b464c9d8 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 26 Oct 2022 15:52:55 -0700 Subject: [PATCH 03/13] clang format --- test/correctness/simd_op_check.cpp | 72 +++++++++++++++--------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/test/correctness/simd_op_check.cpp b/test/correctness/simd_op_check.cpp index 3993db7de1ed..5e2fc4b449f8 100644 --- a/test/correctness/simd_op_check.cpp +++ b/test/correctness/simd_op_check.cpp @@ -449,42 +449,42 @@ class SimdOpCheck : public SimdOpCheckTest { if (use_avx2) { auto check_x86_fixed_point = [&](const std::string &suffix, const int m) { - check("vpaddb*" + suffix, 32 * m,u8_1 + u8_2); - check("vpsubb*" + suffix, 32 * m,u8_1 - u8_2); - check("vpaddsb*" + suffix, 32 * m,i8_sat(i16(i8_1) + i16(i8_2))); - check("vpsubsb*" + suffix, 32 * m,i8_sat(i16(i8_1) - i16(i8_2))); - check("vpaddusb*" + suffix, 32 * m,u8(min(u16(u8_1) + u16(u8_2), max_u8))); - check("vpsubusb*" + suffix, 32 * m,u8(max(i16(u8_1) - i16(u8_2), 0))); - check("vpaddw*" + suffix, 16 * m,u16_1 + u16_2); - check("vpsubw*" + suffix, 16 * m,u16_1 - u16_2); - check("vpaddsw*" + suffix, 16 * m,i16_sat(i32(i16_1) + i32(i16_2))); - check("vpsubsw*" + suffix, 16 * m,i16_sat(i32(i16_1) - i32(i16_2))); - check("vpaddusw*" + suffix, 16 * m,u16(min(u32(u16_1) + u32(u16_2), max_u16))); - check("vpsubusw*" + suffix, 16 * m,u16(max(i32(u16_1) - i32(u16_2), 0))); - check("vpaddd*" + suffix, 8 * m,i32_1 + i32_2); - check("vpsubd*" + suffix, 8 * m,i32_1 - i32_2); - check("vpmulhw*" + suffix, 16 * m,i16((i32(i16_1) * i32(i16_2)) / (256 * 256))); - check("vpmulhw*" + suffix, 16 * m,i16((i32(i16_1) * i32(i16_2)) >> cast(16))); - check("vpmulhw*" + suffix, 16 * m,i16((i32(i16_1) * i32(i16_2)) >> cast(16))); - check("vpmulhw*" + suffix, 16 * m,i16((i32(i16_1) * i32(i16_2)) << cast(-16))); - check("vpmullw*" + suffix, 16 * m,i16_1 * i16_2); - - check("vpmulhrsw*" + suffix, 16 * m,i16((((i32(i16_1) * i32(i16_2)) + 16384)) / 32768)); - check("vpmulhrsw*" + suffix, 16 * m,i16_sat((((i32(i16_1) * i32(i16_2)) + 16384)) / 32768)); - - check("vpcmp*b*" + suffix, 32 * m,select(u8_1 == u8_2, u8(1), u8(2))); - check("vpcmp*b*" + suffix, 32 * m,select(u8_1 > u8_2, u8(1), u8(2))); - check("vpcmp*w*" + suffix, 16 * m,select(u16_1 == u16_2, u16(1), u16(2))); - check("vpcmp*w*" + suffix, 16 * m,select(u16_1 > u16_2, u16(1), u16(2))); - check("vpcmp*d*" + suffix, 8 * m,select(u32_1 == u32_2, u32(1), u32(2))); - check("vpcmp*d*" + suffix, 8 * m,select(u32_1 > u32_2, u32(1), u32(2))); - - check("vpavgb*" + suffix, 32 * m,u8((u16(u8_1) + u16(u8_2) + 1) / 2)); - check("vpavgw*" + suffix, 16 * m,u16((u32(u16_1) + u32(u16_2) + 1) / 2)); - check("vpmaxsw*" + suffix, 16 * m,max(i16_1, i16_2)); - check("vpminsw*" + suffix, 16 * m,min(i16_1, i16_2)); - check("vpmaxub*" + suffix, 32 * m,max(u8_1, u8_2)); - check("vpminub*" + suffix, 32 * m,min(u8_1, u8_2)); + check("vpaddb*" + suffix, 32 * m, u8_1 + u8_2); + check("vpsubb*" + suffix, 32 * m, u8_1 - u8_2); + check("vpaddsb*" + suffix, 32 * m, i8_sat(i16(i8_1) + i16(i8_2))); + check("vpsubsb*" + suffix, 32 * m, i8_sat(i16(i8_1) - i16(i8_2))); + check("vpaddusb*" + suffix, 32 * m, u8(min(u16(u8_1) + u16(u8_2), max_u8))); + check("vpsubusb*" + suffix, 32 * m, u8(max(i16(u8_1) - i16(u8_2), 0))); + check("vpaddw*" + suffix, 16 * m, u16_1 + u16_2); + check("vpsubw*" + suffix, 16 * m, u16_1 - u16_2); + check("vpaddsw*" + suffix, 16 * m, i16_sat(i32(i16_1) + i32(i16_2))); + 
check("vpsubsw*" + suffix, 16 * m, i16_sat(i32(i16_1) - i32(i16_2))); + check("vpaddusw*" + suffix, 16 * m, u16(min(u32(u16_1) + u32(u16_2), max_u16))); + check("vpsubusw*" + suffix, 16 * m, u16(max(i32(u16_1) - i32(u16_2), 0))); + check("vpaddd*" + suffix, 8 * m, i32_1 + i32_2); + check("vpsubd*" + suffix, 8 * m, i32_1 - i32_2); + check("vpmulhw*" + suffix, 16 * m, i16((i32(i16_1) * i32(i16_2)) / (256 * 256))); + check("vpmulhw*" + suffix, 16 * m, i16((i32(i16_1) * i32(i16_2)) >> cast(16))); + check("vpmulhw*" + suffix, 16 * m, i16((i32(i16_1) * i32(i16_2)) >> cast(16))); + check("vpmulhw*" + suffix, 16 * m, i16((i32(i16_1) * i32(i16_2)) << cast(-16))); + check("vpmullw*" + suffix, 16 * m, i16_1 * i16_2); + + check("vpmulhrsw*" + suffix, 16 * m, i16((((i32(i16_1) * i32(i16_2)) + 16384)) / 32768)); + check("vpmulhrsw*" + suffix, 16 * m, i16_sat((((i32(i16_1) * i32(i16_2)) + 16384)) / 32768)); + + check("vpcmp*b*" + suffix, 32 * m, select(u8_1 == u8_2, u8(1), u8(2))); + check("vpcmp*b*" + suffix, 32 * m, select(u8_1 > u8_2, u8(1), u8(2))); + check("vpcmp*w*" + suffix, 16 * m, select(u16_1 == u16_2, u16(1), u16(2))); + check("vpcmp*w*" + suffix, 16 * m, select(u16_1 > u16_2, u16(1), u16(2))); + check("vpcmp*d*" + suffix, 8 * m, select(u32_1 == u32_2, u32(1), u32(2))); + check("vpcmp*d*" + suffix, 8 * m, select(u32_1 > u32_2, u32(1), u32(2))); + + check("vpavgb*" + suffix, 32 * m, u8((u16(u8_1) + u16(u8_2) + 1) / 2)); + check("vpavgw*" + suffix, 16 * m, u16((u32(u16_1) + u32(u16_2) + 1) / 2)); + check("vpmaxsw*" + suffix, 16 * m, max(i16_1, i16_2)); + check("vpminsw*" + suffix, 16 * m, min(i16_1, i16_2)); + check("vpmaxub*" + suffix, 32 * m, max(u8_1, u8_2)); + check("vpminub*" + suffix, 32 * m, min(u8_1, u8_2)); }; check_x86_fixed_point("ymm", 1); From 060ec46c7251364e282758a7db8a4d164b17e7e7 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 26 Oct 2022 17:16:01 -0700 Subject: [PATCH 04/13] update comment on saturating_pmulhrs --- src/CodeGen_X86.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index d29e27c2b3c7..e38a82f556f9 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -619,7 +619,9 @@ void CodeGen_X86::visit(const Call *op) { } } - // Special case of saturating_pmulhrs. + // Check for saturating_pmulhrs. On x86, pmulhrs is truncating, but it's still faster + // to use pmulhrs then to lower (producing widening multiplication), and have a check + // for the singular overflow case. static Expr saturating_pmulhrs = rounding_mul_shift_right(wild_i16x_, wild_i16x_, 15); if (expr_match(saturating_pmulhrs, op, matches)) { // Rewrite so that we can take advantage of pmulhrs. From 4ed4216fcf247c4395df9f13604be7db3d276953 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 26 Oct 2022 17:17:19 -0700 Subject: [PATCH 05/13] comment typo --- src/CodeGen_X86.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index e38a82f556f9..ca8c3154eaec 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -620,7 +620,7 @@ void CodeGen_X86::visit(const Call *op) { } // Check for saturating_pmulhrs. On x86, pmulhrs is truncating, but it's still faster - // to use pmulhrs then to lower (producing widening multiplication), and have a check + // to use pmulhrs than to lower (producing widening multiplication), and have a check // for the singular overflow case. 
static Expr saturating_pmulhrs = rounding_mul_shift_right(wild_i16x_, wild_i16x_, 15); if (expr_match(saturating_pmulhrs, op, matches)) { From 6f12b544aa2e93e0341c58e369e6353fa87ad17f Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 26 Oct 2022 17:31:46 -0700 Subject: [PATCH 06/13] add comments explaining duplicate Skylake/Cannonlake intrinsics --- src/CodeGen_X86.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index ca8c3154eaec..d36412534289 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -125,21 +125,25 @@ const x86Intrinsic intrinsic_defs[] = { {"round_f32x8", Float(32, 8), "round", {Float(32, 8)}, Target::AVX}, {"round_f64x4", Float(64, 4), "round", {Float(64, 4)}, Target::AVX}, + // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_add. {"llvm.sadd.sat.v64i8", Int(8, 64), "saturating_add", {Int(8, 64), Int(8, 64)}, Target::AVX512_Cannonlake}, {"llvm.sadd.sat.v64i8", Int(8, 64), "saturating_add", {Int(8, 64), Int(8, 64)}, Target::AVX512_Skylake}, {"llvm.sadd.sat.v32i8", Int(8, 32), "saturating_add", {Int(8, 32), Int(8, 32)}, Target::AVX2}, {"llvm.sadd.sat.v16i8", Int(8, 16), "saturating_add", {Int(8, 16), Int(8, 16)}}, {"llvm.sadd.sat.v8i8", Int(8, 8), "saturating_add", {Int(8, 8), Int(8, 8)}}, + // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_sub. {"llvm.ssub.sat.v64i8", Int(8, 64), "saturating_sub", {Int(8, 64), Int(8, 64)}, Target::AVX512_Cannonlake}, {"llvm.ssub.sat.v64i8", Int(8, 64), "saturating_sub", {Int(8, 64), Int(8, 64)}, Target::AVX512_Skylake}, {"llvm.ssub.sat.v32i8", Int(8, 32), "saturating_sub", {Int(8, 32), Int(8, 32)}, Target::AVX2}, {"llvm.ssub.sat.v16i8", Int(8, 16), "saturating_sub", {Int(8, 16), Int(8, 16)}}, {"llvm.ssub.sat.v8i8", Int(8, 8), "saturating_sub", {Int(8, 8), Int(8, 8)}}, + // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_add. {"llvm.sadd.sat.v32i16", Int(16, 32), "saturating_add", {Int(16, 32), Int(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.sadd.sat.v32i16", Int(16, 32), "saturating_add", {Int(16, 32), Int(16, 32)}, Target::AVX512_Skylake}, {"llvm.sadd.sat.v16i16", Int(16, 16), "saturating_add", {Int(16, 16), Int(16, 16)}, Target::AVX2}, {"llvm.sadd.sat.v8i16", Int(16, 8), "saturating_add", {Int(16, 8), Int(16, 8)}}, + // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_sub. {"llvm.ssub.sat.v32i16", Int(16, 32), "saturating_sub", {Int(16, 32), Int(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.ssub.sat.v32i16", Int(16, 32), "saturating_sub", {Int(16, 32), Int(16, 32)}, Target::AVX512_Skylake}, {"llvm.ssub.sat.v16i16", Int(16, 16), "saturating_sub", {Int(16, 16), Int(16, 16)}, Target::AVX2}, @@ -157,19 +161,23 @@ const x86Intrinsic intrinsic_defs[] = { // Target::AVX instead of Target::AVX2 as the feature flag // requirement. // TODO: Just use llvm.*add/*sub.sat, and verify the above comment? + // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_add. {"llvm.uadd.sat.v64i8", UInt(8, 64), "saturating_add", {UInt(8, 64), UInt(8, 64)}, Target::AVX512_Cannonlake}, {"llvm.uadd.sat.v64i8", UInt(8, 64), "saturating_add", {UInt(8, 64), UInt(8, 64)}, Target::AVX512_Skylake}, {"paddusbx32", UInt(8, 32), "saturating_add", {UInt(8, 32), UInt(8, 32)}, Target::AVX}, {"paddusbx16", UInt(8, 16), "saturating_add", {UInt(8, 16), UInt(8, 16)}}, + // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_sub. 
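// (AVX512_KNL is not listed here: Knights Landing has AVX512F but lacks AVX512BW,
// and these 512-bit byte/word operations require AVX512BW.)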
{"llvm.usub.sat.v64i8", UInt(8, 64), "saturating_sub", {UInt(8, 64), UInt(8, 64)}, Target::AVX512_Cannonlake}, {"llvm.usub.sat.v64i8", UInt(8, 64), "saturating_sub", {UInt(8, 64), UInt(8, 64)}, Target::AVX512_Skylake}, {"psubusbx32", UInt(8, 32), "saturating_sub", {UInt(8, 32), UInt(8, 32)}, Target::AVX}, {"psubusbx16", UInt(8, 16), "saturating_sub", {UInt(8, 16), UInt(8, 16)}}, + // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_add. {"llvm.uadd.sat.v32i16", UInt(16, 32), "saturating_add", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.uadd.sat.v32i16", UInt(16, 32), "saturating_add", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Skylake}, {"padduswx16", UInt(16, 16), "saturating_add", {UInt(16, 16), UInt(16, 16)}, Target::AVX}, {"padduswx8", UInt(16, 8), "saturating_add", {UInt(16, 8), UInt(16, 8)}}, + // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_sub. {"llvm.usub.sat.v32i16", UInt(16, 32), "saturating_sub", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.usub.sat.v32i16", UInt(16, 32), "saturating_sub", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Skylake}, {"psubuswx16", UInt(16, 16), "saturating_sub", {UInt(16, 16), UInt(16, 16)}, Target::AVX}, @@ -196,12 +204,15 @@ const x86Intrinsic intrinsic_defs[] = { {"wmul_pmaddwd_sse2", Int(32, 4), "widening_mul", {Int(16, 4), Int(16, 4)}}, // Multiply keep high half + // Cannonlake and Skylake support AVX512BW, allowing for 512-bit pmulh. {"llvm.x86.avx512.pmulh.w.512", Int(16, 32), "pmulh", {Int(16, 32), Int(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.x86.avx512.pmulh.w.512", Int(16, 32), "pmulh", {Int(16, 32), Int(16, 32)}, Target::AVX512_Skylake}, {"llvm.x86.avx2.pmulh.w", Int(16, 16), "pmulh", {Int(16, 16), Int(16, 16)}, Target::AVX2}, + // Cannonlake and Skylake support AVX512BW, allowing for 512-bit pmulh. {"llvm.x86.avx512.pmulhu.w.512", UInt(16, 32), "pmulh", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.x86.avx512.pmulhu.w.512", UInt(16, 32), "pmulh", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Skylake}, {"llvm.x86.avx2.pmulhu.w", UInt(16, 16), "pmulh", {UInt(16, 16), UInt(16, 16)}, Target::AVX2}, + // Cannonlake and Skylake support AVX512BW, allowing for 512-bit pmulhrs. {"llvm.x86.avx512.pmul.hr.sw.512", Int(16, 32), "pmulhrs", {Int(16, 32), Int(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.x86.avx512.pmul.hr.sw.512", Int(16, 32), "pmulhrs", {Int(16, 32), Int(16, 32)}, Target::AVX512_Skylake}, {"llvm.x86.avx2.pmul.hr.sw", Int(16, 16), "pmulhrs", {Int(16, 16), Int(16, 16)}, Target::AVX2}, From f6eb8aad4d6fbfebf2c077bd7cab71677dff8346 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 26 Oct 2022 17:34:58 -0700 Subject: [PATCH 07/13] remove Cannonlake duplicates --- src/CodeGen_X86.cpp | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index d36412534289..95473452b985 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -125,26 +125,18 @@ const x86Intrinsic intrinsic_defs[] = { {"round_f32x8", Float(32, 8), "round", {Float(32, 8)}, Target::AVX}, {"round_f64x4", Float(64, 4), "round", {Float(64, 4)}, Target::AVX}, - // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_add. 
- {"llvm.sadd.sat.v64i8", Int(8, 64), "saturating_add", {Int(8, 64), Int(8, 64)}, Target::AVX512_Cannonlake}, {"llvm.sadd.sat.v64i8", Int(8, 64), "saturating_add", {Int(8, 64), Int(8, 64)}, Target::AVX512_Skylake}, {"llvm.sadd.sat.v32i8", Int(8, 32), "saturating_add", {Int(8, 32), Int(8, 32)}, Target::AVX2}, {"llvm.sadd.sat.v16i8", Int(8, 16), "saturating_add", {Int(8, 16), Int(8, 16)}}, {"llvm.sadd.sat.v8i8", Int(8, 8), "saturating_add", {Int(8, 8), Int(8, 8)}}, - // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_sub. - {"llvm.ssub.sat.v64i8", Int(8, 64), "saturating_sub", {Int(8, 64), Int(8, 64)}, Target::AVX512_Cannonlake}, {"llvm.ssub.sat.v64i8", Int(8, 64), "saturating_sub", {Int(8, 64), Int(8, 64)}, Target::AVX512_Skylake}, {"llvm.ssub.sat.v32i8", Int(8, 32), "saturating_sub", {Int(8, 32), Int(8, 32)}, Target::AVX2}, {"llvm.ssub.sat.v16i8", Int(8, 16), "saturating_sub", {Int(8, 16), Int(8, 16)}}, {"llvm.ssub.sat.v8i8", Int(8, 8), "saturating_sub", {Int(8, 8), Int(8, 8)}}, - // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_add. - {"llvm.sadd.sat.v32i16", Int(16, 32), "saturating_add", {Int(16, 32), Int(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.sadd.sat.v32i16", Int(16, 32), "saturating_add", {Int(16, 32), Int(16, 32)}, Target::AVX512_Skylake}, {"llvm.sadd.sat.v16i16", Int(16, 16), "saturating_add", {Int(16, 16), Int(16, 16)}, Target::AVX2}, {"llvm.sadd.sat.v8i16", Int(16, 8), "saturating_add", {Int(16, 8), Int(16, 8)}}, - // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_sub. - {"llvm.ssub.sat.v32i16", Int(16, 32), "saturating_sub", {Int(16, 32), Int(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.ssub.sat.v32i16", Int(16, 32), "saturating_sub", {Int(16, 32), Int(16, 32)}, Target::AVX512_Skylake}, {"llvm.ssub.sat.v16i16", Int(16, 16), "saturating_sub", {Int(16, 16), Int(16, 16)}, Target::AVX2}, {"llvm.ssub.sat.v8i16", Int(16, 8), "saturating_sub", {Int(16, 8), Int(16, 8)}}, @@ -161,24 +153,16 @@ const x86Intrinsic intrinsic_defs[] = { // Target::AVX instead of Target::AVX2 as the feature flag // requirement. // TODO: Just use llvm.*add/*sub.sat, and verify the above comment? - // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_add. - {"llvm.uadd.sat.v64i8", UInt(8, 64), "saturating_add", {UInt(8, 64), UInt(8, 64)}, Target::AVX512_Cannonlake}, {"llvm.uadd.sat.v64i8", UInt(8, 64), "saturating_add", {UInt(8, 64), UInt(8, 64)}, Target::AVX512_Skylake}, {"paddusbx32", UInt(8, 32), "saturating_add", {UInt(8, 32), UInt(8, 32)}, Target::AVX}, {"paddusbx16", UInt(8, 16), "saturating_add", {UInt(8, 16), UInt(8, 16)}}, - // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_sub. - {"llvm.usub.sat.v64i8", UInt(8, 64), "saturating_sub", {UInt(8, 64), UInt(8, 64)}, Target::AVX512_Cannonlake}, {"llvm.usub.sat.v64i8", UInt(8, 64), "saturating_sub", {UInt(8, 64), UInt(8, 64)}, Target::AVX512_Skylake}, {"psubusbx32", UInt(8, 32), "saturating_sub", {UInt(8, 32), UInt(8, 32)}, Target::AVX}, {"psubusbx16", UInt(8, 16), "saturating_sub", {UInt(8, 16), UInt(8, 16)}}, - // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_add. 
- {"llvm.uadd.sat.v32i16", UInt(16, 32), "saturating_add", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.uadd.sat.v32i16", UInt(16, 32), "saturating_add", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Skylake}, {"padduswx16", UInt(16, 16), "saturating_add", {UInt(16, 16), UInt(16, 16)}, Target::AVX}, {"padduswx8", UInt(16, 8), "saturating_add", {UInt(16, 8), UInt(16, 8)}}, - // Cannonlake and Skylake support AVX512BW, allowing for 512-bit saturating_sub. - {"llvm.usub.sat.v32i16", UInt(16, 32), "saturating_sub", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.usub.sat.v32i16", UInt(16, 32), "saturating_sub", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Skylake}, {"psubuswx16", UInt(16, 16), "saturating_sub", {UInt(16, 16), UInt(16, 16)}, Target::AVX}, {"psubuswx8", UInt(16, 8), "saturating_sub", {UInt(16, 8), UInt(16, 8)}}, @@ -204,16 +188,10 @@ const x86Intrinsic intrinsic_defs[] = { {"wmul_pmaddwd_sse2", Int(32, 4), "widening_mul", {Int(16, 4), Int(16, 4)}}, // Multiply keep high half - // Cannonlake and Skylake support AVX512BW, allowing for 512-bit pmulh. - {"llvm.x86.avx512.pmulh.w.512", Int(16, 32), "pmulh", {Int(16, 32), Int(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.x86.avx512.pmulh.w.512", Int(16, 32), "pmulh", {Int(16, 32), Int(16, 32)}, Target::AVX512_Skylake}, {"llvm.x86.avx2.pmulh.w", Int(16, 16), "pmulh", {Int(16, 16), Int(16, 16)}, Target::AVX2}, - // Cannonlake and Skylake support AVX512BW, allowing for 512-bit pmulh. - {"llvm.x86.avx512.pmulhu.w.512", UInt(16, 32), "pmulh", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.x86.avx512.pmulhu.w.512", UInt(16, 32), "pmulh", {UInt(16, 32), UInt(16, 32)}, Target::AVX512_Skylake}, {"llvm.x86.avx2.pmulhu.w", UInt(16, 16), "pmulh", {UInt(16, 16), UInt(16, 16)}, Target::AVX2}, - // Cannonlake and Skylake support AVX512BW, allowing for 512-bit pmulhrs. 
- {"llvm.x86.avx512.pmul.hr.sw.512", Int(16, 32), "pmulhrs", {Int(16, 32), Int(16, 32)}, Target::AVX512_Cannonlake}, {"llvm.x86.avx512.pmul.hr.sw.512", Int(16, 32), "pmulhrs", {Int(16, 32), Int(16, 32)}, Target::AVX512_Skylake}, {"llvm.x86.avx2.pmul.hr.sw", Int(16, 16), "pmulhrs", {Int(16, 16), Int(16, 16)}, Target::AVX2}, {"llvm.x86.sse2.pmulh.w", Int(16, 8), "pmulh", {Int(16, 8), Int(16, 8)}}, From 4645dd746238c184d06988b6433432ed2b3a0a6b Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Thu, 27 Oct 2022 10:52:11 -0700 Subject: [PATCH 08/13] fix d <-> w typo --- src/CodeGen_X86.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 95473452b985..cf7b7cb4f5d1 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -112,12 +112,12 @@ struct x86Intrinsic { // clang-format off const x86Intrinsic intrinsic_defs[] = { {"llvm.x86.avx2.pabs.b", UInt(8, 32), "abs", {Int(8, 32)}, Target::AVX2}, - {"llvm.x86.avx2.pabs.d", UInt(16, 16), "abs", {Int(16, 16)}, Target::AVX2}, - {"llvm.x86.avx2.pabs.w", UInt(32, 8), "abs", {Int(32, 8)}, Target::AVX2}, + {"llvm.x86.avx2.pabs.w", UInt(16, 16), "abs", {Int(16, 16)}, Target::AVX2}, + {"llvm.x86.avx2.pabs.d", UInt(32, 8), "abs", {Int(32, 8)}, Target::AVX2}, {"abs_f32x8", Float(32, 8), "abs", {Float(32, 8)}, Target::AVX2}, {"llvm.x86.ssse3.pabs.b.128", UInt(8, 16), "abs", {Int(8, 16)}, Target::SSE41}, - {"llvm.x86.ssse3.pabs.d.128", UInt(16, 8), "abs", {Int(16, 8)}, Target::SSE41}, - {"llvm.x86.ssse3.pabs.w.128", UInt(32, 4), "abs", {Int(32, 4)}, Target::SSE41}, + {"llvm.x86.ssse3.pabs.w.128", UInt(16, 8), "abs", {Int(16, 8)}, Target::SSE41}, + {"llvm.x86.ssse3.pabs.d.128", UInt(32, 4), "abs", {Int(32, 4)}, Target::SSE41}, {"abs_f32x4", Float(32, 4), "abs", {Float(32, 4)}}, {"round_f32x4", Float(32, 4), "round", {Float(32, 4)}, Target::SSE41}, From cddce67a4c0515878382693bc4d3d2f5650f3342 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Thu, 27 Oct 2022 11:49:25 -0700 Subject: [PATCH 09/13] on ssse3 use llvm.abs --- src/CodeGen_X86.cpp | 6 +++--- src/runtime/x86_sse41.ll | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index cf7b7cb4f5d1..595c62e7d513 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -115,9 +115,9 @@ const x86Intrinsic intrinsic_defs[] = { {"llvm.x86.avx2.pabs.w", UInt(16, 16), "abs", {Int(16, 16)}, Target::AVX2}, {"llvm.x86.avx2.pabs.d", UInt(32, 8), "abs", {Int(32, 8)}, Target::AVX2}, {"abs_f32x8", Float(32, 8), "abs", {Float(32, 8)}, Target::AVX2}, - {"llvm.x86.ssse3.pabs.b.128", UInt(8, 16), "abs", {Int(8, 16)}, Target::SSE41}, - {"llvm.x86.ssse3.pabs.w.128", UInt(16, 8), "abs", {Int(16, 8)}, Target::SSE41}, - {"llvm.x86.ssse3.pabs.d.128", UInt(32, 4), "abs", {Int(32, 4)}, Target::SSE41}, + {"abs_i8x16", UInt(8, 16), "abs", {Int(8, 16)}, Target::SSE41}, + {"abs_i16x8", UInt(16, 8), "abs", {Int(16, 8)}, Target::SSE41}, + {"abs_i32x4", UInt(32, 4), "abs", {Int(32, 4)}, Target::SSE41}, {"abs_f32x4", Float(32, 4), "abs", {Float(32, 4)}}, {"round_f32x4", Float(32, 4), "round", {Float(32, 4)}, Target::SSE41}, diff --git a/src/runtime/x86_sse41.ll b/src/runtime/x86_sse41.ll index 4b8bfb73aea9..41302eb967e8 100644 --- a/src/runtime/x86_sse41.ll +++ b/src/runtime/x86_sse41.ll @@ -51,6 +51,24 @@ define weak_odr <2 x double> @trunc_f64x2(<2 x double> %x) nounwind uwtable read ret <2 x double> %1 } +define weak_odr <16 x i8> @abs_i8x16(<16 x i8> %x) nounwind uwtable readnone 
alwaysinline { + %1 = tail call <16 x i8> @llvm.abs.v16i8(<16 x i8> %x, i1 false) + ret <16 x i8> %1 +} +declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1) nounwind readnone + +define weak_odr <8 x i16> @abs_i16x8(<8 x i16> %x) nounwind uwtable readnone alwaysinline { + %1 = tail call <8 x i16> @llvm.abs.v8i16(<8 x i16> %x, i1 false) + ret <8 x i16> %1 +} +declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1) nounwind readnone + +define weak_odr <4 x i32> @abs_i32x4(<4 x i32> %x) nounwind uwtable readnone alwaysinline { + %1 = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> %x, i1 false) + ret <4 x i32> %1 +} +declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1) nounwind readnone + define weak_odr <8 x i16> @hadd_pmadd_u8_sse3(<16 x i8> %a) nounwind alwaysinline { %1 = tail call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>) ret <8 x i16> %1 From 9d0d7eccaab3dab8b75b4f87f487cdf041947c3a Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Thu, 27 Oct 2022 13:58:14 -0700 Subject: [PATCH 10/13] don't use llvm.avx2.pabs intrinsics --- src/CodeGen_X86.cpp | 8 +++++--- src/runtime/x86_avx2.ll | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 595c62e7d513..0f60b2b72b14 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -111,9 +111,11 @@ struct x86Intrinsic { // clang-format off const x86Intrinsic intrinsic_defs[] = { - {"llvm.x86.avx2.pabs.b", UInt(8, 32), "abs", {Int(8, 32)}, Target::AVX2}, - {"llvm.x86.avx2.pabs.w", UInt(16, 16), "abs", {Int(16, 16)}, Target::AVX2}, - {"llvm.x86.avx2.pabs.d", UInt(32, 8), "abs", {Int(32, 8)}, Target::AVX2}, + // AVX2/SSSE3 LLVM intrinsics for pabs fail in JIT. The integer wrappers + // just call `llvm.abs` (which requires a second argument).
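// For context: the trailing i1 argument to llvm.abs is LLVM's "is_int_min_poison"
// flag. Passing "i1 false", as the wrappers here do, makes abs of the minimum value
// (e.g. abs(-128) for i8) return the input bits unchanged rather than poison, which
// matches Halide's semantics: the result is reinterpreted as the unsigned type, so
// abs(-128) == 128.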
+ {"abs_i8x32", UInt(8, 32), "abs", {Int(8, 32)}, Target::AVX2}, + {"abs_i16x16", UInt(16, 16), "abs", {Int(16, 16)}, Target::AVX2}, + {"abs_i32x8", UInt(32, 8), "abs", {Int(32, 8)}, Target::AVX2}, {"abs_f32x8", Float(32, 8), "abs", {Float(32, 8)}, Target::AVX2}, {"abs_i8x16", UInt(8, 16), "abs", {Int(8, 16)}, Target::SSE41}, {"abs_i16x8", UInt(16, 8), "abs", {Int(16, 8)}, Target::SSE41}, diff --git a/src/runtime/x86_avx2.ll b/src/runtime/x86_avx2.ll index 2de5d160d9be..f89f6d502e30 100644 --- a/src/runtime/x86_avx2.ll +++ b/src/runtime/x86_avx2.ll @@ -31,6 +31,24 @@ define weak_odr <16 x i16> @packusdwx16(<16 x i32> %arg) nounwind alwaysinline } declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone +define weak_odr <32 x i8> @abs_i8x32(<32 x i8> %arg) { + %1 = tail call <32 x i8> @llvm.abs.v32i8(<32 x i8> %arg, i1 false) + ret <32 x i8> %1 +} +declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1) nounwind readnone + +define weak_odr <16 x i16> @abs_i16x16(<16 x i16> %arg) { + %1 = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> %arg, i1 false) + ret <16 x i16> %1 +} +declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1) nounwind readnone + +define weak_odr <8 x i32> @abs_i32x8(<8 x i32> %arg) { + %1 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %arg, i1 false) + ret <8 x i32> %1 +} +declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1) nounwind readnone + define weak_odr <16 x i16> @hadd_pmadd_u8_avx2(<32 x i8> %a) nounwind alwaysinline { %1 = tail call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a, <32 x i8> ) ret <16 x i16> %1 From 4d2733df91ec28d344411358ea4734c29cb552bf Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Thu, 27 Oct 2022 14:22:56 -0700 Subject: [PATCH 11/13] generate vpabs on AVX512 --- src/CodeGen_X86.cpp | 7 +++++++ src/runtime/x86_avx512.ll | 18 ++++++++++++++++++ test/correctness/simd_op_check.cpp | 9 ++++++--- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 0f60b2b72b14..3b95f2ac84a5 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -35,6 +35,9 @@ Target complete_x86_target(Target t) { if (t.has_feature(Target::AVX512_Cannonlake) || t.has_feature(Target::AVX512_Skylake) || t.has_feature(Target::AVX512_KNL)) { + t.set_feature(Target::AVX512); + } + if (t.has_feature(Target::AVX512)) { t.set_feature(Target::AVX2); } if (t.has_feature(Target::AVX2)) { @@ -113,6 +116,10 @@ struct x86Intrinsic { const x86Intrinsic intrinsic_defs[] = { // AVX2/SSSE3 LLVM intrinsics for pabs fail in JIT. The integer wrappers // just call `llvm.abs` (which requires a second argument). + // AVX512BW's pabs instructions aren't directly exposed by LLVM. 
+ {"abs_i8x64", UInt(8, 64), "abs", {Int(8, 64)}, Target::AVX512_Skylake}, + {"abs_i16x32", UInt(16, 32), "abs", {Int(16, 32)}, Target::AVX512_Skylake}, + {"abs_i32x16", UInt(32, 16), "abs", {Int(32, 16)}, Target::AVX512_Skylake}, {"abs_i8x32", UInt(8, 32), "abs", {Int(8, 32)}, Target::AVX2}, {"abs_i16x16", UInt(16, 16), "abs", {Int(16, 16)}, Target::AVX2}, {"abs_i32x8", UInt(32, 8), "abs", {Int(32, 8)}, Target::AVX2}, diff --git a/src/runtime/x86_avx512.ll b/src/runtime/x86_avx512.ll index 8cbc8abb9c5d..22401897eee2 100644 --- a/src/runtime/x86_avx512.ll +++ b/src/runtime/x86_avx512.ll @@ -138,3 +138,21 @@ define weak_odr <4 x i32> @dpwssdsx4(<4 x i32> %init, <8 x i16> %a, <8 x i16> % ret <4 x i32> %3 } declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define weak_odr <64 x i8> @abs_i8x64(<64 x i8> %arg) { + %1 = tail call <64 x i8> @llvm.abs.v64i8(<64 x i8> %arg, i1 false) + ret <64 x i8> %1 +} +declare <64 x i8> @llvm.abs.v64i8(<64 x i8>, i1) nounwind readnone + +define weak_odr <32 x i16> @abs_i16x32(<32 x i16> %arg) { + %1 = tail call <32 x i16> @llvm.abs.v32i16(<32 x i16> %arg, i1 false) + ret <32 x i16> %1 +} +declare <32 x i16> @llvm.abs.v32i16(<32 x i16>, i1) nounwind readnone + +define weak_odr <16 x i32> @abs_i32x16(<16 x i32> %arg) { + %1 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %arg, i1 false) + ret <16 x i32> %1 +} +declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1) nounwind readnone diff --git a/test/correctness/simd_op_check.cpp b/test/correctness/simd_op_check.cpp index 5e2fc4b449f8..de5385cbf31c 100644 --- a/test/correctness/simd_op_check.cpp +++ b/test/correctness/simd_op_check.cpp @@ -485,6 +485,12 @@ class SimdOpCheck : public SimdOpCheckTest { check("vpminsw*" + suffix, 16 * m, min(i16_1, i16_2)); check("vpmaxub*" + suffix, 32 * m, max(u8_1, u8_2)); check("vpminub*" + suffix, 32 * m, min(u8_1, u8_2)); + + + check("vpabsb*" + suffix, 32 * m, abs(i8_1)); + check("vpabsw*" + suffix, 16 * m, abs(i16_1)); + check("vpabsd*" + suffix, 8 * m, abs(i32_1)); + }; check_x86_fixed_point("ymm", 1); @@ -497,9 +503,6 @@ class SimdOpCheck : public SimdOpCheckTest { check(use_avx512 ? "vpsubq*zmm" : "vpsubq*ymm", 8, i64_1 - i64_2); check(use_avx512 ? "vpmullq" : "vpmuludq*ymm", 8, u64_1 * u64_2); - check("vpabsb*ymm", 32, abs(i8_1)); - check("vpabsw*ymm", 16, abs(i16_1)); - check("vpabsd*ymm", 8, abs(i32_1)); // llvm doesn't distinguish between signed and unsigned multiplies // check("vpmuldq", 8, i64(i32_1) * i64(i32_2)); From dd0a98c0ced383f3555b8878bc83b0553691500d Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Thu, 27 Oct 2022 14:24:35 -0700 Subject: [PATCH 12/13] clang format --- test/correctness/simd_op_check.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/test/correctness/simd_op_check.cpp b/test/correctness/simd_op_check.cpp index de5385cbf31c..946f342c1a16 100644 --- a/test/correctness/simd_op_check.cpp +++ b/test/correctness/simd_op_check.cpp @@ -486,11 +486,9 @@ class SimdOpCheck : public SimdOpCheckTest { check("vpmaxub*" + suffix, 32 * m, max(u8_1, u8_2)); check("vpminub*" + suffix, 32 * m, min(u8_1, u8_2)); - check("vpabsb*" + suffix, 32 * m, abs(i8_1)); check("vpabsw*" + suffix, 16 * m, abs(i16_1)); check("vpabsd*" + suffix, 8 * m, abs(i32_1)); - }; check_x86_fixed_point("ymm", 1); @@ -503,7 +501,6 @@ class SimdOpCheck : public SimdOpCheckTest { check(use_avx512 ? "vpsubq*zmm" : "vpsubq*ymm", 8, i64_1 - i64_2); check(use_avx512 ? 
"vpmullq" : "vpmuludq*ymm", 8, u64_1 * u64_2); - // llvm doesn't distinguish between signed and unsigned multiplies // check("vpmuldq", 8, i64(i32_1) * i64(i32_2)); if (!use_avx512) { From 41f09f83f7e108aa4856401db09a24d6ec559a70 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Fri, 28 Oct 2022 13:44:15 -0700 Subject: [PATCH 13/13] faster AVX2 lowering of saturating_pmulhrs --- src/CodeGen_X86.cpp | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 3b95f2ac84a5..551798dca649 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -624,14 +624,27 @@ void CodeGen_X86::visit(const Call *op) { if (expr_match(saturating_pmulhrs, op, matches)) { // Rewrite so that we can take advantage of pmulhrs. internal_assert(matches.size() == 2); + internal_assert(op->type.element_of() == Int(16)); const Expr &a = matches[0]; const Expr &b = matches[1]; + Expr pmulhrs = i16(rounding_shift_right(widening_mul(a, b), 15)); - // Handle edge case of possible overflow. + Expr i16_min = op->type.min(); Expr i16_max = op->type.max(); - Expr expr = select((a == i16_min) && (b == i16_min), i16_max, pmulhrs); - expr.accept(this); + + // Handle edge case of possible overflow. + // See https://github.com/halide/Halide/pull/7129/files#r1008331426 + // On AVX512 (and with enough lanes) we can use a mask register. + if (target.has_feature(Target::AVX512) && op->type.lanes() >= 32) { + Expr expr = select((a == i16_min) && (b == i16_min), i16_max, pmulhrs); + expr.accept(this); + } else { + Expr mask = select(max(a, b) == i16_min, cast(op->type, -1), cast(op->type, 0)); + Expr expr = mask ^ pmulhrs; + expr.accept(this); + } + return; }