From 28642b3bbad4ed5865616271c2c1816a45620ede Mon Sep 17 00:00:00 2001
From: Johnnie Birch <johnnie.l.birch.jr@intel.com>
Date: Wed, 14 Jul 2021 09:23:58 -0700
Subject: [PATCH] Add support for Saturating Rounding Q-format Multiplication
 for x64

---
 build.rs                                   |  1 -
 cranelift/codegen/src/isa/x64/inst/args.rs |  3 ++
 cranelift/codegen/src/isa/x64/inst/emit.rs |  2 +-
 cranelift/codegen/src/isa/x64/lower.rs     | 42 ++++++++++++++++++++--
 4 files changed, 43 insertions(+), 5 deletions(-)
diff --git a/build.rs b/build.rs
index 47852cf61815..6ac0a53f75cf 100644
--- a/build.rs
+++ b/build.rs
@@ -190,7 +190,6 @@ fn x64_should_panic(testsuite: &str, testname: &str, strategy: &str) -> bool {
 
     match (testsuite, testname) {
         ("simd", "simd_i16x8_extadd_pairwise_i8x16") => return true,
-        ("simd", "simd_i16x8_q15mulr_sat_s") => return true,
         ("simd", "simd_i32x4_extadd_pairwise_i16x8") => return true,
         ("simd", "simd_i32x4_trunc_sat_f64x2") => return true,
         ("simd", "simd_int_to_int_extend") => return true,
diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs
index 2c7ce30090d0..ebb80f0d3d45 100644
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -596,6 +596,7 @@ pub enum SseOpcode {
     Pmuldq,
     Pmulhw,
     Pmulhuw,
+    Pmulhrsw,
     Pmulld,
     Pmullw,
     Pmuludq,
@@ -785,6 +786,7 @@ impl SseOpcode {
             | SseOpcode::Pabsw
             | SseOpcode::Pabsd
             | SseOpcode::Palignr
+            | SseOpcode::Pmulhrsw
             | SseOpcode::Pshufb => SSSE3,
 
             SseOpcode::Blendvpd
@@ -966,6 +968,7 @@ impl fmt::Debug for SseOpcode {
             SseOpcode::Pmuldq => "pmuldq",
             SseOpcode::Pmulhw => "pmulhw",
             SseOpcode::Pmulhuw => "pmulhuw",
+            SseOpcode::Pmulhrsw => "pmulhrsw",
             SseOpcode::Pmulld => "pmulld",
             SseOpcode::Pmullw => "pmullw",
             SseOpcode::Pmuludq => "pmuludq",
diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index 47ae56fb2277..6d02c2b62a53 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -1511,6 +1511,7 @@ pub(crate) fn emit(
                 SseOpcode::Pminud => (LegacyPrefixes::_66, 0x0F383B, 3),
                 SseOpcode::Pmuldq => (LegacyPrefixes::_66, 0x0F3828, 3),
                 SseOpcode::Pmulhw => (LegacyPrefixes::_66, 0x0FE5, 2),
+                SseOpcode::Pmulhrsw => (LegacyPrefixes::_66, 0x0F380B, 3),
                 SseOpcode::Pmulhuw => (LegacyPrefixes::_66, 0x0FE4, 2),
                 SseOpcode::Pmulld => (LegacyPrefixes::_66, 0x0F3840, 3),
                 SseOpcode::Pmullw => (LegacyPrefixes::_66, 0x0FD5, 2),
@@ -1755,7 +1756,6 @@ pub(crate) fn emit(
             let (prefix, opcode) = match op {
                 SseOpcode::Movaps => (LegacyPrefixes::None, 0x0F29),
                 SseOpcode::Movapd => (LegacyPrefixes::_66, 0x0F29),
-                SseOpcode::Movdqa => (LegacyPrefixes::_66, 0x0F7F),
                 SseOpcode::Movdqu => (LegacyPrefixes::_F3, 0x0F7F),
                 SseOpcode::Movss => (LegacyPrefixes::_F3, 0x0F11),
                 SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F11),
diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index fcb5e71355c8..ac2ad4f93437 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -6405,9 +6405,45 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             }
         },
 
+        Opcode::SqmulRoundSat => {
+            // Lane-wise saturating rounding multiplication in Q15 format
+            // Optimal lowering taken from the instruction proposal: https://github.com/WebAssembly/simd/pull/365
+            // y = i16x8.q15mulr_sat_s(a, b) is lowered to:
+            //   MOVDQA xmm_y, xmm_a
+            //   MOVDQA xmm_tmp, wasm_i16x8_splat(0x8000)
+            //   PMULHRSW xmm_y, xmm_b
+            //   PCMPEQW xmm_tmp, xmm_y
+            //   PXOR xmm_y, xmm_tmp
+            let input_ty = ctx.input_ty(insn, 0);
+            let src1 = put_input_in_reg(ctx, inputs[0]);
+            let src2 = put_input_in_reg(ctx, inputs[1]);
+            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+
+            ctx.emit(Inst::gen_move(dst, src1, input_ty));
+            static SAT_MASK: [u8; 16] = [
+                0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+                0x00, 0x80,
+            ];
+            let mask_const = ctx.use_constant(VCodeConstantData::WellKnown(&SAT_MASK));
+            let mask = ctx.alloc_tmp(types::I16X8).only_reg().unwrap();
+            ctx.emit(Inst::xmm_load_const(mask_const, mask, types::I16X8));
+
+            ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmulhrsw, RegMem::reg(src2), dst));
+            ctx.emit(Inst::xmm_rm_r(
+                SseOpcode::Pcmpeqw,
+                RegMem::reg(dst.to_reg()),
+                mask,
+            ));
+            ctx.emit(Inst::xmm_rm_r(
+                SseOpcode::Pxor,
+                RegMem::reg(mask.to_reg()),
+                dst,
+            ));
+        }
+
         // Unimplemented opcodes below. These are not currently used by Wasm
         // lowering or other known embeddings, but should be either supported or
-        // removed eventually.
+        // removed eventually.
         Opcode::Uload8x8Complex
         | Opcode::Sload8x8Complex
         | Opcode::Uload16x4Complex
@@ -6435,8 +6471,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             unimplemented!("Vector split/concat ops not implemented.");
         }
 
-        Opcode::SqmulRoundSat | Opcode::Uunarrow => {
-            unimplemented!("unimplemented lowering for opcode {:?}", op)
+        Opcode::Uunarrow => {
+            unimplemented!("unimplemented lowering for opcode {:?}", op);
         }
 
         // Opcodes that should be removed by legalization. These should