From 28642b3bbad4ed5865616271c2c1816a45620ede Mon Sep 17 00:00:00 2001 From: Johnnie Birch Date: Wed, 14 Jul 2021 09:23:58 -0700 Subject: [PATCH] Add support for Saturating Rounding Q-format Multiplication for x64 --- build.rs | 1 - cranelift/codegen/src/isa/x64/inst/args.rs | 3 ++ cranelift/codegen/src/isa/x64/inst/emit.rs | 2 +- cranelift/codegen/src/isa/x64/lower.rs | 42 ++++++++++++++++++++-- 4 files changed, 43 insertions(+), 5 deletions(-) diff --git a/build.rs b/build.rs index 47852cf61815..6ac0a53f75cf 100644 --- a/build.rs +++ b/build.rs @@ -190,7 +190,6 @@ fn x64_should_panic(testsuite: &str, testname: &str, strategy: &str) -> bool { match (testsuite, testname) { ("simd", "simd_i16x8_extadd_pairwise_i8x16") => return true, - ("simd", "simd_i16x8_q15mulr_sat_s") => return true, ("simd", "simd_i32x4_extadd_pairwise_i16x8") => return true, ("simd", "simd_i32x4_trunc_sat_f64x2") => return true, ("simd", "simd_int_to_int_extend") => return true, diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 2c7ce30090d0..ebb80f0d3d45 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -596,6 +596,7 @@ pub enum SseOpcode { Pmuldq, Pmulhw, Pmulhuw, + Pmulhrsw, Pmulld, Pmullw, Pmuludq, @@ -785,6 +786,7 @@ impl SseOpcode { | SseOpcode::Pabsw | SseOpcode::Pabsd | SseOpcode::Palignr + | SseOpcode::Pmulhrsw | SseOpcode::Pshufb => SSSE3, SseOpcode::Blendvpd @@ -966,6 +968,7 @@ impl fmt::Debug for SseOpcode { SseOpcode::Pmuldq => "pmuldq", SseOpcode::Pmulhw => "pmulhw", SseOpcode::Pmulhuw => "pmulhuw", + SseOpcode::Pmulhrsw => "pmulhrsw", SseOpcode::Pmulld => "pmulld", SseOpcode::Pmullw => "pmullw", SseOpcode::Pmuludq => "pmuludq", diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 47ae56fb2277..6d02c2b62a53 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ 
-1511,6 +1511,7 @@ pub(crate) fn emit( SseOpcode::Pminud => (LegacyPrefixes::_66, 0x0F383B, 3), SseOpcode::Pmuldq => (LegacyPrefixes::_66, 0x0F3828, 3), SseOpcode::Pmulhw => (LegacyPrefixes::_66, 0x0FE5, 2), + SseOpcode::Pmulhrsw => (LegacyPrefixes::_66, 0x0F380B, 3), SseOpcode::Pmulhuw => (LegacyPrefixes::_66, 0x0FE4, 2), SseOpcode::Pmulld => (LegacyPrefixes::_66, 0x0F3840, 3), SseOpcode::Pmullw => (LegacyPrefixes::_66, 0x0FD5, 2), @@ -1755,7 +1756,6 @@ pub(crate) fn emit( let (prefix, opcode) = match op { SseOpcode::Movaps => (LegacyPrefixes::None, 0x0F29), SseOpcode::Movapd => (LegacyPrefixes::_66, 0x0F29), - SseOpcode::Movdqa => (LegacyPrefixes::_66, 0x0F7F), SseOpcode::Movdqu => (LegacyPrefixes::_F3, 0x0F7F), SseOpcode::Movss => (LegacyPrefixes::_F3, 0x0F11), SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F11), diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index fcb5e71355c8..ac2ad4f93437 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -6405,9 +6405,45 @@ fn lower_insn_to_regs>( } }, + Opcode::SqmulRoundSat => { + // Lane-wise saturating rounding multiplication in Q15 format + // Optimal lowering taken from instruction proposal https://github.com/WebAssembly/simd/pull/365 + // y = i16x8.q15mulr_sat_s(a, b) is lowered to: + //MOVDQA xmm_y, xmm_a + //MOVDQA xmm_tmp, wasm_i16x8_splat(0x8000) + //PMULHRSW xmm_y, xmm_b + //PCMPEQW xmm_tmp, xmm_y + //PXOR xmm_y, xmm_tmp + let input_ty = ctx.input_ty(insn, 0); + let src1 = put_input_in_reg(ctx, inputs[0]); + let src2 = put_input_in_reg(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + + ctx.emit(Inst::gen_move(dst, src1, input_ty)); + static SAT_MASK: [u8; 16] = [ + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, + ]; + let mask_const = ctx.use_constant(VCodeConstantData::WellKnown(&SAT_MASK)); + let mask = 
ctx.alloc_tmp(types::I16X8).only_reg().unwrap(); + ctx.emit(Inst::xmm_load_const(mask_const, mask, types::I16X8)); + + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmulhrsw, RegMem::reg(src2), dst)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pcmpeqw, + RegMem::reg(dst.to_reg()), + mask, + )); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pxor, + RegMem::reg(mask.to_reg()), + dst, + )); + } + // Unimplemented opcodes below. These are not currently used by Wasm // lowering or other known embeddings, but should be either supported or - // removed eventually. + // removed eventually . Opcode::Uload8x8Complex | Opcode::Sload8x8Complex | Opcode::Uload16x4Complex @@ -6435,8 +6471,8 @@ fn lower_insn_to_regs>( unimplemented!("Vector split/concat ops not implemented."); } - Opcode::SqmulRoundSat | Opcode::Uunarrow => { - unimplemented!("unimplemented lowering for opcode {:?}", op) + Opcode::Uunarrow => { + unimplemented!("unimplemented lowering for opcode {:?}", op); } // Opcodes that should be removed by legalization. These should