From e373ddfe1b892ec428249204ab31239111062df2 Mon Sep 17 00:00:00 2001 From: Johnnie Birch Date: Mon, 14 Jun 2021 17:20:40 -0700 Subject: [PATCH] Add extend-add-pairwise instructions x64 --- build.rs | 8 -- .../codegen/meta/src/shared/instructions.rs | 29 ++++- .../codegen/src/isa/aarch64/lower_inst.rs | 6 +- cranelift/codegen/src/isa/s390x/lower.rs | 4 +- cranelift/codegen/src/isa/x64/inst/args.rs | 3 + cranelift/codegen/src/isa/x64/inst/emit.rs | 1 + cranelift/codegen/src/isa/x64/lower.rs | 122 ++++++++++++++++++ cranelift/codegen/src/preopt.serialized | Bin 5511 -> 5511 bytes cranelift/interpreter/src/step.rs | 2 + cranelift/wasm/src/code_translator.rs | 22 +++- 10 files changed, 180 insertions(+), 17 deletions(-) diff --git a/build.rs b/build.rs index 7569c61c9d37..20de13c6e2ea 100644 --- a/build.rs +++ b/build.rs @@ -190,9 +190,6 @@ fn x64_should_panic(testsuite: &str, testname: &str, strategy: &str) -> bool { } match (testsuite, testname) { - ("simd", "simd_i16x8_extadd_pairwise_i8x16") => return true, - ("simd", "simd_i32x4_extadd_pairwise_i16x8") => return true, - ("simd", _) => return false, _ => {} } false @@ -220,11 +217,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { ("simd", _) if cfg!(feature = "old-x86-backend") => return true, // No simd support yet for s390x. ("simd", _) if platform_is_s390x() => return true, - - // These are new instructions that are not really implemented in any backend. 
- ("simd", "simd_i16x8_extadd_pairwise_i8x16") - | ("simd", "simd_i32x4_extadd_pairwise_i16x8") => return true, - _ => {} }, _ => panic!("unrecognized strategy"), diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs index 6b79a5e92f4e..9603ecc7f134 100644 --- a/cranelift/codegen/meta/src/shared/instructions.rs +++ b/cranelift/codegen/meta/src/shared/instructions.rs @@ -4114,7 +4114,34 @@ pub(crate) fn define( Inst::new( "uwiden_high", r#" - Widen the high lanes of `x` using unsigned extension. + Widen the high lanes of `x` using unsigned extension. This will double the lane + width and halve the number of lanes. + "#, + &formats.unary, + ) + .operands_in(vec![x]) + .operands_out(vec![a]), + ); + + ig.push( + Inst::new( + "extended_pairwise_add_signed", + r#" + Add adjacent pairs of lanes of `x` after sign-extending each lane. + + This will double the lane width and halve the number of lanes. + "#, + &formats.unary, + ) + .operands_in(vec![x]) + .operands_out(vec![a]), + ); + + ig.push( + Inst::new( + "extended_pairwise_add_unsigned", + r#" + Add adjacent pairs of lanes of `x` after zero-extending each lane. This will double the lane width and halve the number of lanes. 
"#, diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 754e2f7b9501..a3670e823f38 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -3519,7 +3519,11 @@ pub(crate) fn lower_insn_to_regs>( }); } - Opcode::ConstAddr | Opcode::Vconcat | Opcode::Vsplit => unimplemented!("lowering {}", op), + Opcode::ExtendedPairwiseAddSigned + | Opcode::ExtendedPairwiseAddUnsigned + | Opcode::ConstAddr + | Opcode::Vconcat + | Opcode::Vsplit => unimplemented!("lowering {}", op), } Ok(()) diff --git a/cranelift/codegen/src/isa/s390x/lower.rs b/cranelift/codegen/src/isa/s390x/lower.rs index 8ab66add04c1..8ae6b32ecaad 100644 --- a/cranelift/codegen/src/isa/s390x/lower.rs +++ b/cranelift/codegen/src/isa/s390x/lower.rs @@ -2868,7 +2868,9 @@ fn lower_insn_to_regs>( | Opcode::WideningPairwiseDotProductS | Opcode::SqmulRoundSat | Opcode::FvpromoteLow - | Opcode::Fvdemote => { + | Opcode::Fvdemote + | Opcode::ExtendedPairwiseAddSigned + | Opcode::ExtendedPairwiseAddUnsigned => { // TODO unimplemented!("Vector ops not implemented."); } diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 915ddfe347b0..f279ee909604 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -568,6 +568,7 @@ pub enum SseOpcode { Pinsrb, Pinsrw, Pinsrd, + Pmaddubsw, Pmaddwd, Pmaxsb, Pmaxsw, @@ -746,6 +747,7 @@ impl SseOpcode { | SseOpcode::Pcmpgtd | SseOpcode::Pextrw | SseOpcode::Pinsrw + | SseOpcode::Pmaddubsw | SseOpcode::Pmaddwd | SseOpcode::Pmaxsw | SseOpcode::Pmaxub @@ -944,6 +946,7 @@ impl fmt::Debug for SseOpcode { SseOpcode::Pinsrb => "pinsrb", SseOpcode::Pinsrw => "pinsrw", SseOpcode::Pinsrd => "pinsrd", + SseOpcode::Pmaddubsw => "pmaddubsw", SseOpcode::Pmaddwd => "pmaddwd", SseOpcode::Pmaxsb => "pmaxsb", SseOpcode::Pmaxsw => "pmaxsw", diff --git 
a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 954fd98a5b71..93132a2aabe0 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1483,6 +1483,7 @@ pub(crate) fn emit( SseOpcode::Paddsw => (LegacyPrefixes::_66, 0x0FED, 2), SseOpcode::Paddusb => (LegacyPrefixes::_66, 0x0FDC, 2), SseOpcode::Paddusw => (LegacyPrefixes::_66, 0x0FDD, 2), + SseOpcode::Pmaddubsw => (LegacyPrefixes::_66, 0x0F3804, 3), SseOpcode::Pand => (LegacyPrefixes::_66, 0x0FDB, 2), SseOpcode::Pandn => (LegacyPrefixes::_66, 0x0FDF, 2), SseOpcode::Pavgb => (LegacyPrefixes::_66, 0x0FE0, 2), diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 657df3268447..f9f824cc23dc 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -4927,6 +4927,128 @@ fn lower_insn_to_regs>( } } } + Opcode::ExtendedPairwiseAddSigned | Opcode::ExtendedPairwiseAddUnsigned => { + // Extended pairwise addition instructions compute extended sums within adjacent + // pairs of lanes of a SIMD vector, producing a SIMD vector with half as many lanes. 
+ // Instruction sequences taken from instruction SPEC PR https://github.com/WebAssembly/simd/pull/380 + /* + let input_ty = ctx.input_ty(insn, 0); + let output_ty = ctx.output_ty(insn, 0); + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + unreachable!(); + match op { + Opcode::ExtendedPairwiseAddSigned => match (input_ty, output_ty) { + (types::I8X16, types::I16X8) => { + static MUL_CONST: [u8; 16] = [0x01; 16]; + let mul_const = ctx.use_constant(VCodeConstantData::WellKnown(&MUL_CONST)); + let mul_const_reg = ctx.alloc_tmp(types::I8X16).only_reg().unwrap(); + ctx.emit(Inst::xmm_load_const(mul_const, mul_const_reg, types::I8X16)); + ctx.emit(Inst::xmm_mov( + SseOpcode::Movdqa, + RegMem::reg(mul_const_reg.to_reg()), + dst, + )); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmaddubsw, RegMem::reg(src), dst)); + } + (types::I16X8, types::I32X4) => { + static MUL_CONST: [u8; 16] = [ + 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, + 0x01, 0x00, 0x01, 0x00, + ]; + let mul_const = ctx.use_constant(VCodeConstantData::WellKnown(&MUL_CONST)); + let mul_const_reg = ctx.alloc_tmp(types::I16X8).only_reg().unwrap(); + ctx.emit(Inst::xmm_load_const(mul_const, mul_const_reg, types::I16X8)); + ctx.emit(Inst::xmm_mov(SseOpcode::Movdqa, RegMem::reg(src), dst)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pmaddwd, + RegMem::reg(mul_const_reg.to_reg()), + dst, + )); + } + _ => unreachable!( + "Type pattern not supported {:?}-{:?} not supported for {:?}.", + input_ty, output_ty, op + ), + }, + Opcode::ExtendedPairwiseAddUnsigned => match (input_ty, output_ty) { + (types::I8X16, types::I16X8) => { + static MUL_CONST: [u8; 16] = [0x01; 16]; + let mul_const = ctx.use_constant(VCodeConstantData::WellKnown(&MUL_CONST)); + let mul_const_reg = ctx.alloc_tmp(types::I8X16).only_reg().unwrap(); + ctx.emit(Inst::xmm_load_const(mul_const, mul_const_reg, types::I8X16)); + ctx.emit(Inst::xmm_mov(SseOpcode::Movdqa, 
RegMem::reg(src), dst)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pmaddubsw, + RegMem::reg(mul_const_reg.to_reg()), + dst, + )); + } + (types::I16X8, types::I32X4) => { + static PXOR_CONST: [u8; 16] = [ + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, + ]; + let pxor_const = + ctx.use_constant(VCodeConstantData::WellKnown(&PXOR_CONST)); + let pxor_const_reg = ctx.alloc_tmp(types::I16X8).only_reg().unwrap(); + ctx.emit(Inst::xmm_load_const( + pxor_const, + pxor_const_reg, + types::I16X8, + )); + ctx.emit(Inst::xmm_mov(SseOpcode::Movdqa, RegMem::reg(src), dst)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pxor, + RegMem::reg(pxor_const_reg.to_reg()), + dst, + )); + + static MADD_CONST: [u8; 16] = [ + 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, + 0x01, 0x00, 0x01, 0x00, + ]; + let madd_const = + ctx.use_constant(VCodeConstantData::WellKnown(&MADD_CONST)); + let madd_const_reg = ctx.alloc_tmp(types::I8X16).only_reg().unwrap(); + ctx.emit(Inst::xmm_load_const( + madd_const, + madd_const_reg, + types::I16X8, + )); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pmaddwd, + RegMem::reg(madd_const_reg.to_reg()), + dst, + )); + + static ADDD_CONST2: [u8; 16] = [ + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, + ]; + let addd_const2 = + ctx.use_constant(VCodeConstantData::WellKnown(&ADDD_CONST2)); + let addd_const2_reg = ctx.alloc_tmp(types::I8X16).only_reg().unwrap(); + ctx.emit(Inst::xmm_load_const( + addd_const2, + addd_const2_reg, + types::I16X8, + )); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Paddd, + RegMem::reg(addd_const2_reg.to_reg()), + dst, + )); + } + _ => unreachable!( + "Type pattern not supported {:?}-{:?} not supported for {:?}.", + input_ty, output_ty, op + ), + }, + _ => unreachable!("{:?} not supported.", op), + } + */ + } Opcode::UwidenHigh | Opcode::UwidenLow | Opcode::SwidenHigh | Opcode::SwidenLow => { let input_ty = ctx.input_ty(insn, 0); 
let output_ty = ctx.output_ty(insn, 0); diff --git a/cranelift/codegen/src/preopt.serialized b/cranelift/codegen/src/preopt.serialized index 95e9f3e2b9e437ad3c06c90b0d9c65279282c39b..10e3d5c36eda6f685ab53df7ef77d4e978f68c84 100644 GIT binary patch delta 95 zcmZqIZr9$R$-;PR@&Q&`Ao&kW+OUDxRbX-lm`q@eXS@kka}%uQ=46nXn_x9JCxg`7 ZoNUK0KT&{Vay={8}bd;qPFA+7)b delta 95 zcmZqIZr9$R$-;PJ@&Q&`Ao&kW+OUDxRbX-lm`q@eXS@zpa~-Va`ecxr>tHq4Cxg^n ZpKQl3KT&{Vay={8}bd;q9_A(j9D diff --git a/cranelift/interpreter/src/step.rs b/cranelift/interpreter/src/step.rs index 9c0763872bbf..0e60ed089a22 100644 --- a/cranelift/interpreter/src/step.rs +++ b/cranelift/interpreter/src/step.rs @@ -630,6 +630,8 @@ where Opcode::Fence => unimplemented!("Fence"), Opcode::WideningPairwiseDotProductS => unimplemented!("WideningPairwiseDotProductS"), Opcode::SqmulRoundSat => unimplemented!("SqmulRoundSat"), + Opcode::ExtendedPairwiseAddSigned => unimplemented!("ExtendedPairwiseAddSigned"), + Opcode::ExtendedPairwiseAddUnsigned => unimplemented!("ExtendedPairwiseAddUnsigned"), // TODO: these instructions should be removed once the new backend makes these obsolete // (see https://github.com/bytecodealliance/wasmtime/issues/1936); additionally, the diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs index 864cb10f9d66..21a9e9eef05e 100644 --- a/cranelift/wasm/src/code_translator.rs +++ b/cranelift/wasm/src/code_translator.rs @@ -1879,6 +1879,22 @@ pub fn translate_operator( let a = pop1_with_bitcast(state, I32X4, builder); state.push1(builder.ins().uwiden_high(a)) } + Operator::I16x8ExtAddPairwiseI8x16S => { + let a = pop1_with_bitcast(state, I8X16, builder); + state.push1(builder.ins().extended_pairwise_add_signed(a)) + } + Operator::I32x4ExtAddPairwiseI16x8S => { + let a = pop1_with_bitcast(state, I16X8, builder); + state.push1(builder.ins().extended_pairwise_add_signed(a)) + } + Operator::I16x8ExtAddPairwiseI8x16U => { + let a = pop1_with_bitcast(state, I8X16, builder); + 
state.push1(builder.ins().extended_pairwise_add_unsigned(a)) + } + Operator::I32x4ExtAddPairwiseI16x8U => { + let a = pop1_with_bitcast(state, I16X8, builder); + state.push1(builder.ins().extended_pairwise_add_unsigned(a)) + } Operator::F32x4Ceil | Operator::F64x2Ceil => { // This is something of a misuse of `type_of`, because that produces the return type // of `op`. In this case we want the arg type, but we know it's the same as the @@ -1982,12 +1998,6 @@ pub fn translate_operator( let b_high = builder.ins().uwiden_high(b); state.push1(builder.ins().imul(a_high, b_high)); } - Operator::I16x8ExtAddPairwiseI8x16S - | Operator::I16x8ExtAddPairwiseI8x16U - | Operator::I32x4ExtAddPairwiseI16x8S - | Operator::I32x4ExtAddPairwiseI16x8U => { - return Err(wasm_unsupported!("proposed simd operator {:?}", op)); - } Operator::ReturnCall { .. } | Operator::ReturnCallIndirect { .. } => { return Err(wasm_unsupported!("proposed tail-call operator {:?}", op)); }