From 7dad5e249678d6d0ccd8a8f93e02ad04a13b5b7d Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Wed, 3 Jun 2020 12:04:32 -0700 Subject: [PATCH 1/4] Add x86_vcvtps2udq instruction This allows x86 machines with the right AVX features to lower fcvt_to_uint_sat.i32x4 to a single instruction. --- .../codegen/meta/src/isa/x86/encodings.rs | 7 ++++++ .../codegen/meta/src/isa/x86/instructions.rs | 25 +++++++++++++++++-- cranelift/codegen/meta/src/isa/x86/opcodes.rs | 6 +++++ .../codegen/src/isa/aarch64/lower_inst.rs | 1 + .../x86/simd-avx512-conversion-binemit.clif | 6 +++++ 5 files changed, 43 insertions(+), 2 deletions(-) diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs index 303b1bfaebb5..e3d5340cdf74 100644 --- a/cranelift/codegen/meta/src/isa/x86/encodings.rs +++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs @@ -1704,6 +1704,7 @@ fn define_simd( let x86_ptest = x86.by_name("x86_ptest"); let x86_punpckh = x86.by_name("x86_punpckh"); let x86_punpckl = x86.by_name("x86_punpckl"); + let x86_vcvtps2udq = x86.by_name("x86_vcvtps2udq"); let x86_vcvtudq2ps = x86.by_name("x86_vcvtudq2ps"); // Shorthands for recipes. @@ -1951,6 +1952,12 @@ fn define_simd( Some(use_avx512vl_simd), // TODO need an OR predicate to join with AVX512F ); + e.enc_32_64_maybe_isap( + x86_vcvtps2udq, + rec_evex_reg_rm_128.opcodes(&VCVTPS2UDQ), + Some(use_avx512vl_simd), // TODO need an OR predicate to join with AVX512F + ); + e.enc_both_inferred( x86_cvtt2si .bind(vector(I32, sse_vector_size)) diff --git a/cranelift/codegen/meta/src/isa/x86/instructions.rs b/cranelift/codegen/meta/src/isa/x86/instructions.rs index 4afbc8874702..f93df16564ef 100644 --- a/cranelift/codegen/meta/src/isa/x86/instructions.rs +++ b/cranelift/codegen/meta/src/isa/x86/instructions.rs @@ -167,8 +167,29 @@ pub(crate) fn define( r#" Convert unsigned integer to floating point. - Convert packed doubleword unsigned integers to packed single-precision floating-point - values. This instruction does not trap. + Convert packed doubleword unsigned integers to packed single-precision floating-point + values. When a conversion is inexact, the value returned is rounded according to the + rounding control bits in the MXCSR register or the embedded rounding control bits. This + instruction does not trap. + "#, + &formats.unary, + ) + .operands_in(vec![x]) + .operands_out(vec![a]), + ); + + let x = &Operand::new("x", f32x4); + let a = &Operand::new("a", i32x4); + ig.push( + Inst::new( + "x86_vcvtps2udq", + r#" + Convert floating point to unsigned integer. + + Convert packed single-precision floating-point to packed doubleword unsigned integers + values. When a conversion is inexact, the value returned is rounded according to the + rounding control bits in the MXCSR register or the embedded rounding control bits. This + instruction does not trap. "#, &formats.unary, ) diff --git a/cranelift/codegen/meta/src/isa/x86/opcodes.rs b/cranelift/codegen/meta/src/isa/x86/opcodes.rs index f7f7480f9b7d..5bf2f19bc40e 100644 --- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs +++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs @@ -684,6 +684,12 @@ pub static UCOMISS: [u8; 2] = [0x0f, 0x2e]; /// Raise invalid opcode instruction. pub static UNDEFINED2: [u8; 2] = [0x0f, 0x0b]; +/// Convert four packed single precision floating-point values from xmm2/m128/m32bcst to four packed +/// unsigned doubleword values in xmm1 using truncation subject to writemask k1. Rounding behavior +/// is controlled by MXCSR but can be overriden by EVEX.L'L in static rounding mode (AVX512VL, +/// AVX512F). +pub static VCVTPS2UDQ: [u8; 2] = [0x0f, 0x79]; + /// Convert four packed unsigned doubleword integers from xmm2/m128/m32bcst to packed /// single-precision floating-point values in xmm1 with writemask k1. Rounding behavior /// is controlled by MXCSR but can be overriden by EVEX.L'L in static rounding mode diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 82eb35f13fb8..46a00bc9d793 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -2063,6 +2063,7 @@ pub(crate) fn lower_insn_to_regs>( | Opcode::X86Packss | Opcode::X86Punpckh | Opcode::X86Punpckl + | Opcode::X86Vcvtps2udq | Opcode::X86Vcvtudq2ps | Opcode::X86ElfTlsGetAddr | Opcode::X86MachoTlsGetAddr => { diff --git a/cranelift/filetests/filetests/isa/x86/simd-avx512-conversion-binemit.clif b/cranelift/filetests/filetests/isa/x86/simd-avx512-conversion-binemit.clif index 37abef0e61c9..ca7cf9affa3d 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-avx512-conversion-binemit.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-avx512-conversion-binemit.clif @@ -7,3 +7,9 @@ block0(v0: i32x4 [%xmm2]): [-, %xmm6] v1 = x86_vcvtudq2ps v0 ; bin: 62 f1 7f 08 7a f2 return } + +function %fcvt_to_uint(f32x4) { +block0(v0: f32x4 [%xmm2]): +[-, %xmm6] v1 = x86_vcvtps2udq v0 ; bin: 62 f1 7c 08 79 f2 + return +} From 192ea42b1129c432429d7ac7c97f151441888869 Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Wed, 3 Jun 2020 12:18:02 -0700 Subject: [PATCH 2/4] Add x86 legalization for fcvt_to_uint_sat.i32x4 This converts an `f32x4` into an `i32x4` (unsigned) with some rounding either by using an AVX512VL/F instruction--VCVTPS2UDQ--or a long sequence of SSE4.1 compatible instructions. --- .../codegen/meta/src/isa/x86/legalize.rs | 2 + cranelift/codegen/meta/src/isa/x86/mod.rs | 2 + cranelift/codegen/src/isa/x86/enc_tables.rs | 83 +++++++++++++++++++ .../x86/simd-avx512-conversion-legalize.clif | 7 ++ .../isa/x86/simd-conversion-legalize.clif | 20 +++++ .../isa/x86/simd-conversion-run.clif | 9 ++ 6 files changed, 123 insertions(+) diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs index 51453322e9ba..92522794c32e 100644 --- a/cranelift/codegen/meta/src/isa/x86/legalize.rs +++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs @@ -383,6 +383,7 @@ fn define_simd( let fcmp = insts.by_name("fcmp"); let fcvt_from_uint = insts.by_name("fcvt_from_uint"); let fcvt_to_sint_sat = insts.by_name("fcvt_to_sint_sat"); + let fcvt_to_uint_sat = insts.by_name("fcvt_to_uint_sat"); let fmax = insts.by_name("fmax"); let fmin = insts.by_name("fmin"); let fneg = insts.by_name("fneg"); @@ -797,4 +798,5 @@ fn define_simd( narrow_avx.custom_legalize(imul, "convert_i64x2_imul"); narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector"); + narrow_avx.custom_legalize(fcvt_to_uint_sat, "expand_fcvt_to_uint_sat_vector"); } diff --git a/cranelift/codegen/meta/src/isa/x86/mod.rs b/cranelift/codegen/meta/src/isa/x86/mod.rs index 8d2e33be732c..56f35770a85d 100644 --- a/cranelift/codegen/meta/src/isa/x86/mod.rs +++ b/cranelift/codegen/meta/src/isa/x86/mod.rs @@ -47,6 +47,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa { x86_32.legalize_value_type(ReferenceType(R32), x86_expand); x86_32.legalize_type(F32, x86_expand); x86_32.legalize_type(F64, x86_expand); + x86_32.legalize_value_type(VectorType::new(I32.into(), 4), x86_narrow_avx); x86_32.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx); x86_32.legalize_value_type(VectorType::new(F32.into(), 4), x86_narrow_avx); @@ -60,6 +61,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa { x86_64.legalize_value_type(ReferenceType(R64), x86_expand); x86_64.legalize_type(F32, x86_expand); x86_64.legalize_type(F64, x86_expand); + x86_64.legalize_value_type(VectorType::new(I32.into(), 4), x86_narrow_avx); x86_64.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx); x86_64.legalize_value_type(VectorType::new(F32.into(), 4), x86_narrow_avx); diff --git a/cranelift/codegen/src/isa/x86/enc_tables.rs b/cranelift/codegen/src/isa/x86/enc_tables.rs index b64140cfa896..bd4db3a23a90 100644 --- a/cranelift/codegen/src/isa/x86/enc_tables.rs +++ b/cranelift/codegen/src/isa/x86/enc_tables.rs @@ -1313,6 +1313,89 @@ fn expand_fcvt_to_uint_sat( cfg.recompute_block(pos.func, done); } +// Lanes of an I32x4 filled with the max signed integer values converted to an F32x4. +static MAX_SIGNED_I32X4S_AS_F32X4S: [u8; 16] = [ + 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x4f, +]; + +/// This legalization converts a vector of 32-bit floating point lanes to unsigned integer lanes +/// using VCVTPS2UDQ if AVX512 is available or to a longer sequence of NaN quieting and truncation +/// otherwise. This logic is separate from [expand_fcvt_to_uint_sat] above (the scalar version), +/// only due to how the transform groups are set up; TODO if we change the SIMD legalization groups, +/// then this logic could be merged into [expand_fcvt_to_uint_sat] (see +/// https://github.com/bytecodealliance/wasmtime/issues/1745). +fn expand_fcvt_to_uint_sat_vector( + inst: ir::Inst, + func: &mut ir::Function, + _cfg: &mut ControlFlowGraph, + isa: &dyn TargetIsa, +) { + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + + if let ir::InstructionData::Unary { + opcode: ir::Opcode::FcvtToUintSat, + arg, + } = pos.func.dfg[inst] + { + let controlling_type = pos.func.dfg.ctrl_typevar(inst); + if controlling_type == I32X4 { + debug_assert_eq!(pos.func.dfg.value_type(arg), F32X4); + let x86_isa = isa + .as_any() + .downcast_ref::() + .expect("the target ISA must be x86 at this point"); + if x86_isa.isa_flags.use_avx512vl_simd() || x86_isa.isa_flags.use_avx512f_simd() { + // If certain AVX512 features are available, we can emit a single instruction. + pos.func.dfg.replace(inst).x86_vcvtps2udq(arg); + } else { + // Otherwise, we must both quiet any NaNs--setting that lane to 0--and saturate any + // lanes that might overflow during conversion to the highest/lowest integer + // allowed in that lane. + let zeroes_constant = pos.func.dfg.constants.insert(vec![0x00; 16].into()); + let max_signed_constant = pos + .func + .dfg + .constants + .insert(MAX_SIGNED_I32X4S_AS_F32X4S.as_ref().into()); + let zeroes = pos.ins().vconst(F32X4, zeroes_constant); + let max_signed = pos.ins().vconst(F32X4, max_signed_constant); + // Clamp the input to 0 for negative floating point numbers. TODO we need to + // convert NaNs to 0 but this doesn't do that? + let ge_zero = pos.ins().x86_fmax(arg, zeroes); + // Find lanes that exceed the max signed value that CVTTPS2DQ knows how to convert. + // For floating point numbers above this, CVTTPS2DQ returns the undefined value + // 0x80000000. + let minus_max_signed = pos.ins().fsub(ge_zero, max_signed); + let le_max_signed = + pos.ins() + .fcmp(FloatCC::LessThanOrEqual, max_signed, minus_max_signed); + // Identify lanes that have minus_max_signed > max_signed || minus_max_signed < 0. + // These lanes have the MSB set to 1 after the XOR. We are trying to calculate a + // valid, in-range addend. + let minus_max_signed_as_int = pos.ins().x86_cvtt2si(I32X4, minus_max_signed); + let le_max_signed_as_int = pos.ins().raw_bitcast(I32X4, le_max_signed); + let difference = pos + .ins() + .bxor(minus_max_signed_as_int, le_max_signed_as_int); + // Calculate amount to add above 0x7FFFFFF, zeroing out any lanes identified + // previously (MSB set to 1). + let zeroes_as_int = pos.ins().raw_bitcast(I32X4, zeroes); + let addend = pos.ins().x86_pmaxs(difference, zeroes_as_int); + // Convert the original clamped number to an integer and add back in the addend + // (the part of the value above 0x7FFFFFF, since CVTTPS2DQ overflows with these). + let converted = pos.ins().x86_cvtt2si(I32X4, ge_zero); + pos.func.dfg.replace(inst).iadd(converted, addend); + } + } else { + unreachable!( + "{} should not be legalized in expand_fcvt_to_uint_sat_vector", + pos.func.dfg.display_inst(inst, None) + ) + } + } +} + /// Convert shuffle instructions. fn convert_shuffle( inst: ir::Inst, diff --git a/cranelift/filetests/filetests/isa/x86/simd-avx512-conversion-legalize.clif b/cranelift/filetests/filetests/isa/x86/simd-avx512-conversion-legalize.clif index 78dc1cf2200e..a3fc5f6be56d 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-avx512-conversion-legalize.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-avx512-conversion-legalize.clif @@ -8,3 +8,10 @@ block0(v0:i32x4): ; check: v1 = x86_vcvtudq2ps v0 return v1 } + +function %fcvt_to_uint_sat(f32x4) -> i32x4 { +block0(v0:f32x4): + v1 = fcvt_to_uint_sat.i32x4 v0 + ; check: v1 = x86_vcvtps2udq v0 + return v1 +} diff --git a/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize.clif b/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize.clif index 912c34d0fcf2..ccea16de2c20 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize.clif @@ -32,3 +32,23 @@ block0(v0:f32x4): ; nextln: v1 = bxor v7, v9 return v1 } + +function %fcvt_to_uint_sat(f32x4) -> i32x4 { +; check: const0 = 0x00000000000000000000000000000000 +; nextln: const1 = 0x4f0000004f0000004f0000004f000000 +block0(v0:f32x4): + v1 = fcvt_to_uint_sat.i32x4 v0 + ; check: v2 = vconst.f32x4 const0 + ; nextln: v3 = vconst.f32x4 const1 + ; nextln: v4 = x86_fmax v0, v2 + ; nextln: v5 = fsub v4, v3 + ; nextln: v6 = fcmp le v3, v5 + ; nextln: v7 = x86_cvtt2si.i32x4 v5 + ; nextln: v8 = raw_bitcast.i32x4 v6 + ; nextln: v9 = bxor v7, v8 + ; nextln: v10 = raw_bitcast.i32x4 v2 + ; nextln: v11 = x86_pmaxs v9, v10 + ; nextln: v12 = x86_cvtt2si.i32x4 v4 + ; nextln: v1 = iadd v12, v11 + return v1 +} diff --git a/cranelift/filetests/filetests/isa/x86/simd-conversion-run.clif b/cranelift/filetests/filetests/isa/x86/simd-conversion-run.clif index 8764bceabc11..0ca5e2022b5d 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-conversion-run.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-conversion-run.clif @@ -28,3 +28,12 @@ block0(v0:f32x4): } ; run: %fcvt_to_sint_sat([0x0.0 -0x1.0 0x1.0 0x1.0p100]) == [0 -1 1 0x7FFFFFFF] ; run: %fcvt_to_sint_sat([-0x8.1 0x0.0 0x0.0 -0x1.0p100]) == [-8 0 0 0x80000000] + +function %fcvt_to_uint_sat(f32x4) -> i32x4 { +block0(v0:f32x4): + v1 = fcvt_to_uint_sat.i32x4 v0 + return v1 +} +; run: %fcvt_to_uint_sat([0x1.0 0x4.2 0x4.6 0x1.0p100]) == [1 4 4 0xFFFFFFFF] +; run: %fcvt_to_uint_sat([-0x8.1 -0x0.0 0x0.0 -0x1.0p100]) == [0 0 0 0] +; run: %fcvt_to_uint_sat([0xB2D05E00.0 0.0 0.0 0.0]) == [3000000000 0 0 0] From 519e63ed1edb520dc2ed9c0b31eec21c990176cf Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Wed, 3 Jun 2020 12:19:04 -0700 Subject: [PATCH 3/4] Translate Wasm's i32x4.trunc_sat_f32x4_u to Cranelift's fcvt_to_uint_sat.i32x4 --- cranelift/wasm/src/code_translator.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs index 023961035b6e..0a9d2709d675 100644 --- a/cranelift/wasm/src/code_translator.rs +++ b/cranelift/wasm/src/code_translator.rs @@ -1559,8 +1559,11 @@ pub fn translate_operator( let a = pop1_with_bitcast(state, F32X4, builder); state.push1(builder.ins().fcvt_to_sint_sat(I32X4, a)) } - Operator::I32x4TruncSatF32x4U - | Operator::I8x16NarrowI16x8S { .. } + Operator::I32x4TruncSatF32x4U => { + let a = pop1_with_bitcast(state, F32X4, builder); + state.push1(builder.ins().fcvt_to_uint_sat(I32X4, a)) + } + Operator::I8x16NarrowI16x8S { .. } | Operator::I8x16NarrowI16x8U { .. } | Operator::I16x8NarrowI32x4S { .. } | Operator::I16x8NarrowI32x4U { .. } From 229ed9da85fec59c031b425b6874a9711b511db1 Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Wed, 3 Jun 2020 12:20:44 -0700 Subject: [PATCH 4/4] Enable more SIMD spec tests --- build.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/build.rs b/build.rs index fa89812ed91b..5fe36ad302b2 100644 --- a/build.rs +++ b/build.rs @@ -197,8 +197,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { ("simd", _) if target.contains("aarch64") => return true, ("simd", "simd_conversions") => return true, // FIXME Unsupported feature: proposed SIMD operator I32x4TruncSatF32x4S - ("simd", "simd_load") => return true, // FIXME Unsupported feature: proposed SIMD operator I32x4TruncSatF32x4S - ("simd", "simd_splat") => return true, // FIXME Unsupported feature: proposed SIMD operator I32x4TruncSatF32x4S // Still working on implementing these. See #929. ("reference_types", "global")