bytecodealliance · abrown · Jun 3, 2020 · Jun 3, 2020 · Jun 3, 2020 · Jun 3, 2020
@@ -197,8 +197,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
             ("simd", _) if target.contains("aarch64") => return true,
 
             ("simd", "simd_conversions") => return true, // FIXME Unsupported feature: proposed SIMD operator I32x4TruncSatF32x4S
-            ("simd", "simd_load") => return true, // FIXME Unsupported feature: proposed SIMD operator I32x4TruncSatF32x4S
-            ("simd", "simd_splat") => return true, // FIXME Unsupported feature: proposed SIMD operator I32x4TruncSatF32x4S
 
             // Still working on implementing these. See #929.
             ("reference_types", "global")

@@ -1704,6 +1704,7 @@ fn define_simd(
     let x86_ptest = x86.by_name("x86_ptest");
     let x86_punpckh = x86.by_name("x86_punpckh");
     let x86_punpckl = x86.by_name("x86_punpckl");
+    let x86_vcvtps2udq = x86.by_name("x86_vcvtps2udq");
     let x86_vcvtudq2ps = x86.by_name("x86_vcvtudq2ps");
 
     // Shorthands for recipes.
@@ -1951,6 +1952,12 @@ fn define_simd(
             Some(use_avx512vl_simd), // TODO need an OR predicate to join with AVX512F
         );
 
+        e.enc_32_64_maybe_isap(
+            x86_vcvtps2udq,
+            rec_evex_reg_rm_128.opcodes(&VCVTPS2UDQ),
+            Some(use_avx512vl_simd), // TODO need an OR predicate to join with AVX512F
+        );
+
         e.enc_both_inferred(
             x86_cvtt2si
                 .bind(vector(I32, sse_vector_size))

@@ -167,8 +167,29 @@ pub(crate) fn define(
             r#"
         Convert unsigned integer to floating point.
 
-        Convert packed doubleword unsigned integers to packed single-precision floating-point 
-        values. This instruction does not trap.
+        Convert packed doubleword unsigned integers to packed single-precision floating-point
+        values. When a conversion is inexact, the value returned is rounded according to the
+        rounding control bits in the MXCSR register or the embedded rounding control bits. This
+        instruction does not trap.
+        "#,
+            &formats.unary,
+        )
+        .operands_in(vec![x])
+        .operands_out(vec![a]),
+    );
+
+    let x = &Operand::new("x", f32x4);
+    let a = &Operand::new("a", i32x4);
+    ig.push(
+        Inst::new(
+            "x86_vcvtps2udq",
+            r#"
+        Convert floating point to unsigned integer.
+
+        Convert packed single-precision floating-point to packed doubleword unsigned integers 
+        values. When a conversion is inexact, the value returned is rounded according to the
+        rounding control bits in the MXCSR register or the embedded rounding control bits. This
+        instruction does not trap.
         "#,
             &formats.unary,
         )

@@ -383,6 +383,7 @@ fn define_simd(
     let fcmp = insts.by_name("fcmp");
     let fcvt_from_uint = insts.by_name("fcvt_from_uint");
     let fcvt_to_sint_sat = insts.by_name("fcvt_to_sint_sat");
+    let fcvt_to_uint_sat = insts.by_name("fcvt_to_uint_sat");
     let fmax = insts.by_name("fmax");
     let fmin = insts.by_name("fmin");
     let fneg = insts.by_name("fneg");
@@ -797,4 +798,5 @@ fn define_simd(
 
     narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
     narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector");
+    narrow_avx.custom_legalize(fcvt_to_uint_sat, "expand_fcvt_to_uint_sat_vector");
 }
@@ -47,6 +47,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
     x86_32.legalize_value_type(ReferenceType(R32), x86_expand);
     x86_32.legalize_type(F32, x86_expand);
     x86_32.legalize_type(F64, x86_expand);
+    x86_32.legalize_value_type(VectorType::new(I32.into(), 4), x86_narrow_avx);
     x86_32.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx);
     x86_32.legalize_value_type(VectorType::new(F32.into(), 4), x86_narrow_avx);
 
@@ -60,6 +61,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
     x86_64.legalize_value_type(ReferenceType(R64), x86_expand);
     x86_64.legalize_type(F32, x86_expand);
     x86_64.legalize_type(F64, x86_expand);
+    x86_64.legalize_value_type(VectorType::new(I32.into(), 4), x86_narrow_avx);
     x86_64.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx);
     x86_64.legalize_value_type(VectorType::new(F32.into(), 4), x86_narrow_avx);
 

@@ -684,6 +684,12 @@ pub static UCOMISS: [u8; 2] = [0x0f, 0x2e];
 /// Raise invalid opcode instruction.
 pub static UNDEFINED2: [u8; 2] = [0x0f, 0x0b];
 
+/// Convert four packed single precision floating-point values from xmm2/m128/m32bcst to four packed
+/// unsigned doubleword values in xmm1 using truncation subject to writemask k1. Rounding behavior
+/// is controlled by MXCSR but can be overriden by EVEX.L'L in static rounding mode (AVX512VL,
+/// AVX512F).
+pub static VCVTPS2UDQ: [u8; 2] = [0x0f, 0x79];
+
 /// Convert four packed unsigned doubleword integers from xmm2/m128/m32bcst to packed
 /// single-precision floating-point values in xmm1 with writemask k1. Rounding behavior
 /// is controlled by MXCSR but can be overriden by EVEX.L'L in static rounding mode

@@ -2063,6 +2063,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         | Opcode::X86Packss
         | Opcode::X86Punpckh
         | Opcode::X86Punpckl
+        | Opcode::X86Vcvtps2udq
         | Opcode::X86Vcvtudq2ps
         | Opcode::X86ElfTlsGetAddr
         | Opcode::X86MachoTlsGetAddr => {

@@ -1313,6 +1313,89 @@ fn expand_fcvt_to_uint_sat(
     cfg.recompute_block(pos.func, done);
 }
 
+// Lanes of an I32x4 filled with the max signed integer values converted to an F32x4.
+static MAX_SIGNED_I32X4S_AS_F32X4S: [u8; 16] = [
+    0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x4f,
+];
+
+/// This legalization converts a vector of 32-bit floating point lanes to unsigned integer lanes
+/// using VCVTPS2UDQ if AVX512 is available or to a longer sequence of NaN quieting and truncation
+/// otherwise. This logic is separate from [expand_fcvt_to_uint_sat] above (the scalar version),
+/// only due to how the transform groups are set up; TODO if we change the SIMD legalization groups,
+/// then this logic could be merged into [expand_fcvt_to_uint_sat] (see
+/// https://github.com/bytecodealliance/wasmtime/issues/1745).
+fn expand_fcvt_to_uint_sat_vector(
+    inst: ir::Inst,
+    func: &mut ir::Function,
+    _cfg: &mut ControlFlowGraph,
+    isa: &dyn TargetIsa,
+) {
+    let mut pos = FuncCursor::new(func).at_inst(inst);
+    pos.use_srcloc(inst);
+
+    if let ir::InstructionData::Unary {
+        opcode: ir::Opcode::FcvtToUintSat,
+        arg,
+    } = pos.func.dfg[inst]
+    {
+        let controlling_type = pos.func.dfg.ctrl_typevar(inst);
+        if controlling_type == I32X4 {
+            debug_assert_eq!(pos.func.dfg.value_type(arg), F32X4);
+            let x86_isa = isa
+                .as_any()
+                .downcast_ref::<isa::x86::Isa>()
+                .expect("the target ISA must be x86 at this point");
+            if x86_isa.isa_flags.use_avx512vl_simd() || x86_isa.isa_flags.use_avx512f_simd() {
+                // If certain AVX512 features are available, we can emit a single instruction.
+                pos.func.dfg.replace(inst).x86_vcvtps2udq(arg);
+            } else {
+                // Otherwise, we must both quiet any NaNs--setting that lane to 0--and saturate any
+                // lanes that might overflow during conversion to the highest/lowest integer
+                // allowed in that lane.
+                let zeroes_constant = pos.func.dfg.constants.insert(vec![0x00; 16].into());
+                let max_signed_constant = pos
+                    .func
+                    .dfg
+                    .constants
+                    .insert(MAX_SIGNED_I32X4S_AS_F32X4S.as_ref().into());
+                let zeroes = pos.ins().vconst(F32X4, zeroes_constant);
+                let max_signed = pos.ins().vconst(F32X4, max_signed_constant);
+                // Clamp the input to 0 for negative floating point numbers. TODO we need to
+                // convert NaNs to 0 but this doesn't do that?
+                let ge_zero = pos.ins().x86_fmax(arg, zeroes);
+                // Find lanes that exceed the max signed value that CVTTPS2DQ knows how to convert.
+                // For floating point numbers above this, CVTTPS2DQ returns the undefined value
+                // 0x80000000.
+                let minus_max_signed = pos.ins().fsub(ge_zero, max_signed);
+                let le_max_signed =
+                    pos.ins()
+                        .fcmp(FloatCC::LessThanOrEqual, max_signed, minus_max_signed);
+                // Identify lanes that have minus_max_signed > max_signed || minus_max_signed < 0.
+                // These lanes have the MSB set to 1 after the XOR. We are trying to calculate a
+                // valid, in-range addend.
+                let minus_max_signed_as_int = pos.ins().x86_cvtt2si(I32X4, minus_max_signed);
+                let le_max_signed_as_int = pos.ins().raw_bitcast(I32X4, le_max_signed);
+                let difference = pos
+                    .ins()
+                    .bxor(minus_max_signed_as_int, le_max_signed_as_int);
+                // Calculate amount to add above 0x7FFFFFF, zeroing out any lanes identified
+                // previously (MSB set to 1).
+                let zeroes_as_int = pos.ins().raw_bitcast(I32X4, zeroes);
+                let addend = pos.ins().x86_pmaxs(difference, zeroes_as_int);
+                // Convert the original clamped number to an integer and add back in the addend
+                // (the part of the value above 0x7FFFFFF, since CVTTPS2DQ overflows with these).
+                let converted = pos.ins().x86_cvtt2si(I32X4, ge_zero);
+                pos.func.dfg.replace(inst).iadd(converted, addend);
+            }
+        } else {
+            unreachable!(
+                "{} should not be legalized in expand_fcvt_to_uint_sat_vector",
+                pos.func.dfg.display_inst(inst, None)
+            )
+        }
+    }
+}
+
 /// Convert shuffle instructions.
 fn convert_shuffle(
     inst: ir::Inst,

@@ -7,3 +7,9 @@ block0(v0: i32x4 [%xmm2]):
 [-, %xmm6]  v1 = x86_vcvtudq2ps v0 ; bin: 62 f1 7f 08 7a f2
     return
 }
+
+function %fcvt_to_uint(f32x4) {
+block0(v0: f32x4 [%xmm2]):
+[-, %xmm6]  v1 = x86_vcvtps2udq v0 ; bin: 62 f1 7c 08 79 f2
+    return
+}
@@ -8,3 +8,10 @@ block0(v0:i32x4):
     ; check: v1 = x86_vcvtudq2ps v0
     return v1
 }
+
+function %fcvt_to_uint_sat(f32x4) -> i32x4 {
+block0(v0:f32x4):
+    v1 = fcvt_to_uint_sat.i32x4 v0
+    ; check: v1 = x86_vcvtps2udq v0
+    return v1
+}
@@ -32,3 +32,23 @@ block0(v0:f32x4):
     ; nextln: v1 = bxor v7, v9
     return v1
 }
+
+function %fcvt_to_uint_sat(f32x4) -> i32x4 {
+; check: const0 = 0x00000000000000000000000000000000
+; nextln: const1 = 0x4f0000004f0000004f0000004f000000
+block0(v0:f32x4):
+    v1 = fcvt_to_uint_sat.i32x4 v0
+    ; check: v2 = vconst.f32x4 const0
+    ; nextln: v3 = vconst.f32x4 const1
+    ; nextln: v4 = x86_fmax v0, v2
+    ; nextln: v5 = fsub v4, v3
+    ; nextln: v6 = fcmp le v3, v5
+    ; nextln: v7 = x86_cvtt2si.i32x4 v5
+    ; nextln: v8 = raw_bitcast.i32x4 v6
+    ; nextln: v9 = bxor v7, v8
+    ; nextln: v10 = raw_bitcast.i32x4 v2
+    ; nextln: v11 = x86_pmaxs v9, v10
+    ; nextln: v12 = x86_cvtt2si.i32x4 v4
+    ; nextln: v1 = iadd v12, v11
+    return v1
+}
@@ -28,3 +28,12 @@ block0(v0:f32x4):
 }
 ; run: %fcvt_to_sint_sat([0x0.0 -0x1.0 0x1.0 0x1.0p100]) == [0 -1 1 0x7FFFFFFF]
 ; run: %fcvt_to_sint_sat([-0x8.1 0x0.0 0x0.0 -0x1.0p100]) == [-8 0 0 0x80000000]
+
+function %fcvt_to_uint_sat(f32x4) -> i32x4 {
+block0(v0:f32x4):
+    v1 = fcvt_to_uint_sat.i32x4 v0
+    return v1
+}
+; run: %fcvt_to_uint_sat([0x1.0 0x4.2 0x4.6 0x1.0p100]) == [1 4 4 0xFFFFFFFF]
+; run: %fcvt_to_uint_sat([-0x8.1 -0x0.0 0x0.0 -0x1.0p100]) == [0 0 0 0]
+; run: %fcvt_to_uint_sat([0xB2D05E00.0 0.0 0.0 0.0]) == [3000000000 0 0 0]
@@ -1559,8 +1559,11 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
             let a = pop1_with_bitcast(state, F32X4, builder);
             state.push1(builder.ins().fcvt_to_sint_sat(I32X4, a))
         }
-        Operator::I32x4TruncSatF32x4U
-        | Operator::I8x16NarrowI16x8S { .. }
+        Operator::I32x4TruncSatF32x4U => {
+            let a = pop1_with_bitcast(state, F32X4, builder);
+            state.push1(builder.ins().fcvt_to_uint_sat(I32X4, a))
+        }
+        Operator::I8x16NarrowI16x8S { .. }
         | Operator::I8x16NarrowI16x8U { .. }
         | Operator::I16x8NarrowI32x4S { .. }
         | Operator::I16x8NarrowI32x4U { .. }