Fold fcvt_low_from_uint into previously existing clif instructions

bytecodealliance · Jul 8, 2021 · 81b4245 · 81b4245
1 parent 18ea7ea
commit 81b4245
Show file tree

Hide file tree

Showing 6 changed files with 70 additions and 79 deletions.
diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -4457,28 +4457,6 @@ pub(crate) fn define(
         .operands_out(vec![a]),
     );
 
-    ig.push(
-        Inst::new(
-            "fcvt_low_from_uint",
-            r#"
-
-        Converts packed unsigned 32-bit integers to packed double precision floating point.
-
-        Considering only the low half of the register, each lane in `x` is interpreted as a
-        unsigned 32-bit integer that is then converted to a double precision float. This
-        instruction differs from fcvt_from_uint in that it converts half the number of lanes
-        which are converted to occupy twice the number of bits. No rounding should be needed
-        for the resulting float.
-
-        The result type will have half the number of vector lanes as the input.
-
-        "#,
-            &formats.unary,
-        )
-        .operands_in(vec![x])
-        .operands_out(vec![a]),
-    );
-
     let WideInt = &TypeVar::new(
         "WideInt",
         "An integer type with lanes from `i16` upwards",

diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -3557,7 +3557,6 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
         Opcode::ConstAddr
         | Opcode::FcvtLowFromSint
-        | Opcode::FcvtLowFromUint
         | Opcode::Fvdemote
         | Opcode::FvpromoteLow
         | Opcode::Vconcat

diff --git a/cranelift/codegen/src/isa/s390x/lower.rs b/cranelift/codegen/src/isa/s390x/lower.rs
@@ -2867,7 +2867,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         | Opcode::UwidenHigh
         | Opcode::WideningPairwiseDotProductS
         | Opcode::SqmulRoundSat
-        | Opcode::FcvtLowFromUint
         | Opcode::FvpromoteLow
         | Opcode::Fvdemote => {
             // TODO

diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
@@ -4154,58 +4154,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 dst,
             ));
         }
-        Opcode::FcvtLowFromUint => {
-            // Algorithm uses unpcklps to help create a float that is equivalent
-            // 0x1.0p52 + double(src). 0x1.0p52 is unique because at this exponent
-            // every value of the mantissa represents a corresponding uint32 number.
-            // When we subtract 0x1.0p52 we are left with double(src).
-            let src = put_input_in_reg(ctx, inputs[0]);
-            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let uint_mask = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
-
-            ctx.emit(Inst::gen_move(dst, src, types::I32X4));
-
-            static UINT_MASK: [u8; 16] = [
-                0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                0x00, 0x00,
-            ];
-
-            let uint_mask_const = ctx.use_constant(VCodeConstantData::WellKnown(&UINT_MASK));
-
-            ctx.emit(Inst::xmm_load_const(
-                uint_mask_const,
-                uint_mask,
-                types::I32X4,
-            ));
-
-            // Creates 0x1.0p52 + double(src)
-            ctx.emit(Inst::xmm_rm_r(
-                SseOpcode::Unpcklps,
-                RegMem::from(uint_mask),
-                dst,
-            ));
-
-            static UINT_MASK_HIGH: [u8; 16] = [
-                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                0x30, 0x43,
-            ];
-
-            let uint_mask_high_const =
-                ctx.use_constant(VCodeConstantData::WellKnown(&UINT_MASK_HIGH));
-            let uint_mask_high = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
-            ctx.emit(Inst::xmm_load_const(
-                uint_mask_high_const,
-                uint_mask_high,
-                types::I32X4,
-            ));
-
-            // 0x1.0p52 + double(src) - 0x1.0p52
-            ctx.emit(Inst::xmm_rm_r(
-                SseOpcode::Subpd,
-                RegMem::from(uint_mask_high),
-                dst,
-            ));
-        }
         Opcode::FcvtFromUint => {
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let ty = ty.unwrap();
@@ -4253,6 +4201,67 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     }
                     _ => panic!("unexpected input type for FcvtFromUint: {:?}", input_ty),
                 };
+            } else if let Some(uwiden) = matches_input(ctx, inputs[0], Opcode::UwidenLow) {
+                let uwiden_input = InsnInput {
+                    insn: uwiden,
+                    input: 0,
+                };
+                let src = put_input_in_reg(ctx, uwiden_input);
+                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+                let input_ty = ctx.input_ty(uwiden, 0);
+                let output_ty = ctx.output_ty(insn, 0);
+
+                // Matches_input further obfuscates which Wasm instruction this is ultimately
+                // lowering. Check here that the types are as expected for F64x2ConvertLowI32x4U.
+                debug_assert!(input_ty == types::I32X4 || output_ty == types::F64X2);
+
+                // Algorithm uses unpcklps to help create a float that is equivalent to
+                // 0x1.0p52 + double(src). 0x1.0p52 is unique because at this exponent
+                // every value of the mantissa represents a corresponding uint32 number.
+                // When we subtract 0x1.0p52 we are left with double(src).
+                let uint_mask = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
+                ctx.emit(Inst::gen_move(dst, src, types::I32X4));
+
+                static UINT_MASK: [u8; 16] = [
+                    0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00,
+                    0x00, 0x00, 0x00,
+                ];
+
+                let uint_mask_const = ctx.use_constant(VCodeConstantData::WellKnown(&UINT_MASK));
+
+                ctx.emit(Inst::xmm_load_const(
+                    uint_mask_const,
+                    uint_mask,
+                    types::I32X4,
+                ));
+
+                // Creates 0x1.0p52 + double(src)
+                ctx.emit(Inst::xmm_rm_r(
+                    SseOpcode::Unpcklps,
+                    RegMem::from(uint_mask),
+                    dst,
+                ));
+
+                static UINT_MASK_HIGH: [u8; 16] = [
+                    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00,
+                    0x00, 0x30, 0x43,
+                ];
+
+                let uint_mask_high_const =
+                    ctx.use_constant(VCodeConstantData::WellKnown(&UINT_MASK_HIGH));
+                let uint_mask_high = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
+                ctx.emit(Inst::xmm_load_const(
+                    uint_mask_high_const,
+                    uint_mask_high,
+                    types::I32X4,
+                ));
+
+                // 0x1.0p52 + double(src) - 0x1.0p52
+                ctx.emit(Inst::xmm_rm_r(
+                    SseOpcode::Subpd,
+                    RegMem::from(uint_mask_high),
+                    dst,
+                ));
             } else {
                 assert_eq!(ctx.input_ty(insn, 0), types::I32X4);
                 let src = put_input_in_reg(ctx, inputs[0]);
@@ -4595,7 +4604,13 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                         (types::I16X8, types::I32X4) => {
                             ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxwd, RegMem::reg(src), dst));
                         }
-                        _ => unreachable!(),
+                        //(types::I32X4, types::I64X2) => {
+                        //    ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxdq, RegMem::reg(src), dst));
+                        //}
+                        _ => unreachable!(
+                            "In UwidenLow: input_ty {:?}, output_ty {:?}",
+                            input_ty, output_ty
+                        ),
                     },
                     Opcode::UwidenHigh => match (input_ty, output_ty) {
                         (types::I8X16, types::I16X8) => {

diff --git a/cranelift/interpreter/src/step.rs b/cranelift/interpreter/src/step.rs
@@ -565,7 +565,6 @@ where
         Opcode::FcvtFromUint => unimplemented!("FcvtFromUint"),
         Opcode::FcvtFromSint => unimplemented!("FcvtFromSint"),
         Opcode::FcvtLowFromSint => unimplemented!("FcvtLowFromSint"),
-        Opcode::FcvtLowFromUint => unimplemented!("FcvtLowFromUint"),
         Opcode::FvpromoteLow => unimplemented!("FvpromoteLow"),
         Opcode::Fvdemote => unimplemented!("Fvdemote"),
         Opcode::Isplit => unimplemented!("Isplit"),

diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs
@@ -1780,7 +1780,8 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
         }
         Operator::F64x2ConvertLowI32x4U => {
             let a = pop1_with_bitcast(state, I32X4, builder);
-            state.push1(builder.ins().fcvt_low_from_uint(F64X2, a));
+            let widened_a = builder.ins().uwiden_low(a);
+            state.push1(builder.ins().fcvt_from_uint(F64X2, widened_a));
         }
         Operator::F64x2PromoteLowF32x4 => {
             let a = pop1_with_bitcast(state, F32X4, builder);