diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle
index bc6771d35fa9..5630a97a5faa 100644
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -1658,6 +1658,10 @@
 (rule (x64_movdqu from)
       (xmm_unary_rm_r (SseOpcode.Movdqu) from))

+(decl x64_movapd (XmmMem) Xmm)
+(rule (x64_movapd src)
+      (xmm_unary_rm_r (SseOpcode.Movapd) src))
+
 (decl x64_pmovsxbw (XmmMem) Xmm)
 (rule (x64_pmovsxbw from)
       (xmm_unary_rm_r (SseOpcode.Pmovsxbw) from))
@@ -2272,6 +2276,11 @@
 (rule (x64_punpcklwd src1 src2)
       (xmm_rm_r $I16X8 (SseOpcode.Punpcklwd) src1 src2))

+;; Helper for creating `unpcklps` instructions.
+(decl x64_unpcklps (Xmm XmmMem) Xmm)
+(rule (x64_unpcklps src1 src2)
+      (xmm_rm_r $I16X8 (SseOpcode.Unpcklps) src1 src2))
+
 ;; Helper for creating `andnps` instructions.
 (decl x64_andnps (Xmm XmmMem) Xmm)
 (rule (x64_andnps src1 src2)
@@ -2624,6 +2633,11 @@
         (_ Unit (emit (MInst.XmmUnaryRmREvex op src dst))))
     dst))

+;; Helper for creating `vcvtudq2ps` instructions.
+(decl x64_vcvtudq2ps (XmmMem) Xmm)
+(rule (x64_vcvtudq2ps src)
+      (xmm_unary_rm_r_evex (Avx512Opcode.Vcvtudq2ps) src))
+
 ;; Helper for creating `vpabsq` instructions.
 (decl x64_vpabsq (XmmMem) Xmm)
 (rule (x64_vpabsq src)
@@ -3014,6 +3028,23 @@
         (_ Unit (emit (MInst.GprToXmm (SseOpcode.Cvtsi2sd) x dst size))))
     dst))

+(decl cvt_u64_to_float_seq (Type Gpr) Xmm)
+(rule (cvt_u64_to_float_seq ty src)
+      (let ((size OperandSize (raw_operand_size_of_type ty))
+            (src_copy WritableGpr (temp_writable_gpr))
+            (dst WritableXmm (temp_writable_xmm))
+            (tmp_gpr1 WritableGpr (temp_writable_gpr))
+            (tmp_gpr2 WritableGpr (temp_writable_gpr))
+            (_ Unit (emit (gen_move $I64 src_copy src)))
+            (_ Unit (emit (MInst.CvtUint64ToFloatSeq size src_copy dst tmp_gpr1 tmp_gpr2))))
+        dst))
+
+(decl fcvt_uint_mask_const () VCodeConstant)
+(extern constructor fcvt_uint_mask_const fcvt_uint_mask_const)
+
+(decl fcvt_uint_mask_high_const () VCodeConstant)
+(extern constructor fcvt_uint_mask_high_const fcvt_uint_mask_high_const)
+
 ;; Helpers for creating `pcmpeq*` instructions.
 (decl x64_pcmpeq (Type Xmm XmmMem) Xmm)
 (rule (x64_pcmpeq $I8X16 x y) (x64_pcmpeqb x y))
diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
index 088c0bd15f39..c8e44d3758e4 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -26,6 +26,16 @@ impl Inst {
             dst: WritableGpr::from_writable_reg(src).unwrap(),
         }
     }
+
+    fn xmm_unary_rm_r_evex(op: Avx512Opcode, src: RegMem, dst: Writable<Reg>) -> Inst {
+        src.assert_regclass_is(RegClass::Float);
+        debug_assert!(dst.to_reg().class() == RegClass::Float);
+        Inst::XmmUnaryRmREvex {
+            op,
+            src: XmmMem::new(src).unwrap(),
+            dst: WritableXmm::from_writable_reg(dst).unwrap(),
+        }
+    }
 }

 #[test]
diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs
index 4b2a02a02c01..267d664bed2e 100644
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -307,16 +307,6 @@ impl Inst {
         }
     }

-    pub(crate) fn xmm_unary_rm_r_evex(op: Avx512Opcode, src: RegMem, dst: Writable<Reg>) -> Inst {
-        src.assert_regclass_is(RegClass::Float);
-        debug_assert!(dst.to_reg().class() == RegClass::Float);
-        Inst::XmmUnaryRmREvex {
-            op,
-            src: XmmMem::new(src).unwrap(),
-            dst: WritableXmm::from_writable_reg(dst).unwrap(),
-        }
-    }
-
     pub(crate) fn xmm_rm_r(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Self {
         src.assert_regclass_is(RegClass::Float);
         debug_assert!(dst.to_reg().class() == RegClass::Float);
@@ -417,27 +407,6 @@
         Inst::XmmCmpRmR { op, src, dst }
     }

-    pub(crate) fn cvt_u64_to_float_seq(
-        dst_size: OperandSize,
-        src: Writable<Reg>,
-        tmp_gpr1: Writable<Reg>,
-        tmp_gpr2: Writable<Reg>,
-        dst: Writable<Reg>,
-    ) -> Inst {
-        debug_assert!(dst_size.is_one_of(&[OperandSize::Size32, OperandSize::Size64]));
-        debug_assert!(src.to_reg().class() == RegClass::Int);
-        debug_assert!(tmp_gpr1.to_reg().class() == RegClass::Int);
-        debug_assert!(tmp_gpr2.to_reg().class() == RegClass::Int);
-        debug_assert!(dst.to_reg().class() == RegClass::Float);
-        Inst::CvtUint64ToFloatSeq {
-            src: WritableGpr::from_writable_reg(src).unwrap(),
-            dst: WritableXmm::from_writable_reg(dst).unwrap(),
-            tmp_gpr1: WritableGpr::from_writable_reg(tmp_gpr1).unwrap(),
-            tmp_gpr2: WritableGpr::from_writable_reg(tmp_gpr2).unwrap(),
-            dst_size,
-        }
-    }
-
     pub(crate) fn cvt_float_to_sint_seq(
         src_size: OperandSize,
         dst_size: OperandSize,
diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index 01915b35a900..e6fd35c45a82 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -3013,3 +3013,76 @@

 (rule (lower (fcvt_low_from_sint a @ (value_type ty)))
       (x64_cvtdq2pd ty a))
+
+;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (has_type $F32 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty))))))
+      (x64_cvtsi2ss $I64 (extend_to_gpr val $I64 (ExtendKind.Zero))))
+
+(rule (lower (has_type $F64 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty))))))
+      (x64_cvtsi2sd $I64 (extend_to_gpr val $I64 (ExtendKind.Zero))))
+
+(rule (lower (has_type ty (fcvt_from_uint val @ (value_type $I64))))
+      (cvt_u64_to_float_seq ty val))
+
+;; The algorithm uses `unpcklps` to help create a float that is equivalent to
+;; 0x1.0p52 + double(src). 0x1.0p52 is unique because at this exponent
+;; every value of the mantissa represents a corresponding uint32 number.
+;; When we subtract 0x1.0p52 we are left with double(src).
+(rule (lower (has_type $F64X2 (fcvt_from_uint (uwiden_low val @ (value_type $I32X4)))))
+      (let ((uint_mask Xmm (x64_xmm_load_const $I32X4 (fcvt_uint_mask_const)))
+            (res Xmm (x64_unpcklps val uint_mask))
+            (uint_mask_high Xmm (x64_xmm_load_const $I32X4 (fcvt_uint_mask_high_const))))
+        (x64_subpd res uint_mask_high)))
+
+;; When AVX512VL and AVX512F are available,
+;; `fcvt_from_uint` can be lowered to a single instruction.
+;;
+;; NOTE: the priority of 1 here is to break ties with the next case for $F32X4,
+;; as it doesn't require either of the AVX-512 extensions to be enabled.
+(rule 1 (lower (has_type (and (avx512vl_enabled) (avx512f_enabled) $F32X4)
+                         (fcvt_from_uint src)))
+      (x64_vcvtudq2ps src))
+
+;; Converting packed unsigned integers to packed floats
+;; requires a few steps. There is no single-instruction
+;; lowering for converting unsigned integers, but there is one for
+;; converting packed signed integers to floats (cvtdq2ps). In
+;; the steps below we isolate the upper half (16 bits) and
+;; lower half (16 bits) of each lane and then convert
+;; each half separately using cvtdq2ps, which is meant for signed
+;; integers. For this to work on the upper half we must
+;; first shift those bits right by 1 (divide by 2) to ensure
+;; the most significant bit is 0 and the value is not treated
+;; as signed, and then double the value after the conversion.
+;; Finally we add the two converted halves, where the addition
+;; rounds correctly.
+;;
+;; Sequence:
+;; -> A = 0xffffffff
+;; -> Ah = 0xffff0000
+;; -> Al = 0x0000ffff
+;; -> Convert(Al) // Convert int to float
+;; -> Ah = Ah >> 1 // Shift right 1 to ensure Ah conversion isn't treated as signed
+;; -> Convert(Ah) // Convert .. with no loss of significant digits from previous shift
+;; -> Ah = Ah + Ah // Double Ah to account for the shift right before the conversion.
+;; -> dst = Ah + Al // Add the two floats together
+(rule (lower (has_type $F32X4 (fcvt_from_uint a)))
+      (let (;; get the low 16 bits
+            (a_lo Xmm (x64_pslld a (RegMemImm.Imm 16)))
+            (a_lo Xmm (x64_psrld a_lo (RegMemImm.Imm 16)))
+
+            ;; get the high 16 bits
+            (a_hi Xmm (x64_psubd a a_lo))
+
+            ;; convert the low 16 bits
+            (a_lo Xmm (x64_cvtdq2ps a_lo))
+
+            ;; shift the high bits by 1, convert, and double to get the correct
+            ;; value
+            (a_hi Xmm (x64_psrld a_hi (RegMemImm.Imm 1)))
+            (a_hi Xmm (x64_cvtdq2ps a_hi))
+            (a_hi Xmm (x64_addps a_hi a_hi)))
+
+        ;; add together the two converted values
+        (x64_addps a_hi a_lo)))
diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index ebf2eca43572..0ad745c17b8f 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -166,57 +166,6 @@ fn input_to_reg_mem<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> RegM
     )
 }

-/// An extension specification for `extend_input_to_reg`.
-#[derive(Clone, Copy)]
-enum ExtSpec {
-    #[allow(dead_code)]
-    ZeroExtendTo32,
-    ZeroExtendTo64,
-    #[allow(dead_code)]
-    SignExtendTo32,
-    #[allow(dead_code)] // not used just yet but may be used in the future!
-    SignExtendTo64,
-}
-
-/// Put the given input into a register, marking it as used, and do a zero- or signed- extension if
-/// required. (This obviously causes side-effects.)
-fn extend_input_to_reg<C: LowerCtx<I = Inst>>(
-    ctx: &mut C,
-    spec: InsnInput,
-    ext_spec: ExtSpec,
-) -> Reg {
-    let requested_size = match ext_spec {
-        ExtSpec::ZeroExtendTo32 | ExtSpec::SignExtendTo32 => 32,
-        ExtSpec::ZeroExtendTo64 | ExtSpec::SignExtendTo64 => 64,
-    };
-    let input_size = ctx.input_ty(spec.insn, spec.input).bits();
-
-    let requested_ty = if requested_size == 32 {
-        types::I32
-    } else {
-        types::I64
-    };
-
-    let ext_mode = match (input_size, requested_size) {
-        (a, b) if a == b => return put_input_in_reg(ctx, spec),
-        (1, 8) => return put_input_in_reg(ctx, spec),
-        (a, b) => ExtMode::new(a.try_into().unwrap(), b.try_into().unwrap())
-            .unwrap_or_else(|| panic!("invalid extension: {} -> {}", a, b)),
-    };
-
-    let src = input_to_reg_mem(ctx, spec);
-    let dst = ctx.alloc_tmp(requested_ty).only_reg().unwrap();
-    match ext_spec {
-        ExtSpec::ZeroExtendTo32 | ExtSpec::ZeroExtendTo64 => {
-            ctx.emit(Inst::movzx_rm_r(ext_mode, src, dst))
-        }
-        ExtSpec::SignExtendTo32 | ExtSpec::SignExtendTo64 => {
-            ctx.emit(Inst::movsx_rm_r(ext_mode, src, dst))
-        }
-    }
-    dst.to_reg()
-}
-
 fn input_to_imm<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> Option<u64> {
     ctx.get_input_as_source_or_const(spec.insn, spec.input)
         .constant
@@ -629,207 +578,11 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         | Opcode::Selectif
         | Opcode::SelectifSpectreGuard
         | Opcode::FcvtFromSint
-        | Opcode::FcvtLowFromSint => {
+        | Opcode::FcvtLowFromSint
+        | Opcode::FcvtFromUint => {
             implemented_in_isle(ctx);
         }

-        Opcode::FcvtFromUint => {
-            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let ty = ty.unwrap();
-            let input_ty = ctx.input_ty(insn, 0);
-            let output_ty = ctx.output_ty(insn, 0);
-
-            if !ty.is_vector() {
-                match input_ty {
-                    types::I8 | types::I16 | types::I32 => {
-                        // Conversion from an unsigned int smaller than 64-bit is easy: zero-extend +
-                        // do a signed conversion (which won't overflow).
-                        let opcode = if ty == types::F32 {
-                            SseOpcode::Cvtsi2ss
-                        } else {
-                            assert_eq!(ty, types::F64);
-                            SseOpcode::Cvtsi2sd
-                        };
-
-                        let src = RegMem::reg(extend_input_to_reg(
-                            ctx,
-                            inputs[0],
-                            ExtSpec::ZeroExtendTo64,
-                        ));
-                        ctx.emit(Inst::gpr_to_xmm(opcode, src, OperandSize::Size64, dst));
-                    }
-
-                    types::I64 => {
-                        let src = put_input_in_reg(ctx, inputs[0]);
-
-                        let src_copy = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-                        ctx.emit(Inst::gen_move(src_copy, src, types::I64));
-
-                        let tmp_gpr1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-                        let tmp_gpr2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-                        ctx.emit(Inst::cvt_u64_to_float_seq(
-                            if ty == types::F64 {
-                                OperandSize::Size64
-                            } else {
-                                OperandSize::Size32
-                            },
-                            src_copy,
-                            tmp_gpr1,
-                            tmp_gpr2,
-                            dst,
-                        ));
-                    }
-                    _ => panic!("unexpected input type for FcvtFromUint: {:?}", input_ty),
-                };
-            } else if output_ty == types::F64X2 {
-                if let Some(uwiden) = matches_input(ctx, inputs[0], Opcode::UwidenLow) {
-                    let uwiden_input = InsnInput {
-                        insn: uwiden,
-                        input: 0,
-                    };
-                    let src = put_input_in_reg(ctx, uwiden_input);
-                    let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-                    let input_ty = ctx.input_ty(uwiden, 0);
-
-                    // Matches_input further obfuscates which Wasm instruction this is ultimately
-                    // lowering. Check here that the types are as expected for F64x2ConvertLowI32x4U.
-                    debug_assert!(input_ty == types::I32X4);
-
-                    // Algorithm uses unpcklps to help create a float that is equivalent
-                    // 0x1.0p52 + double(src). 0x1.0p52 is unique because at this exponent
-                    // every value of the mantissa represents a corresponding uint32 number.
-                    // When we subtract 0x1.0p52 we are left with double(src).
-                    let uint_mask = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
-                    ctx.emit(Inst::gen_move(dst, src, types::I32X4));
-
-                    static UINT_MASK: [u8; 16] = [
-                        0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00,
-                        0x00, 0x00, 0x00, 0x00,
-                    ];
-
-                    let uint_mask_const =
-                        ctx.use_constant(VCodeConstantData::WellKnown(&UINT_MASK));
-
-                    ctx.emit(Inst::xmm_load_const(
-                        uint_mask_const,
-                        uint_mask,
-                        types::I32X4,
-                    ));
-
-                    // Creates 0x1.0p52 + double(src)
-                    ctx.emit(Inst::xmm_rm_r(
-                        SseOpcode::Unpcklps,
-                        RegMem::from(uint_mask),
-                        dst,
-                    ));
-
-                    static UINT_MASK_HIGH: [u8; 16] = [
-                        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00,
-                        0x00, 0x00, 0x30, 0x43,
-                    ];
-
-                    let uint_mask_high_const =
-                        ctx.use_constant(VCodeConstantData::WellKnown(&UINT_MASK_HIGH));
-                    let uint_mask_high = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
-                    ctx.emit(Inst::xmm_load_const(
-                        uint_mask_high_const,
-                        uint_mask_high,
-                        types::I32X4,
-                    ));
-
-                    // 0x1.0p52 + double(src) - 0x1.0p52
-                    ctx.emit(Inst::xmm_rm_r(
-                        SseOpcode::Subpd,
-                        RegMem::from(uint_mask_high),
-                        dst,
-                    ));
-                } else {
-                    panic!("Unsupported FcvtFromUint conversion types: {}", ty);
-                }
-            } else {
-                assert_eq!(ctx.input_ty(insn, 0), types::I32X4);
-                let src = put_input_in_reg(ctx, inputs[0]);
-                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-
-                if isa_flags.use_avx512vl_simd() && isa_flags.use_avx512f_simd() {
-                    // When AVX512VL and AVX512F are available,
-                    // `fcvt_from_uint` can be lowered to a single instruction.
-                    ctx.emit(Inst::xmm_unary_rm_r_evex(
-                        Avx512Opcode::Vcvtudq2ps,
-                        RegMem::reg(src),
-                        dst,
-                    ));
-                } else {
-                    // Converting packed unsigned integers to packed floats
-                    // requires a few steps. There is no single instruction
-                    // lowering for converting unsigned floats but there is for
-                    // converting packed signed integers to float (cvtdq2ps). In
-                    // the steps below we isolate the upper half (16 bits) and
-                    // lower half (16 bits) of each lane and then we convert
-                    // each half separately using cvtdq2ps meant for signed
-                    // integers. In order for this to work for the upper half
-                    // bits we must shift right by 1 (divide by 2) these bits in
-                    // order to ensure the most significant bit is 0 not signed,
-                    // and then after the conversion we double the value.
-                    // Finally we add the converted values where addition will
-                    // correctly round.
-                    //
-                    // Sequence:
-                    // -> A = 0xffffffff
-                    // -> Ah = 0xffff0000
-                    // -> Al = 0x0000ffff
-                    // -> Convert(Al) // Convert int to float
-                    // -> Ah = Ah >> 1 // Shift right 1 to assure Ah conversion isn't treated as signed
-                    // -> Convert(Ah) // Convert .. with no loss of significant digits from previous shift
-                    // -> Ah = Ah + Ah // Double Ah to account for shift right before the conversion.
-                    // -> dst = Ah + Al // Add the two floats together
-
-                    // Create a temporary register
-                    let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
-                    ctx.emit(Inst::xmm_unary_rm_r(
-                        SseOpcode::Movapd,
-                        RegMem::reg(src),
-                        tmp,
-                    ));
-                    ctx.emit(Inst::gen_move(dst, src, ty));
-
-                    // Get the low 16 bits
-                    ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Pslld, RegMemImm::imm(16), tmp));
-                    ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(16), tmp));
-
-                    // Get the high 16 bits
-                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Psubd, RegMem::from(tmp), dst));
-
-                    // Convert the low 16 bits
-                    ctx.emit(Inst::xmm_unary_rm_r(
-                        SseOpcode::Cvtdq2ps,
-                        RegMem::from(tmp),
-                        tmp,
-                    ));
-
-                    // Shift the high bits by 1, convert, and double to get the correct value.
-                    ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), dst));
-                    ctx.emit(Inst::xmm_unary_rm_r(
-                        SseOpcode::Cvtdq2ps,
-                        RegMem::from(dst),
-                        dst,
-                    ));
-                    ctx.emit(Inst::xmm_rm_r(
-                        SseOpcode::Addps,
-                        RegMem::reg(dst.to_reg()),
-                        dst,
-                    ));
-
-                    // Add together the two converted values.
-                    ctx.emit(Inst::xmm_rm_r(
-                        SseOpcode::Addps,
-                        RegMem::reg(tmp.to_reg()),
-                        dst,
-                    ));
-                }
-            }
-        }
-
         Opcode::FcvtToUint | Opcode::FcvtToUintSat | Opcode::FcvtToSint | Opcode::FcvtToSintSat => {
             let src = put_input_in_reg(ctx, inputs[0]);
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs
index fb9abb6319b6..8fffd3857ffe 100644
--- a/cranelift/codegen/src/isa/x64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/x64/lower/isle.rs
@@ -770,6 +770,18 @@ where
     fn jump_table_size(&mut self, targets: &BoxVecMachLabel) -> u32 {
         targets.len() as u32
     }
+
+    #[inline]
+    fn fcvt_uint_mask_const(&mut self) -> VCodeConstant {
+        self.lower_ctx
+            .use_constant(VCodeConstantData::WellKnown(&UINT_MASK))
+    }
+
+    #[inline]
+    fn fcvt_uint_mask_high_const(&mut self) -> VCodeConstant {
+        self.lower_ctx
+            .use_constant(VCodeConstantData::WellKnown(&UINT_MASK_HIGH))
+    }
 }

 impl<C> IsleContext<'_, C, Flags, IsaFlags, 6>
@@ -891,3 +903,11 @@ fn to_simm32(constant: i64) -> Option<GprMemImm> {
         None
     }
 }
+
+const UINT_MASK: [u8; 16] = [
+    0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+];
+
+const UINT_MASK_HIGH: [u8; 16] = [
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43,
+];
diff --git a/cranelift/filetests/filetests/isa/x64/fcvt-simd.clif b/cranelift/filetests/filetests/isa/x64/fcvt-simd.clif
new file mode 100644
index 000000000000..8ac1b0d94c31
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x64/fcvt-simd.clif
@@ -0,0 +1,18 @@
+test compile precise-output
+set enable_simd
+target x86_64 has_avx512vl has_avx512f
+
+function %f1(i32x4) -> f32x4 {
+block0(v0: i32x4):
+    v1 = fcvt_from_uint.f32x4 v0
+    return v1
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; vcvtudq2ps %xmm0, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
diff --git a/cranelift/filetests/filetests/isa/x64/fcvt.clif b/cranelift/filetests/filetests/isa/x64/fcvt.clif
index 65d257050d35..09d7c80336eb 100644
--- a/cranelift/filetests/filetests/isa/x64/fcvt.clif
+++ b/cranelift/filetests/filetests/isa/x64/fcvt.clif
@@ -131,3 +131,72 @@ block0(v0: i32x4):
 ; popq %rbp
 ; ret

+function %f10(i8, i16, i32, i64) -> f32 {
+block0(v0: i8, v1: i16, v2: i32, v3: i64):
+    v4 = fcvt_from_uint.f32 v0
+    v5 = fcvt_from_uint.f32 v1
+    v6 = fcvt_from_uint.f32 v2
+    v7 = fcvt_from_uint.f32 v3
+    v8 = fadd.f32 v4, v5
+    v9 = fadd.f32 v8, v6
+    v10 = fadd.f32 v9, v7
+    return v10
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movzbq %dil, %rax
+; cvtsi2ss %rax, %xmm0
+; movzwq %si, %rax
+; cvtsi2ss %rax, %xmm6
+; movl %edx, %eax
+; cvtsi2ss %rax, %xmm7
+; u64_to_f32_seq %rcx, %xmm4, %r8, %rdx
+; addss %xmm0, %xmm6, %xmm0
+; addss %xmm0, %xmm7, %xmm0
+; addss %xmm0, %xmm4, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %f11(i32x4) -> f64x2 {
+block0(v0: i32x4):
+    v1 = uwiden_low v0
+    v2 = fcvt_from_uint.f64x2 v1
+    return v2
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; load_const VCodeConstant(0), %xmm3
+; unpcklps %xmm0, %xmm3, %xmm0
+; load_const VCodeConstant(1), %xmm7
+; subpd %xmm0, %xmm7, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %f12(i32x4) -> f32x4 {
+block0(v0: i32x4):
+    v1 = fcvt_from_uint.f32x4 v0
+    return v1
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqa %xmm0, %xmm4
+; pslld %xmm4, $16, %xmm4
+; psrld %xmm4, $16, %xmm4
+; psubd %xmm0, %xmm4, %xmm0
+; cvtdq2ps %xmm4, %xmm9
+; psrld %xmm0, $1, %xmm0
+; cvtdq2ps %xmm0, %xmm0
+; addps %xmm0, %xmm0, %xmm0
+; addps %xmm0, %xmm9, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
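Note on the scalar `fits_in_32` rules in lower.isle: they rely on the fact that
a zero-extended u8/u16/u32 always lands in the non-negative range of an i64, so
the signed cvtsi2ss/cvtsi2sd conversion can neither overflow nor flip the sign.
A minimal Rust sketch of that identity (a standalone illustration, not part of
the patch; the helper name is made up):

    /// Zero-extend to 64 bits, then reuse a signed conversion: every u32 is a
    /// non-negative i64, so the signed conversion agrees with the unsigned one.
    fn u32_to_f32_via_signed(x: u32) -> f32 {
        let wide = x as u64 as i64; // zero-extension; always >= 0
        wide as f32
    }

For example, `u32_to_f32_via_signed(u32::MAX)` equals `u32::MAX as f32`.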
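The `$F64X2` rule implements the 0x1.0p52 trick: `unpcklps` interleaves each
source lane with the constant word 0x43300000, which builds the double
0x1.0p52 + src per lane, and `subpd` then removes the 0x1.0p52 bias. A scalar
Rust sketch of why this yields exactly double(src) (a standalone illustration,
not part of the patch; the function name is made up):

    fn u32_to_f64_via_p52(x: u32) -> f64 {
        // 0x4330_0000_0000_0000 is the bit pattern of 0x1.0p52 (2^52). At this
        // exponent one mantissa ULP is worth exactly 1.0, so OR-ing a u32 into
        // the low mantissa bits produces the double 2^52 + x with no rounding.
        const P52_BITS: u64 = 0x4330_0000_0000_0000;
        let biased = f64::from_bits(P52_BITS | x as u64);
        // The subtraction is also exact, because x itself is representable.
        biased - f64::from_bits(P52_BITS)
    }

For any `x: u32`, `u32_to_f64_via_p52(x)` equals `x as f64`, including the
extremes `0` and `u32::MAX`.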
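The non-AVX-512 `$F32X4` fallback can be modeled one lane at a time in scalar
Rust; the comments map each scalar step to the packed instruction it stands in
for (a standalone illustration, not part of the patch):

    fn u32_to_f32_via_halves(a: u32) -> f32 {
        let a_lo = a & 0xffff; // pslld 16 + psrld 16: keep the low 16 bits
        let a_hi = a - a_lo; // psubd: the high 16 bits, still in place
        let lo_f = a_lo as i32 as f32; // cvtdq2ps: exact, value < 2^16
        // psrld 1: only a zero bit is shifted out (the low 16 bits of a_hi are
        // zero), and the sign bit becomes 0 so cvtdq2ps treats it as positive.
        let hi_f = (a_hi >> 1) as i32 as f32;
        let hi_f = hi_f + hi_f; // addps with itself: undo the halving, exact
        hi_f + lo_f // the final addps is the only step that rounds
    }

    fn main() {
        for a in [0u32, 1, 0xffff, 0x1_0000, 0xdead_beef, u32::MAX] {
            assert_eq!(u32_to_f32_via_halves(a), a as f32);
        }
    }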