bytecodealliance · cfallin · Aug 11, 2022 · Aug 9, 2022 · Aug 9, 2022 · Aug 9, 2022
@@ -1658,6 +1658,10 @@
 (rule (x64_movdqu from)
       (xmm_unary_rm_r (SseOpcode.Movdqu) from))
 
+(decl x64_movapd (XmmMem) Xmm) ;; Helper for creating `movapd` instructions.
+(rule (x64_movapd from)
+      (xmm_unary_rm_r (SseOpcode.Movapd) from))
+
 (decl x64_pmovsxbw (XmmMem) Xmm)
 (rule (x64_pmovsxbw from)
       (xmm_unary_rm_r (SseOpcode.Pmovsxbw) from))
@@ -2272,6 +2276,11 @@
 (rule (x64_punpcklwd src1 src2)
       (xmm_rm_r $I16X8 (SseOpcode.Punpcklwd) src1 src2))
 
+;; Helper for creating `unpcklps` instructions (packed single-precision lanes).
+(decl x64_unpcklps (Xmm XmmMem) Xmm)
+(rule (x64_unpcklps src1 src2)
+      (xmm_rm_r $F32X4 (SseOpcode.Unpcklps) src1 src2))
+
 ;; Helper for creating `andnps` instructions.
 (decl x64_andnps (Xmm XmmMem) Xmm)
 (rule (x64_andnps src1 src2)
@@ -2624,6 +2633,11 @@
             (_ Unit (emit (MInst.XmmUnaryRmREvex op src dst))))
         dst))
 
+;; Helper for creating EVEX-encoded `vcvtudq2ps` instructions.
+(decl x64_vcvtudq2ps (XmmMem) Xmm)
+(rule (x64_vcvtudq2ps src)
+      (xmm_unary_rm_r_evex (Avx512Opcode.Vcvtudq2ps) src))
+
 ;; Helper for creating `vpabsq` instructions.
 (decl x64_vpabsq (XmmMem) Xmm)
 (rule (x64_vpabsq src)
@@ -3014,6 +3028,23 @@
             (_ Unit (emit (MInst.GprToXmm (SseOpcode.Cvtsi2sd) x dst size))))
         dst))
 
+(decl cvt_u64_to_float_seq (Type Gpr) Xmm) ;; Emits `CvtUint64ToFloatSeq` for `src`; returns the result XMM.
+(rule (cvt_u64_to_float_seq ty src)
+      (let ((size OperandSize (raw_operand_size_of_type ty)) ;; operand size taken from `ty`
+            (src_copy WritableGpr (temp_writable_gpr))
+            (dst WritableXmm (temp_writable_xmm))
+            (tmp_gpr1 WritableGpr (temp_writable_gpr)) ;; temporaries handed to the sequence
+            (tmp_gpr2 WritableGpr (temp_writable_gpr))
+            (_ Unit (emit (gen_move $I64 src_copy src))) ;; the inst takes a writable source, so pass it a fresh copy
+            (_ Unit (emit (MInst.CvtUint64ToFloatSeq size src_copy dst tmp_gpr1 tmp_gpr2))))
+        dst))
+
+(decl fcvt_uint_mask_const () VCodeConstant) ;; constant for the `$F64X2` `fcvt_from_uint` lowering
+(extern constructor fcvt_uint_mask_const fcvt_uint_mask_const) ;; value is defined by the external implementation
+
+(decl fcvt_uint_mask_high_const () VCodeConstant) ;; second constant for the same lowering
+(extern constructor fcvt_uint_mask_high_const fcvt_uint_mask_high_const) ;; value is defined by the external implementation
+
 ;; Helpers for creating `pcmpeq*` instructions.
 (decl x64_pcmpeq (Type Xmm XmmMem) Xmm)
 (rule (x64_pcmpeq $I8X16 x y) (x64_pcmpeqb x y))

@@ -26,6 +26,16 @@ impl Inst {
             dst: WritableGpr::from_writable_reg(src).unwrap(),
         }
     }
+
+    /// Builds an `Inst::XmmUnaryRmREvex` from untyped operands; the `unwrap`s
+    /// panic if `src`/`dst` cannot be converted to `XmmMem`/`WritableXmm`.
+    fn xmm_unary_rm_r_evex(op: Avx512Opcode, src: RegMem, dst: Writable<Reg>) -> Inst {
+        src.assert_regclass_is(RegClass::Float);
+        debug_assert!(dst.to_reg().class() == RegClass::Float);
+        let src = XmmMem::new(src).unwrap();
+        let dst = WritableXmm::from_writable_reg(dst).unwrap();
+        Inst::XmmUnaryRmREvex { op, src, dst }
+    }
 }
 
 #[test]

@@ -307,16 +307,6 @@ impl Inst {
         }
     }
 
-    pub(crate) fn xmm_unary_rm_r_evex(op: Avx512Opcode, src: RegMem, dst: Writable<Reg>) -> Inst {
-        src.assert_regclass_is(RegClass::Float);
-        debug_assert!(dst.to_reg().class() == RegClass::Float);
-        Inst::XmmUnaryRmREvex {
-            op,
-            src: XmmMem::new(src).unwrap(),
-            dst: WritableXmm::from_writable_reg(dst).unwrap(),
-        }
-    }
-
     pub(crate) fn xmm_rm_r(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Self {
         src.assert_regclass_is(RegClass::Float);
         debug_assert!(dst.to_reg().class() == RegClass::Float);
@@ -417,27 +407,6 @@ impl Inst {
         Inst::XmmCmpRmR { op, src, dst }
     }
 
-    pub(crate) fn cvt_u64_to_float_seq(
-        dst_size: OperandSize,
-        src: Writable<Reg>,
-        tmp_gpr1: Writable<Reg>,
-        tmp_gpr2: Writable<Reg>,
-        dst: Writable<Reg>,
-    ) -> Inst {
-        debug_assert!(dst_size.is_one_of(&[OperandSize::Size32, OperandSize::Size64]));
-        debug_assert!(src.to_reg().class() == RegClass::Int);
-        debug_assert!(tmp_gpr1.to_reg().class() == RegClass::Int);
-        debug_assert!(tmp_gpr2.to_reg().class() == RegClass::Int);
-        debug_assert!(dst.to_reg().class() == RegClass::Float);
-        Inst::CvtUint64ToFloatSeq {
-            src: WritableGpr::from_writable_reg(src).unwrap(),
-            dst: WritableXmm::from_writable_reg(dst).unwrap(),
-            tmp_gpr1: WritableGpr::from_writable_reg(tmp_gpr1).unwrap(),
-            tmp_gpr2: WritableGpr::from_writable_reg(tmp_gpr2).unwrap(),
-            dst_size,
-        }
-    }
-
     pub(crate) fn cvt_float_to_sint_seq(
         src_size: OperandSize,
         dst_size: OperandSize,

@@ -3013,3 +3013,76 @@
 
 (rule (lower (fcvt_low_from_sint a @ (value_type ty)))
       (x64_cvtdq2pd ty a))
+
+;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (has_type $F32 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty))))))
+      (x64_cvtsi2ss $I64 (extend_to_gpr val $I64 (ExtendKind.Zero)))) ;; zero-extend to 64 bits, then convert as $I64
+
+(rule (lower (has_type $F64 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty))))))
+      (x64_cvtsi2sd $I64 (extend_to_gpr val $I64 (ExtendKind.Zero)))) ;; same approach, producing $F64
+
+(rule (lower (has_type ty (fcvt_from_uint val @ (value_type $I64))))
+      (cvt_u64_to_float_seq ty val)) ;; $I64 inputs go through the dedicated conversion sequence
+
+;; The algorithm uses `unpcklps` to help build, in each lane, a double equal to
+;; 0x1.0p52 + double(src). 0x1.0p52 is special because at this exponent every
+;; mantissa value represents the corresponding uint32 number exactly, so
+;; subtracting 0x1.0p52 afterwards leaves just double(src).
+(rule (lower (has_type $F64X2 (fcvt_from_uint (uwiden_low val @ (value_type $I32X4)))))
+      (let ((uint_mask Xmm (x64_xmm_load_const $I32X4 (fcvt_uint_mask_const)))
+            (res Xmm (x64_unpcklps val uint_mask))
+            (uint_mask_high Xmm (x64_xmm_load_const $I32X4 (fcvt_uint_mask_high_const))))
+        (x64_subpd res uint_mask_high)))
+
+;; When AVX512VL and AVX512F are available,
+;; `fcvt_from_uint` can be lowered to a single instruction.
+;;
+;; NOTE: the priority of 1 here breaks the tie with the next rule for $F32X4,
+;; which matches without requiring either of the avx512 extensions to be enabled.
+(rule 1 (lower (has_type (and (avx512vl_enabled) (avx512f_enabled) $F32X4)
+                         (fcvt_from_uint src)))
+      (x64_vcvtudq2ps src))
+
+;; Converting packed unsigned integers to packed floats
+;; requires a few steps. There is no single-instruction
+;; lowering for packed unsigned integers (without AVX-512), but
+;; there is one for packed signed integers (cvtdq2ps). In
+;; the steps below we isolate the upper half (16 bits) and
+;; lower half (16 bits) of each lane and then convert each
+;; half separately using cvtdq2ps, which is meant for signed
+;; integers. For this to work on the upper half we must first
+;; shift it right by 1 (divide by 2) so that the most
+;; significant bit is 0 and the value is not treated as
+;; negative; after the conversion we double the result.
+;; Finally we add the two converted values, and that addition
+;; rounds correctly.
+;;
+;; Sequence:
+;; -> A = 0xffffffff
+;; -> Ah = 0xffff0000
+;; -> Al = 0x0000ffff
+;; -> Convert(Al) // Convert int to float
+;; -> Ah = Ah >> 1 // Shift right 1 to ensure Ah conversion isn't treated as signed
+;; -> Convert(Ah) // Convert .. with no loss of significant digits from previous shift
+;; -> Ah = Ah + Ah // Double Ah to account for shift right before the conversion.
+;; -> dst = Ah + Al // Add the two floats together
+(rule (lower (has_type $F32X4 (fcvt_from_uint a)))
+      (let (;; get the low 16 bits of each lane (shift left, then right, by 16)
+            (a_lo Xmm (x64_pslld a (RegMemImm.Imm 16)))
+            (a_lo Xmm (x64_psrld a_lo (RegMemImm.Imm 16)))
+
+            ;; get the high 16 bits by subtracting the low half from `a`
+            (a_hi Xmm (x64_psubd a a_lo))
+
+            ;; convert the low 16 bits
+            (a_lo Xmm (x64_cvtdq2ps a_lo))
+
+            ;; shift the high bits by 1, convert, and double to get the correct
+            ;; value
+            (a_hi Xmm (x64_psrld a_hi (RegMemImm.Imm 1)))
+            (a_hi Xmm (x64_cvtdq2ps a_hi))
+            (a_hi Xmm (x64_addps a_hi a_hi)))
+
+        ;; add together the two converted values
+        (x64_addps a_hi a_lo)))