Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

x64: Lower fcvt_from_uint in ISLE #4684

Merged
merged 9 commits into from
Aug 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions cranelift/codegen/src/isa/x64/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -1658,6 +1658,10 @@
(rule (x64_movdqu from)
(xmm_unary_rm_r (SseOpcode.Movdqu) from))

;; Helper for creating `movapd` instructions.
(decl x64_movapd (XmmMem) Xmm)
(rule (x64_movapd from)
      (xmm_unary_rm_r (SseOpcode.Movapd) from))

;; Helper for creating `pmovsxbw` instructions.
(decl x64_pmovsxbw (XmmMem) Xmm)
(rule (x64_pmovsxbw src)
      (xmm_unary_rm_r (SseOpcode.Pmovsxbw) src))
Expand Down Expand Up @@ -2272,6 +2276,11 @@
(rule (x64_punpcklwd src1 src2)
(xmm_rm_r $I16X8 (SseOpcode.Punpcklwd) src1 src2))

;; Helper for creating `unpcklps` instructions.
;;
;; Emits an `Unpcklps` through `xmm_rm_r` with `src1` and `src2` as operands
;; and returns the resulting XMM register.
;;
;; NOTE(review): the type passed to `xmm_rm_r` is `$I16X8`, the same value the
;; `punpcklwd` helper uses, although `unpcklps` works on 32-bit float lanes
;; (`$F32X4` would be the natural choice). Presumably the type does not affect
;; the emitted instruction -- confirm against `xmm_rm_r` before changing it.
(decl x64_unpcklps (Xmm XmmMem) Xmm)
(rule (x64_unpcklps src1 src2)
(xmm_rm_r $I16X8 (SseOpcode.Unpcklps) src1 src2))

;; Helper for creating `andnps` instructions.
(decl x64_andnps (Xmm XmmMem) Xmm)
(rule (x64_andnps src1 src2)
Expand Down Expand Up @@ -2624,6 +2633,11 @@
(_ Unit (emit (MInst.XmmUnaryRmREvex op src dst))))
dst))

;; Helper for creating `vcvtudq2ps` instructions. The instruction is emitted
;; in its EVEX form via `xmm_unary_rm_r_evex`.
(decl x64_vcvtudq2ps (XmmMem) Xmm)
(rule (x64_vcvtudq2ps from)
      (xmm_unary_rm_r_evex (Avx512Opcode.Vcvtudq2ps) from))

;; Helper for creating `vpabsq` instructions.
(decl x64_vpabsq (XmmMem) Xmm)
(rule (x64_vpabsq src)
Expand Down Expand Up @@ -3014,6 +3028,23 @@
(_ Unit (emit (MInst.GprToXmm (SseOpcode.Cvtsi2sd) x dst size))))
dst))

;; Convert the 64-bit unsigned integer in `src` to a float by emitting the
;; `CvtUint64ToFloatSeq` pseudo-instruction, and return the XMM register that
;; holds the result. The operand size handed to the instruction is derived
;; from `ty` with `raw_operand_size_of_type`.
(decl cvt_u64_to_float_seq (Type Gpr) Xmm)
(rule (cvt_u64_to_float_seq ty src)
(let ((size OperandSize (raw_operand_size_of_type ty))
;; The instruction takes its source as a writable register, so it is given a
;; fresh copy rather than the caller's `src` (presumably because the sequence
;; modifies its source -- confirm against the emission code).
(src_copy WritableGpr (temp_writable_gpr))
(dst WritableXmm (temp_writable_xmm))
;; Two additional temporary GPRs that are passed to the instruction.
(tmp_gpr1 WritableGpr (temp_writable_gpr))
(tmp_gpr2 WritableGpr (temp_writable_gpr))
;; The copy must be emitted before the conversion sequence that reads it.
(_ Unit (emit (gen_move $I64 src_copy src)))
(_ Unit (emit (MInst.CvtUint64ToFloatSeq size src_copy dst tmp_gpr1 tmp_gpr2))))
dst))

;; Constant used by the `$F64X2` lowering of `fcvt_from_uint` in `lower.isle`,
;; where it is loaded with `x64_xmm_load_const` and passed as the second
;; operand of `unpcklps`. The value itself is produced outside ISLE by the
;; external constructor of the same name.
(decl fcvt_uint_mask_const () VCodeConstant)
(extern constructor fcvt_uint_mask_const fcvt_uint_mask_const)

;; Constant used by the `$F64X2` lowering of `fcvt_from_uint` in `lower.isle`,
;; where it is loaded with `x64_xmm_load_const` and subtracted with `subpd`
;; from the `unpcklps` result. The value itself is produced outside ISLE by
;; the external constructor of the same name.
(decl fcvt_uint_mask_high_const () VCodeConstant)
(extern constructor fcvt_uint_mask_high_const fcvt_uint_mask_high_const)

;; Helpers for creating `pcmpeq*` instructions.
(decl x64_pcmpeq (Type Xmm XmmMem) Xmm)
(rule (x64_pcmpeq $I8X16 x y) (x64_pcmpeqb x y))
Expand Down
10 changes: 10 additions & 0 deletions cranelift/codegen/src/isa/x64/inst/emit_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,16 @@ impl Inst {
dst: WritableGpr::from_writable_reg(src).unwrap(),
}
}

/// Builds an [`Inst::XmmUnaryRmREvex`] with opcode `op` that reads `src` and
/// writes `dst`.
///
/// # Panics
///
/// `src` is asserted to be of the float register class, `dst` is checked in
/// debug builds, and the conversions to `XmmMem` / `WritableXmm` are unwrapped.
fn xmm_unary_rm_r_evex(op: Avx512Opcode, src: RegMem, dst: Writable<Reg>) -> Inst {
    src.assert_regclass_is(RegClass::Float);
    debug_assert!(dst.to_reg().class() == RegClass::Float);

    // Convert to the strongly typed operand wrappers, source first.
    let src = XmmMem::new(src).unwrap();
    let dst = WritableXmm::from_writable_reg(dst).unwrap();
    Inst::XmmUnaryRmREvex { op, src, dst }
}
}

#[test]
Expand Down
31 changes: 0 additions & 31 deletions cranelift/codegen/src/isa/x64/inst/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -307,16 +307,6 @@ impl Inst {
}
}

/// Builds an [`Inst::XmmUnaryRmREvex`] with opcode `op` that reads `src` and
/// writes `dst`.
///
/// # Panics
///
/// `src` is asserted to be of the float register class, `dst` is checked in
/// debug builds, and the conversions to `XmmMem` / `WritableXmm` are unwrapped.
pub(crate) fn xmm_unary_rm_r_evex(op: Avx512Opcode, src: RegMem, dst: Writable<Reg>) -> Inst {
    src.assert_regclass_is(RegClass::Float);
    debug_assert!(dst.to_reg().class() == RegClass::Float);

    // Convert to the strongly typed operand wrappers, source first.
    let src = XmmMem::new(src).unwrap();
    let dst = WritableXmm::from_writable_reg(dst).unwrap();
    Inst::XmmUnaryRmREvex { op, src, dst }
}

pub(crate) fn xmm_rm_r(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Self {
src.assert_regclass_is(RegClass::Float);
debug_assert!(dst.to_reg().class() == RegClass::Float);
Expand Down Expand Up @@ -417,27 +407,6 @@ impl Inst {
Inst::XmmCmpRmR { op, src, dst }
}

/// Builds an [`Inst::CvtUint64ToFloatSeq`] from untyped writable registers.
///
/// `src`, `tmp_gpr1` and `tmp_gpr2` must be integer-class registers and `dst`
/// a float-class register; `dst_size` must be `Size32` or `Size64`. These
/// requirements are checked with `debug_assert!`.
///
/// # Panics
///
/// Panics if converting any register to its typed wrapper (`WritableGpr` /
/// `WritableXmm`) fails, since each conversion is unwrapped.
pub(crate) fn cvt_u64_to_float_seq(
    dst_size: OperandSize,
    src: Writable<Reg>,
    tmp_gpr1: Writable<Reg>,
    tmp_gpr2: Writable<Reg>,
    dst: Writable<Reg>,
) -> Inst {
    debug_assert!(dst_size.is_one_of(&[OperandSize::Size32, OperandSize::Size64]));

    // Register-class sanity checks for every operand.
    debug_assert!(src.to_reg().class() == RegClass::Int);
    debug_assert!(tmp_gpr1.to_reg().class() == RegClass::Int);
    debug_assert!(tmp_gpr2.to_reg().class() == RegClass::Int);
    debug_assert!(dst.to_reg().class() == RegClass::Float);

    // Wrap the operands in the same order the struct fields are listed.
    let src = WritableGpr::from_writable_reg(src).unwrap();
    let dst = WritableXmm::from_writable_reg(dst).unwrap();
    let tmp_gpr1 = WritableGpr::from_writable_reg(tmp_gpr1).unwrap();
    let tmp_gpr2 = WritableGpr::from_writable_reg(tmp_gpr2).unwrap();

    Inst::CvtUint64ToFloatSeq {
        src,
        dst,
        tmp_gpr1,
        tmp_gpr2,
        dst_size,
    }
}

pub(crate) fn cvt_float_to_sint_seq(
src_size: OperandSize,
dst_size: OperandSize,
Expand Down
73 changes: 73 additions & 0 deletions cranelift/codegen/src/isa/x64/lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -3013,3 +3013,76 @@

;; Lower `fcvt_low_from_sint` to a single `x64_cvtdq2pd` of the operand `a`;
;; `ty`, the type of the input value, is passed along to the helper.
(rule (lower (fcvt_low_from_sint a @ (value_type ty)))
(x64_cvtdq2pd ty a))

;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Integer sources that fit in 32 bits, `$F32` result: zero-extend the value
;; to 64 bits and use the signed 64-bit `cvtsi2ss` conversion. After zero
;; extension the value is never negative as a 64-bit integer, so the signed
;; conversion produces the unsigned result.
(rule (lower (has_type $F32 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty))))))
(x64_cvtsi2ss $I64 (extend_to_gpr val $I64 (ExtendKind.Zero))))

;; Integer sources that fit in 32 bits, `$F64` result: zero-extend the value
;; to 64 bits and use the signed 64-bit `cvtsi2sd` conversion. After zero
;; extension the value is never negative as a 64-bit integer, so the signed
;; conversion produces the unsigned result.
(rule (lower (has_type $F64 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty))))))
(x64_cvtsi2sd $I64 (extend_to_gpr val $I64 (ExtendKind.Zero))))

;; `$I64` sources cannot use the zero-extend trick above, so they go through
;; the `cvt_u64_to_float_seq` helper; `ty` is the result type of the
;; conversion.
(rule (lower (has_type ty (fcvt_from_uint val @ (value_type $I64))))
(cvt_u64_to_float_seq ty val))

;; This rule only matches when the operand of `fcvt_from_uint` is a
;; `uwiden_low` of an `$I32X4` value and the result type is `$F64X2`.
;;
;; The algorithm uses `unpcklps` to help create a float that is equivalent to
;; 0x1.0p52 + double(src). 0x1.0p52 is unique because at this exponent
;; every value of the mantissa represents a corresponding uint32 number.
;; When we subtract 0x1.0p52 we are left with double(src).
(rule (lower (has_type $F64X2 (fcvt_from_uint (uwiden_low val @ (value_type $I32X4)))))
(let ((uint_mask Xmm (x64_xmm_load_const $I32X4 (fcvt_uint_mask_const)))
;; Interleave the low lanes of `val` with the lanes of the mask constant.
(res Xmm (x64_unpcklps val uint_mask))
(uint_mask_high Xmm (x64_xmm_load_const $I32X4 (fcvt_uint_mask_high_const))))
;; Subtract the second constant to leave only the converted value.
(x64_subpd res uint_mask_high)))

;; When both AVX512VL and AVX512F are available, `fcvt_from_uint` producing
;; `$F32X4` can be lowered to a single `vcvtudq2ps` instruction.
;;
;; NOTE: the priority of 1 here is to break ties with the next case for
;; `$F32X4`, which matches regardless of whether the avx512 extensions are
;; enabled; without the priority both rules would apply to the same input.
(rule 1 (lower (has_type (and (avx512vl_enabled) (avx512f_enabled) $F32X4)
(fcvt_from_uint src)))
(x64_vcvtudq2ps src))

;; Converting packed unsigned integers to packed floats
;; requires a few steps. There is no single instruction
;; lowering for converting packed unsigned integers to floats,
;; but there is one for converting packed signed integers to
;; floats (cvtdq2ps). In the steps below we isolate the upper
;; half (16 bits) and lower half (16 bits) of each lane and
;; then convert each half separately using cvtdq2ps, which is
;; meant for signed integers. For this to work for the upper
;; half we must first shift those bits right by 1 (divide by
;; 2) so that the most significant bit is 0 and the value is
;; not interpreted as negative; after the conversion we double
;; the value to undo the shift. Finally we add the converted
;; values, where the addition will correctly round.
;;
;; Sequence:
;; -> A = 0xffffffff
;; -> Ah = 0xffff0000
;; -> Al = 0x0000ffff
;; -> Convert(Al) // Convert int to float
;; -> Ah = Ah >> 1 // Shift right 1 to assure Ah conversion isn't treated as signed
;; -> Convert(Ah) // Convert .. with no loss of significant digits from previous shift
;; -> Ah = Ah + Ah // Double Ah to account for shift right before the conversion.
;; -> dst = Ah + Al // Add the two floats together
(rule (lower (has_type $F32X4 (fcvt_from_uint a)))
(let (;; get the low 16 bits of each lane: shift left then logically right by
;; 16 to clear the upper half
(a_lo Xmm (x64_pslld a (RegMemImm.Imm 16)))
(a_lo Xmm (x64_psrld a_lo (RegMemImm.Imm 16)))

;; get the high 16 bits by subtracting the low half from the original
(a_hi Xmm (x64_psubd a a_lo))

;; convert the low 16 bits
(a_lo Xmm (x64_cvtdq2ps a_lo))

;; shift the high bits by 1, convert, and double to get the correct
;; value
(a_hi Xmm (x64_psrld a_hi (RegMemImm.Imm 1)))
(a_hi Xmm (x64_cvtdq2ps a_hi))
(a_hi Xmm (x64_addps a_hi a_hi)))

;; add together the two converted values
(x64_addps a_hi a_lo)))
Loading