Skip to content

Commit

Permalink
x64: lower fcvt_from_uint to VCVTUDQ2PS when possible
Browse files Browse the repository at this point in the history
When AVX512VL and AVX512F are available, use a single instruction
(`VCVTUDQ2PS`) instead of a length 9-instruction sequence. This
optimization is a port from the legacy x86 backend.
  • Loading branch information
abrown committed May 19, 2021
1 parent 3b3b126 commit 54b45d2
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 62 deletions.
5 changes: 5 additions & 0 deletions cranelift/codegen/src/isa/x64/inst/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1000,6 +1000,7 @@ impl fmt::Display for SseOpcode {

#[derive(Clone)]
pub enum Avx512Opcode {
Vcvtudq2ps,
Vpabsq,
Vpmullq,
}
Expand All @@ -1008,6 +1009,9 @@ impl Avx512Opcode {
/// Which `InstructionSet`s support the opcode?
pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> {
match self {
Avx512Opcode::Vcvtudq2ps => {
smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL]
}
Avx512Opcode::Vpabsq => smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL],
Avx512Opcode::Vpmullq => smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512DQ],
}
Expand All @@ -1017,6 +1021,7 @@ impl Avx512Opcode {
impl fmt::Debug for Avx512Opcode {
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
let name = match self {
Avx512Opcode::Vcvtudq2ps => "vcvtudq2ps",
Avx512Opcode::Vpabsq => "vpabsq",
Avx512Opcode::Vpmullq => "vpmullq",
};
Expand Down
11 changes: 6 additions & 5 deletions cranelift/codegen/src/isa/x64/inst/emit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1408,16 +1408,17 @@ pub(crate) fn emit(
}

Inst::XmmUnaryRmREvex { op, src, dst } => {
let opcode = match op {
Avx512Opcode::Vpabsq => 0x1f,
let (prefix, map, w, opcode) = match op {
Avx512Opcode::Vpabsq => (LegacyPrefixes::_66, OpcodeMap::_0F38, true, 0x1f),
Avx512Opcode::Vcvtudq2ps => (LegacyPrefixes::_F2, OpcodeMap::_0F, false, 0x7a),
_ => unimplemented!("Opcode {:?} not implemented", op),
};
match src {
RegMem::Reg { reg: src } => EvexInstruction::new()
.length(EvexVectorLength::V128)
.prefix(LegacyPrefixes::_66)
.map(OpcodeMap::_0F38)
.w(true)
.prefix(prefix)
.map(map)
.w(w)
.opcode(opcode)
.reg(dst.to_reg().get_hw_encoding())
.rm(src.get_hw_encoding())
Expand Down
6 changes: 6 additions & 0 deletions cranelift/codegen/src/isa/x64/inst/emit_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3889,6 +3889,12 @@ fn test_x64_emit() {
"vpabsq %xmm2, %xmm8",
));

insns.push((
Inst::xmm_unary_rm_r_evex(Avx512Opcode::Vcvtudq2ps, RegMem::reg(xmm2), w_xmm8),
"62717F087AC2",
"vcvtudq2ps %xmm2, %xmm8",
));

// Xmm to int conversions, and conversely.

insns.push((
Expand Down
112 changes: 63 additions & 49 deletions cranelift/codegen/src/isa/x64/lower.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4069,64 +4069,78 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
_ => panic!("unexpected input type for FcvtFromUint: {:?}", input_ty),
};
} else {
// Converting packed unsigned integers to packed floats requires a few steps.
// There is no single instruction lowering for converting unsigned floats but there
// is for converting packed signed integers to float (cvtdq2ps). In the steps below
// we isolate the upper half (16 bits) and lower half (16 bits) of each lane and
// then we convert each half separately using cvtdq2ps meant for signed integers.
// In order for this to work for the upper half bits we must shift right by 1
// (divide by 2) these bits in order to ensure the most significant bit is 0 not
// signed, and then after the conversion we double the value. Finally we add the
// converted values where addition will correctly round.
//
// Sequence:
// -> A = 0xffffffff
// -> Ah = 0xffff0000
// -> Al = 0x0000ffff
// -> Convert(Al) // Convert int to float
// -> Ah = Ah >> 1 // Shift right 1 to assure Ah conversion isn't treated as signed
// -> Convert(Ah) // Convert .. with no loss of significant digits from previous shift
// -> Ah = Ah + Ah // Double Ah to account for shift right before the conversion.
// -> dst = Ah + Al // Add the two floats together

assert_eq!(ctx.input_ty(insn, 0), types::I32X4);
let src = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

// Create a temporary register
let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
ctx.emit(Inst::xmm_unary_rm_r(
SseOpcode::Movapd,
RegMem::reg(src),
tmp,
));
ctx.emit(Inst::gen_move(dst, src, ty));
if isa_flags.use_avx512f_simd() || isa_flags.use_avx512vl_simd() {
// When either AVX512VL or AVX512F are available,
// `fcvt_from_uint` can be lowered to a single instruction.
ctx.emit(Inst::xmm_unary_rm_r_evex(
Avx512Opcode::Vcvtudq2ps,
RegMem::reg(src),
dst,
));
} else {
// Converting packed unsigned integers to packed floats
// requires a few steps. There is no single instruction
// lowering for converting unsigned floats but there is for
// converting packed signed integers to float (cvtdq2ps). In
// the steps below we isolate the upper half (16 bits) and
// lower half (16 bits) of each lane and then we convert
// each half separately using cvtdq2ps meant for signed
// integers. In order for this to work for the upper half
// bits we must shift right by 1 (divide by 2) these bits in
// order to ensure the most significant bit is 0 not signed,
// and then after the conversion we double the value.
// Finally we add the converted values where addition will
// correctly round.
//
// Sequence:
// -> A = 0xffffffff
// -> Ah = 0xffff0000
// -> Al = 0x0000ffff
// -> Convert(Al) // Convert int to float
// -> Ah = Ah >> 1 // Shift right 1 to assure Ah conversion isn't treated as signed
// -> Convert(Ah) // Convert .. with no loss of significant digits from previous shift
// -> Ah = Ah + Ah // Double Ah to account for shift right before the conversion.
// -> dst = Ah + Al // Add the two floats together

// Create a temporary register
let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
ctx.emit(Inst::xmm_unary_rm_r(
SseOpcode::Movapd,
RegMem::reg(src),
tmp,
));
ctx.emit(Inst::gen_move(dst, src, ty));

// Get the low 16 bits
ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Pslld, RegMemImm::imm(16), tmp));
ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(16), tmp));
// Get the low 16 bits
ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Pslld, RegMemImm::imm(16), tmp));
ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(16), tmp));

// Get the high 16 bits
ctx.emit(Inst::xmm_rm_r(SseOpcode::Psubd, RegMem::from(tmp), dst));
// Get the high 16 bits
ctx.emit(Inst::xmm_rm_r(SseOpcode::Psubd, RegMem::from(tmp), dst));

// Convert the low 16 bits
ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(tmp), tmp));
// Convert the low 16 bits
ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(tmp), tmp));

// Shift the high bits by 1, convert, and double to get the correct value.
ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), dst));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(dst), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Addps,
RegMem::reg(dst.to_reg()),
dst,
));
// Shift the high bits by 1, convert, and double to get the correct value.
ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), dst));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(dst), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Addps,
RegMem::reg(dst.to_reg()),
dst,
));

// Add together the two converted values.
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Addps,
RegMem::reg(tmp.to_reg()),
dst,
));
// Add together the two converted values.
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Addps,
RegMem::reg(tmp.to_reg()),
dst,
));
}
}
}

Expand Down
20 changes: 12 additions & 8 deletions cranelift/filetests/filetests/isa/x64/simd-conversion-run.clif
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,21 @@ test run
set enable_simd
target x86_64 machinst

function %fcvt_from_sint() -> b1 {
block0:
v0 = vconst.i32x4 [-1 0 1 123456789]
function %fcvt_from_sint(i32x4) -> f32x4 {
block0(v0: i32x4):
v1 = fcvt_from_sint.f32x4 v0
return v1
}
; run: %fcvt_from_sint([-1 0 1 123456789]) == [-0x1.0 0.0 0x1.0 0x75bcd18.0]
; Note that 123456789 rounds to 123456792.0, an error of 3

v2 = vconst.f32x4 [-0x1.0 0.0 0x1.0 0x75bcd18.0] ; 123456789 rounds to 123456792.0, an error of 3
v3 = fcmp eq v1, v2
v4 = vall_true v3
return v4
function %fcvt_from_uint(i32x4) -> f32x4 {
block0(v0: i32x4):
v1 = fcvt_from_uint.f32x4 v0
return v1
}
; run
; run: %fcvt_from_uint([0xFFFFFFFF 0 1 123456789]) == [0x100000000.0 0.0 0x1.0 0x75bcd18.0]
; Note that 0xFFFFFFFF is decimal 4,294,967,295 and is rounded up 1 to 4,294,967,296 in f32x4.

function %fcvt_to_sint_sat(f32x4) -> i32x4 {
block0(v0:f32x4):
Expand Down

0 comments on commit 54b45d2

Please sign in to comment.