Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement fcvt_to_uint_sat (f32x4 -> i32x4) for x86 #1822

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -197,8 +197,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
("simd", _) if target.contains("aarch64") => return true,

("simd", "simd_conversions") => return true, // FIXME Unsupported feature: proposed SIMD operator I32x4TruncSatF32x4S
("simd", "simd_load") => return true, // FIXME Unsupported feature: proposed SIMD operator I32x4TruncSatF32x4S
("simd", "simd_splat") => return true, // FIXME Unsupported feature: proposed SIMD operator I32x4TruncSatF32x4S

// Still working on implementing these. See #929.
("reference_types", "global")
Expand Down
7 changes: 7 additions & 0 deletions cranelift/codegen/meta/src/isa/x86/encodings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1704,6 +1704,7 @@ fn define_simd(
let x86_ptest = x86.by_name("x86_ptest");
let x86_punpckh = x86.by_name("x86_punpckh");
let x86_punpckl = x86.by_name("x86_punpckl");
let x86_vcvtps2udq = x86.by_name("x86_vcvtps2udq");
let x86_vcvtudq2ps = x86.by_name("x86_vcvtudq2ps");

// Shorthands for recipes.
Expand Down Expand Up @@ -1951,6 +1952,12 @@ fn define_simd(
Some(use_avx512vl_simd), // TODO need an OR predicate to join with AVX512F
);

e.enc_32_64_maybe_isap(
x86_vcvtps2udq,
rec_evex_reg_rm_128.opcodes(&VCVTPS2UDQ),
Some(use_avx512vl_simd), // TODO need an OR predicate to join with AVX512F
);

e.enc_both_inferred(
x86_cvtt2si
.bind(vector(I32, sse_vector_size))
Expand Down
25 changes: 23 additions & 2 deletions cranelift/codegen/meta/src/isa/x86/instructions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,29 @@ pub(crate) fn define(
r#"
Convert unsigned integer to floating point.

Convert packed doubleword unsigned integers to packed single-precision floating-point
values. This instruction does not trap.
Convert packed doubleword unsigned integers to packed single-precision floating-point
values. When a conversion is inexact, the value returned is rounded according to the
rounding control bits in the MXCSR register or the embedded rounding control bits. This
instruction does not trap.
"#,
&formats.unary,
)
.operands_in(vec![x])
.operands_out(vec![a]),
);

let x = &Operand::new("x", f32x4);
let a = &Operand::new("a", i32x4);
ig.push(
Inst::new(
"x86_vcvtps2udq",
r#"
Convert floating point to unsigned integer.

Convert packed single-precision floating-point to packed doubleword unsigned integers
values. When a conversion is inexact, the value returned is rounded according to the
rounding control bits in the MXCSR register or the embedded rounding control bits. This
instruction does not trap.
"#,
&formats.unary,
)
Expand Down
2 changes: 2 additions & 0 deletions cranelift/codegen/meta/src/isa/x86/legalize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,7 @@ fn define_simd(
let fcmp = insts.by_name("fcmp");
let fcvt_from_uint = insts.by_name("fcvt_from_uint");
let fcvt_to_sint_sat = insts.by_name("fcvt_to_sint_sat");
let fcvt_to_uint_sat = insts.by_name("fcvt_to_uint_sat");
let fmax = insts.by_name("fmax");
let fmin = insts.by_name("fmin");
let fneg = insts.by_name("fneg");
Expand Down Expand Up @@ -797,4 +798,5 @@ fn define_simd(

narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector");
narrow_avx.custom_legalize(fcvt_to_uint_sat, "expand_fcvt_to_uint_sat_vector");
}
2 changes: 2 additions & 0 deletions cranelift/codegen/meta/src/isa/x86/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
x86_32.legalize_value_type(ReferenceType(R32), x86_expand);
x86_32.legalize_type(F32, x86_expand);
x86_32.legalize_type(F64, x86_expand);
x86_32.legalize_value_type(VectorType::new(I32.into(), 4), x86_narrow_avx);
x86_32.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx);
x86_32.legalize_value_type(VectorType::new(F32.into(), 4), x86_narrow_avx);

Expand All @@ -60,6 +61,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
x86_64.legalize_value_type(ReferenceType(R64), x86_expand);
x86_64.legalize_type(F32, x86_expand);
x86_64.legalize_type(F64, x86_expand);
x86_64.legalize_value_type(VectorType::new(I32.into(), 4), x86_narrow_avx);
x86_64.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx);
x86_64.legalize_value_type(VectorType::new(F32.into(), 4), x86_narrow_avx);

Expand Down
6 changes: 6 additions & 0 deletions cranelift/codegen/meta/src/isa/x86/opcodes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -684,6 +684,12 @@ pub static UCOMISS: [u8; 2] = [0x0f, 0x2e];
/// Raise invalid opcode instruction.
pub static UNDEFINED2: [u8; 2] = [0x0f, 0x0b];

/// Convert four packed single precision floating-point values from xmm2/m128/m32bcst to four packed
/// unsigned doubleword values in xmm1 using truncation subject to writemask k1. Rounding behavior
/// is controlled by MXCSR but can be overriden by EVEX.L'L in static rounding mode (AVX512VL,
/// AVX512F).
pub static VCVTPS2UDQ: [u8; 2] = [0x0f, 0x79];
abrown marked this conversation as resolved.
Show resolved Hide resolved

/// Convert four packed unsigned doubleword integers from xmm2/m128/m32bcst to packed
/// single-precision floating-point values in xmm1 with writemask k1. Rounding behavior
/// is controlled by MXCSR but can be overriden by EVEX.L'L in static rounding mode
Expand Down
1 change: 1 addition & 0 deletions cranelift/codegen/src/isa/aarch64/lower_inst.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2063,6 +2063,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
| Opcode::X86Packss
| Opcode::X86Punpckh
| Opcode::X86Punpckl
| Opcode::X86Vcvtps2udq
| Opcode::X86Vcvtudq2ps
| Opcode::X86ElfTlsGetAddr
| Opcode::X86MachoTlsGetAddr => {
Expand Down
83 changes: 83 additions & 0 deletions cranelift/codegen/src/isa/x86/enc_tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1313,6 +1313,89 @@ fn expand_fcvt_to_uint_sat(
cfg.recompute_block(pos.func, done);
}

// Lanes of an I32x4 filled with the max signed integer values converted to an F32x4.
static MAX_SIGNED_I32X4S_AS_F32X4S: [u8; 16] = [
0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x4f,
];

/// This legalization converts a vector of 32-bit floating point lanes to unsigned integer lanes
/// using VCVTPS2UDQ if AVX512 is available or to a longer sequence of NaN quieting and truncation
/// otherwise. This logic is separate from [expand_fcvt_to_uint_sat] above (the scalar version),
abrown marked this conversation as resolved.
Show resolved Hide resolved
/// only due to how the transform groups are set up; TODO if we change the SIMD legalization groups,
/// then this logic could be merged into [expand_fcvt_to_uint_sat] (see
/// https://github.com/bytecodealliance/wasmtime/issues/1745).
fn expand_fcvt_to_uint_sat_vector(
inst: ir::Inst,
func: &mut ir::Function,
_cfg: &mut ControlFlowGraph,
isa: &dyn TargetIsa,
) {
let mut pos = FuncCursor::new(func).at_inst(inst);
pos.use_srcloc(inst);

if let ir::InstructionData::Unary {
opcode: ir::Opcode::FcvtToUintSat,
arg,
} = pos.func.dfg[inst]
{
let controlling_type = pos.func.dfg.ctrl_typevar(inst);
if controlling_type == I32X4 {
debug_assert_eq!(pos.func.dfg.value_type(arg), F32X4);
let x86_isa = isa
.as_any()
.downcast_ref::<isa::x86::Isa>()
.expect("the target ISA must be x86 at this point");
if x86_isa.isa_flags.use_avx512vl_simd() || x86_isa.isa_flags.use_avx512f_simd() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are the rounding semantics of the two alternative implementations equivalent? For the short case, vcvtps2udq, the Intel manual says "When a conversion is inexact, the value returned is rounded according to the rounding control bits in the MXCSR register or the embedded rounding control bits." The slow path appears to use cvtt2si (presumably, really, cvttps2pi), and all the tt variant conversion insns are round-towards-zero: "When a conversion is inexact, a truncated (round toward zero) result is returned."

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a good point: I had hoped MXCSR would have the right rounding mode but it does not. I added a run filetest that passes with the long sequence and fails with the AVX512 sequence: it looks like the machines running the CI have AVX512 and therefore fail. I started playing around with a new recipe that embeds the rounding control (e.g. EvexContext::RoundingRegToRegFP { rc: EvexRoundingControl::RZ } but this didn't immediately fix the test case (the emitted bytes aren't exactly right) so I will have to play around with this a bit more.

// If certain AVX512 features are available, we can emit a single instruction.
pos.func.dfg.replace(inst).x86_vcvtps2udq(arg);
} else {
// Otherwise, we must both quiet any NaNs--setting that lane to 0--and saturate any
// lanes that might overflow during conversion to the highest/lowest integer
// allowed in that lane.
let zeroes_constant = pos.func.dfg.constants.insert(vec![0x00; 16].into());
let max_signed_constant = pos
.func
.dfg
.constants
.insert(MAX_SIGNED_I32X4S_AS_F32X4S.as_ref().into());
let zeroes = pos.ins().vconst(F32X4, zeroes_constant);
let max_signed = pos.ins().vconst(F32X4, max_signed_constant);
// Clamp the input to 0 for negative floating point numbers. TODO we need to
// convert NaNs to 0 but this doesn't do that?
let ge_zero = pos.ins().x86_fmax(arg, zeroes);
// Find lanes that exceed the max signed value that CVTTPS2DQ knows how to convert.
// For floating point numbers above this, CVTTPS2DQ returns the undefined value
// 0x80000000.
let minus_max_signed = pos.ins().fsub(ge_zero, max_signed);
let le_max_signed =
pos.ins()
.fcmp(FloatCC::LessThanOrEqual, max_signed, minus_max_signed);
// Identify lanes that have minus_max_signed > max_signed || minus_max_signed < 0.
// These lanes have the MSB set to 1 after the XOR. We are trying to calculate a
// valid, in-range addend.
let minus_max_signed_as_int = pos.ins().x86_cvtt2si(I32X4, minus_max_signed);
let le_max_signed_as_int = pos.ins().raw_bitcast(I32X4, le_max_signed);
let difference = pos
.ins()
.bxor(minus_max_signed_as_int, le_max_signed_as_int);
// Calculate amount to add above 0x7FFFFFF, zeroing out any lanes identified
// previously (MSB set to 1).
let zeroes_as_int = pos.ins().raw_bitcast(I32X4, zeroes);
let addend = pos.ins().x86_pmaxs(difference, zeroes_as_int);
// Convert the original clamped number to an integer and add back in the addend
// (the part of the value above 0x7FFFFFF, since CVTTPS2DQ overflows with these).
let converted = pos.ins().x86_cvtt2si(I32X4, ge_zero);
pos.func.dfg.replace(inst).iadd(converted, addend);
}
} else {
unreachable!(
"{} should not be legalized in expand_fcvt_to_uint_sat_vector",
pos.func.dfg.display_inst(inst, None)
)
}
}
}

/// Convert shuffle instructions.
fn convert_shuffle(
inst: ir::Inst,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,9 @@ block0(v0: i32x4 [%xmm2]):
[-, %xmm6] v1 = x86_vcvtudq2ps v0 ; bin: 62 f1 7f 08 7a f2
return
}

function %fcvt_to_uint(f32x4) {
block0(v0: f32x4 [%xmm2]):
[-, %xmm6] v1 = x86_vcvtps2udq v0 ; bin: 62 f1 7c 08 79 f2
return
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,10 @@ block0(v0:i32x4):
; check: v1 = x86_vcvtudq2ps v0
return v1
}

function %fcvt_to_uint_sat(f32x4) -> i32x4 {
block0(v0:f32x4):
v1 = fcvt_to_uint_sat.i32x4 v0
; check: v1 = x86_vcvtps2udq v0
return v1
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,23 @@ block0(v0:f32x4):
; nextln: v1 = bxor v7, v9
return v1
}

function %fcvt_to_uint_sat(f32x4) -> i32x4 {
; check: const0 = 0x00000000000000000000000000000000
; nextln: const1 = 0x4f0000004f0000004f0000004f000000
block0(v0:f32x4):
v1 = fcvt_to_uint_sat.i32x4 v0
; check: v2 = vconst.f32x4 const0
; nextln: v3 = vconst.f32x4 const1
; nextln: v4 = x86_fmax v0, v2
; nextln: v5 = fsub v4, v3
; nextln: v6 = fcmp le v3, v5
; nextln: v7 = x86_cvtt2si.i32x4 v5
; nextln: v8 = raw_bitcast.i32x4 v6
; nextln: v9 = bxor v7, v8
; nextln: v10 = raw_bitcast.i32x4 v2
; nextln: v11 = x86_pmaxs v9, v10
; nextln: v12 = x86_cvtt2si.i32x4 v4
; nextln: v1 = iadd v12, v11
return v1
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,12 @@ block0(v0:f32x4):
}
; run: %fcvt_to_sint_sat([0x0.0 -0x1.0 0x1.0 0x1.0p100]) == [0 -1 1 0x7FFFFFFF]
; run: %fcvt_to_sint_sat([-0x8.1 0x0.0 0x0.0 -0x1.0p100]) == [-8 0 0 0x80000000]

function %fcvt_to_uint_sat(f32x4) -> i32x4 {
block0(v0:f32x4):
v1 = fcvt_to_uint_sat.i32x4 v0
return v1
}
; run: %fcvt_to_uint_sat([0x1.0 0x4.2 0x4.6 0x1.0p100]) == [1 4 4 0xFFFFFFFF]
; run: %fcvt_to_uint_sat([-0x8.1 -0x0.0 0x0.0 -0x1.0p100]) == [0 0 0 0]
; run: %fcvt_to_uint_sat([0xB2D05E00.0 0.0 0.0 0.0]) == [3000000000 0 0 0]
7 changes: 5 additions & 2 deletions cranelift/wasm/src/code_translator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1559,8 +1559,11 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
let a = pop1_with_bitcast(state, F32X4, builder);
state.push1(builder.ins().fcvt_to_sint_sat(I32X4, a))
}
Operator::I32x4TruncSatF32x4U
| Operator::I8x16NarrowI16x8S { .. }
Operator::I32x4TruncSatF32x4U => {
let a = pop1_with_bitcast(state, F32X4, builder);
state.push1(builder.ins().fcvt_to_uint_sat(I32X4, a))
}
Operator::I8x16NarrowI16x8S { .. }
| Operator::I8x16NarrowI16x8U { .. }
| Operator::I16x8NarrowI32x4S { .. }
| Operator::I16x8NarrowI32x4U { .. }
Expand Down