diff --git a/cranelift-codegen/meta/src/isa/x86/legalize.rs b/cranelift-codegen/meta/src/isa/x86/legalize.rs
index 2815665da..8423b1099 100644
--- a/cranelift-codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift-codegen/meta/src/isa/x86/legalize.rs
@@ -1,7 +1,8 @@
 use crate::cdsl::ast::{var, ExprBuilder, Literal};
 use crate::cdsl::instructions::InstructionGroup;
+use crate::cdsl::types::ValueType;
 use crate::cdsl::xform::TransformGroupBuilder;
-
+use crate::shared::types::Float::F64;
 use crate::shared::types::Int::{I32, I64};
 use crate::shared::Definitions as SharedDefinitions;
 
@@ -19,9 +20,11 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
     // List of instructions.
     let insts = &shared.instructions;
     let band = insts.by_name("band");
+    let bitcast = insts.by_name("bitcast");
     let bor = insts.by_name("bor");
     let clz = insts.by_name("clz");
     let ctz = insts.by_name("ctz");
+    let f64const = insts.by_name("f64const");
     let fcmp = insts.by_name("fcmp");
     let fcvt_from_uint = insts.by_name("fcvt_from_uint");
     let fcvt_to_sint = insts.by_name("fcvt_to_sint");
@@ -33,11 +36,15 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
     let iadd = insts.by_name("iadd");
     let iconst = insts.by_name("iconst");
     let imul = insts.by_name("imul");
+    let insertlane = insts.by_name("insertlane");
     let isub = insts.by_name("isub");
     let popcnt = insts.by_name("popcnt");
+    let raw_bitcast = insts.by_name("raw_bitcast");
+    let scalar_to_vector = insts.by_name("scalar_to_vector");
     let sdiv = insts.by_name("sdiv");
     let selectif = insts.by_name("selectif");
     let smulhi = insts.by_name("smulhi");
+    let splat = insts.by_name("splat");
     let srem = insts.by_name("srem");
     let udiv = insts.by_name("udiv");
     let umulhi = insts.by_name("umulhi");
@@ -46,6 +53,8 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
 
     let x86_bsf = x86_instructions.by_name("x86_bsf");
     let x86_bsr = x86_instructions.by_name("x86_bsr");
+    let x86_pshufb = x86_instructions.by_name("x86_pshufb");
+    let x86_pshufd = x86_instructions.by_name("x86_pshufd");
     let x86_umulx = x86_instructions.by_name("x86_umulx");
     let x86_smulx = x86_instructions.by_name("x86_smulx");
 
@@ -53,6 +62,8 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
     let floatcc = shared.operand_kinds.by_name("floatcc");
     let imm64 = shared.operand_kinds.by_name("imm64");
     let intcc = shared.operand_kinds.by_name("intcc");
+    let uimm8 = shared.operand_kinds.by_name("uimm8");
+    let ieee64 = shared.operand_kinds.by_name("ieee64");
 
     // Division and remainder.
     //
@@ -290,4 +301,84 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
     );
 
     group.build_and_add_to(&mut shared.transform_groups);
+
+    let mut narrow = TransformGroupBuilder::new(
+        "x86_narrow",
+        r#"
+    Legalize instructions by narrowing.
+
+    Use x86-specific instructions if needed."#,
+    )
+    .isa("x86")
+    .chain_with(shared.transform_groups.by_name("narrow").id);
+
+    // SIMD
+    let uimm8_zero = Literal::constant(uimm8, 0x00);
+    let uimm8_one = Literal::constant(uimm8, 0x01);
+    let ieee64_zero = Literal::constant(ieee64, 0x00);
+    let b = var("b");
+    let c = var("c");
+    let d = var("d");
+
+    // SIMD splat: 8-bits
+    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
+        let splat_x8x16 = splat.bind_vector(ty, 128 / ty.lane_bits());
+        let bitcast_f64_to_any8x16 = bitcast.bind_vector(ty, 128 / ty.lane_bits()).bind(F64);
+        narrow.legalize(
+            def!(y = splat_x8x16(x)),
+            vec![
+                def!(a = scalar_to_vector(x)), // move into the lowest 8 bits of an XMM register
+                def!(b = f64const(ieee64_zero)), // zero out a different XMM register; the shuffle mask for moving the lowest byte to all other byte lanes is 0x0
+                def!(c = bitcast_f64_to_any8x16(b)), // no instruction emitted; informs the SSA that the 0 in b can be used as a vector of this type
+                def!(y = x86_pshufb(a, c)), // PSHUFB takes two XMM operands, one of which is the shuffle mask (i.e. c, the zeroed register)
+            ],
+        );
+    }
+
+    // SIMD splat: 16-bits
+    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 16) {
+        let splat_x16x8 = splat.bind_vector(ty, 128 / ty.lane_bits());
+        let raw_bitcast_any16x8_to_i32x4 = raw_bitcast
+            .bind_vector(I32, 4)
+            .bind_vector(ty, 128 / ty.lane_bits());
+        let raw_bitcast_i32x4_to_any16x8 = raw_bitcast
+            .bind_vector(ty, 128 / ty.lane_bits())
+            .bind_vector(I32, 4);
+        narrow.legalize(
+            def!(y = splat_x16x8(x)),
+            vec![
+                def!(a = scalar_to_vector(x)), // move into the lowest 16 bits of an XMM register
+                def!(b = insertlane(a, uimm8_one, x)), // insert the value again but in the next lowest 16 bits
+                def!(c = raw_bitcast_any16x8_to_i32x4(b)), // no instruction emitted; pretend this is an I32x4 so we can use PSHUFD
+                def!(d = x86_pshufd(c, uimm8_zero)), // broadcast the lowest 32 bits to all lanes with PSHUFD
+                def!(y = raw_bitcast_i32x4_to_any16x8(d)), // no instruction emitted; pretend this is an X16x8 again
+            ],
+        );
+    }
+
+    // SIMD splat: 32-bits
+    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
+        let splat_any32x4 = splat.bind_vector(ty, 128 / ty.lane_bits());
+        narrow.legalize(
+            def!(y = splat_any32x4(x)),
+            vec![
+                def!(a = scalar_to_vector(x)), // translate to an x86 MOV to get the value in an XMM register
+                def!(y = x86_pshufd(a, uimm8_zero)), // broadcast the lowest 32 bits to all lanes with PSHUFD
+            ],
+        );
+    }
+
+    // SIMD splat: 64-bits
+    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 64) {
+        let splat_any64x2 = splat.bind_vector(ty, 128 / ty.lane_bits());
+        narrow.legalize(
+            def!(y = splat_any64x2(x)),
+            vec![
+                def!(a = scalar_to_vector(x)), // move into the lowest 64 bits of an XMM register
+                def!(y = insertlane(a, uimm8_one, x)), // move into the highest 64 bits of the same XMM register
+            ],
+        );
+    }
+
+    narrow.build_and_add_to(&mut shared.transform_groups);
 }
diff --git a/cranelift-codegen/meta/src/isa/x86/mod.rs b/cranelift-codegen/meta/src/isa/x86/mod.rs
index a8002890f..9f4a4a334 100644
--- a/cranelift-codegen/meta/src/isa/x86/mod.rs
+++ b/cranelift-codegen/meta/src/isa/x86/mod.rs
@@ -30,6 +30,7 @@ pub fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
     let expand_flags = shared_defs.transform_groups.by_name("expand_flags");
     let narrow = shared_defs.transform_groups.by_name("narrow");
     let widen = shared_defs.transform_groups.by_name("widen");
+    let x86_narrow = shared_defs.transform_groups.by_name("x86_narrow");
     let x86_expand = shared_defs.transform_groups.by_name("x86_expand");
 
     x86_32.legalize_monomorphic(expand_flags);
@@ -42,7 +43,7 @@ pub fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
     x86_32.legalize_type(F64, x86_expand);
 
     x86_64.legalize_monomorphic(expand_flags);
-    x86_64.legalize_default(narrow);
+    x86_64.legalize_default(x86_narrow);
     x86_64.legalize_type(B1, expand_flags);
     x86_64.legalize_type(I8, widen);
     x86_64.legalize_type(I16, widen);
diff --git a/cranelift-codegen/src/ir/immediates.rs b/cranelift-codegen/src/ir/immediates.rs
index 73d1d8cc2..4a8d5e1fb 100644
--- a/cranelift-codegen/src/ir/immediates.rs
+++ b/cranelift-codegen/src/ir/immediates.rs
@@ -738,6 +738,12 @@ impl From<f64> for Ieee64 {
     }
 }
 
+impl From<u64> for Ieee64 {
+    fn from(x: u64) -> Self {
+        Ieee64::with_float(f64::from_bits(x))
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/filetests/isa/x86/legalize-splat.clif b/filetests/isa/x86/legalize-splat.clif
new file mode 100644
index 000000000..c50f5a2f8
--- /dev/null
+++ b/filetests/isa/x86/legalize-splat.clif
@@ -0,0 +1,73 @@
+test compile
+set enable_simd=true
+set probestack_enabled=false
+target x86_64 haswell
+
+; use baldrdash calling convention here for simplicity (avoids prologue, epilogue)
+function %test_splat_i32() -> i32x4 baldrdash {
+ebb0:
+    v0 = iconst.i32 42
+    v1 = splat.i32x4 v0
+    return v1
+}
+
+; sameln: function %test_splat_i32() -> i32x4 [%xmm0] baldrdash {
+; nextln: ss0 = incoming_arg 0, offset 0
+; nextln:
+; nextln: ebb0:
+; nextln: v0 = iconst.i32 42
+; nextln: v2 = scalar_to_vector.i32x4 v0
+; nextln: v1 = x86_pshufd v2, 0
+; nextln: return v1
+; nextln: }
+
+
+
+function %test_splat_i64() -> i64x2 baldrdash {
+ebb0:
+    v0 = iconst.i64 42
+    v1 = splat.i64x2 v0
+    return v1
+}
+
+; check: ebb0:
+; nextln: v0 = iconst.i64 42
+; nextln: v2 = scalar_to_vector.i64x2 v0
+; nextln: v1 = insertlane v2, 1, v0
+; nextln: return v1
+
+
+
+function %test_splat_b16() -> b16x8 baldrdash {
+ebb0:
+    v0 = bconst.b16 true
+    v1 = splat.b16x8 v0
+    return v1
+}
+
+; check: ebb0:
+; nextln: v0 = bconst.b16 true
+; nextln: v2 = scalar_to_vector.b16x8 v0
+; nextln: v3 = insertlane v2, 1, v0
+; nextln: v4 = raw_bitcast.i32x4 v3
+; nextln: v5 = x86_pshufd v4, 0
+; nextln: v1 = raw_bitcast.b16x8 v5
+; nextln: return v1
+
+
+
+function %test_splat_i8() -> i8x16 baldrdash {
+ebb0:
+    v0 = iconst.i8 42
+    v1 = splat.i8x16 v0
+    return v1
+}
+
+; check: ebb0:
+; nextln: v2 = iconst.i32 42
+; nextln: v0 = ireduce.i8 v2
+; nextln: v3 = scalar_to_vector.i8x16 v0
+; nextln: v4 = f64const 0.0
+; nextln: v5 = bitcast.i8x16 v4
+; nextln: v1 = x86_pshufb v3, v5
+; nextln: return v1
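
Note (not part of the patch): the new `impl From<u64> for Ieee64` converts a raw IEEE 754 bit pattern rather than a numeric value, which is presumably what lets the legalizer spell the all-zero PSHUFB mask as `Literal::constant(ieee64, 0x00)` and is why the legalized CLIF above shows `v4 = f64const 0.0`. A minimal standalone sketch of that bit-reinterpretation behaviour, using only the standard library and assuming, as the patch does, that `Ieee64::with_float` simply stores the bits of the given `f64`:

// Sketch: reinterpret u64 bit patterns as f64 constants, the way the new
// From<u64> conversion does via f64::from_bits.
fn main() {
    // The bit pattern 0x00 is +0.0, matching `f64const 0.0` in the filetest.
    assert_eq!(f64::from_bits(0x00u64), 0.0);

    // For contrast, 0x3FF0_0000_0000_0000 is the bit pattern of 1.0.
    assert_eq!(f64::from_bits(0x3FF0_0000_0000_0000_u64), 1.0);

    println!("u64 bit patterns reinterpret cleanly as f64 constants");
}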