Skip to content
This repository has been archived by the owner on Jun 26, 2020. It is now read-only.

Commit

Permalink
Add x86 implementation of splat instruction
Browse files Browse the repository at this point in the history
  • Loading branch information
abrown committed Jul 12, 2019
1 parent 232d1c1 commit 03c7803
Show file tree
Hide file tree
Showing 4 changed files with 173 additions and 2 deletions.
93 changes: 92 additions & 1 deletion cranelift-codegen/meta/src/isa/x86/legalize.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
use crate::cdsl::ast::{var, ExprBuilder, Literal};
use crate::cdsl::instructions::InstructionGroup;
use crate::cdsl::types::ValueType;
use crate::cdsl::xform::TransformGroupBuilder;

use crate::shared::types::Float::F64;
use crate::shared::types::Int::{I32, I64};
use crate::shared::Definitions as SharedDefinitions;

Expand All @@ -19,9 +20,11 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
// List of instructions.
let insts = &shared.instructions;
let band = insts.by_name("band");
let bitcast = insts.by_name("bitcast");
let bor = insts.by_name("bor");
let clz = insts.by_name("clz");
let ctz = insts.by_name("ctz");
let f64const = insts.by_name("f64const");
let fcmp = insts.by_name("fcmp");
let fcvt_from_uint = insts.by_name("fcvt_from_uint");
let fcvt_to_sint = insts.by_name("fcvt_to_sint");
Expand All @@ -33,11 +36,15 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
let iadd = insts.by_name("iadd");
let iconst = insts.by_name("iconst");
let imul = insts.by_name("imul");
let insertlane = insts.by_name("insertlane");
let isub = insts.by_name("isub");
let popcnt = insts.by_name("popcnt");
let raw_bitcast = insts.by_name("raw_bitcast");
let scalar_to_vector = insts.by_name("scalar_to_vector");
let sdiv = insts.by_name("sdiv");
let selectif = insts.by_name("selectif");
let smulhi = insts.by_name("smulhi");
let splat = insts.by_name("splat");
let srem = insts.by_name("srem");
let udiv = insts.by_name("udiv");
let umulhi = insts.by_name("umulhi");
Expand All @@ -46,13 +53,17 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou

let x86_bsf = x86_instructions.by_name("x86_bsf");
let x86_bsr = x86_instructions.by_name("x86_bsr");
let x86_pshufb = x86_instructions.by_name("x86_pshufb");
let x86_pshufd = x86_instructions.by_name("x86_pshufd");
let x86_umulx = x86_instructions.by_name("x86_umulx");
let x86_smulx = x86_instructions.by_name("x86_smulx");

// List of immediates.
let floatcc = shared.operand_kinds.by_name("floatcc");
let imm64 = shared.operand_kinds.by_name("imm64");
let intcc = shared.operand_kinds.by_name("intcc");
let uimm8 = shared.operand_kinds.by_name("uimm8");
let ieee64 = shared.operand_kinds.by_name("ieee64");

// Division and remainder.
//
Expand Down Expand Up @@ -290,4 +301,84 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
);

group.build_and_add_to(&mut shared.transform_groups);

// Builder for the x86-specific "x86_narrow" transform group. It is tagged with the "x86" ISA and
// chained with the shared "narrow" group looked up by name below.
// NOTE(review): presumably chaining means the shared "narrow" legalizations are tried when no
// pattern in this group applies; confirm against `TransformGroupBuilder::chain_with`.
let mut narrow = TransformGroupBuilder::new(
"x86_narrow",
r#"
Legalize instructions by narrowing.
Use x86-specific instructions if needed."#,
)
.isa("x86")
.chain_with(shared.transform_groups.by_name("narrow").id);

// SIMD
// Immediate literals and extra pattern variables shared by the splat legalizations below:
// `uimm8_zero`/`uimm8_one` are used as the PSHUFD mask and the `insertlane` lane index,
// `ieee64_zero` is the operand of `f64const` in the 8-bit splat pattern.
let uimm8_zero = Literal::constant(uimm8, 0x00);
let uimm8_one = Literal::constant(uimm8, 0x01);
let ieee64_zero = Literal::constant(ieee64, 0x00);
let b = var("b");
let c = var("c");
let d = var("d");

// SIMD splat: 8-bits
// For every lane type that is 8 bits wide, legalize `splat` into a 128-bit vector
// (128 / lane_bits = 16 lanes) as a scalar-to-vector move followed by a PSHUFB whose shuffle
// mask is all zeroes.
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
let splat_x8x16 = splat.bind_vector(ty, 128 / ty.lane_bits());
let bitcast_f64_to_any8x16 = bitcast.bind_vector(ty, 128 / ty.lane_bits()).bind(F64);
narrow.legalize(
def!(y = splat_x8x16(x)),
vec![
def!(a = scalar_to_vector(x)), // move into the lowest 8 bits of an XMM register
def!(b = f64const(ieee64_zero)), // zero out a different XMM register; the shuffle mask for moving the lowest byte to all other byte lanes is 0x0
def!(c = bitcast_f64_to_any8x16(b)), // no instruction emitted; informs the SSA that the 0 in b can be used as a vector of this type
def!(y = x86_pshufb(a, c)), // PSHUFB takes two XMM operands, one of which is a shuffle mask (i.e. c, the zero from b retyped as a vector)
],
);
}

// SIMD splat: 16-bits
// For every lane type that is 16 bits wide, legalize `splat` into a 128-bit vector
// (128 / lane_bits = 8 lanes): place the scalar in the two lowest 16-bit lanes, then view the
// vector as I32x4 so a single PSHUFD can replicate that 32-bit pair across the register.
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 16) {
let splat_x16x8 = splat.bind_vector(ty, 128 / ty.lane_bits());
let raw_bitcast_any16x8_to_i32x4 = raw_bitcast
.bind_vector(I32, 4)
.bind_vector(ty, 128 / ty.lane_bits());
let raw_bitcast_i32x4_to_any16x8 = raw_bitcast
.bind_vector(ty, 128 / ty.lane_bits())
.bind_vector(I32, 4);
narrow.legalize(
def!(y = splat_x16x8(x)),
vec![
def!(a = scalar_to_vector(x)), // move into the lowest 16 bits of an XMM register
def!(b = insertlane(a, uimm8_one, x)), // insert the value again but in the next lowest 16 bits
def!(c = raw_bitcast_any16x8_to_i32x4(b)), // no instruction emitted; pretend this is an I32x4 so we can use PSHUFD
def!(d = x86_pshufd(c, uimm8_zero)), // broadcast the lowest 32 bits (both copies of x) to every I32 lane with PSHUFD
def!(y = raw_bitcast_i32x4_to_any16x8(d)), // no instruction emitted; pretend this is an X16x8 again
],
);
}

// SIMD splat: 32-bits
// For every lane type that is 32 bits wide, legalize `splat` into a 128-bit vector
// (128 / lane_bits = 4 lanes) as a scalar-to-vector move followed by a PSHUFD with mask 0.
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
let splat_any32x4 = splat.bind_vector(ty, 128 / ty.lane_bits());
narrow.legalize(
def!(y = splat_any32x4(x)),
vec![
def!(a = scalar_to_vector(x)), // translate to an x86 MOV to get the value in an XMM register
def!(y = x86_pshufd(a, uimm8_zero)), // broadcast the lowest 32 bits of the XMM register to all lanes with PSHUFD
],
);
}

// SIMD splat: 64-bits
// For every lane type that is 64 bits wide, legalize `splat` into a 128-bit vector
// (128 / lane_bits = 2 lanes) by writing the scalar into lane 0 and then into lane 1; no
// shuffle is used for this width.
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 64) {
let splat_any64x2 = splat.bind_vector(ty, 128 / ty.lane_bits());
narrow.legalize(
def!(y = splat_any64x2(x)),
vec![
def!(a = scalar_to_vector(x)), // move into the lowest 64 bits of an XMM register
def!(y = insertlane(a, uimm8_one, x)), // move into the highest 64 bits of the same XMM register
],
);
}

narrow.build_and_add_to(&mut shared.transform_groups);
}
3 changes: 2 additions & 1 deletion cranelift-codegen/meta/src/isa/x86/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ pub fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
let expand_flags = shared_defs.transform_groups.by_name("expand_flags");
let narrow = shared_defs.transform_groups.by_name("narrow");
let widen = shared_defs.transform_groups.by_name("widen");
let x86_narrow = shared_defs.transform_groups.by_name("x86_narrow");
let x86_expand = shared_defs.transform_groups.by_name("x86_expand");

x86_32.legalize_monomorphic(expand_flags);
Expand All @@ -42,7 +43,7 @@ pub fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
x86_32.legalize_type(F64, x86_expand);

x86_64.legalize_monomorphic(expand_flags);
x86_64.legalize_default(narrow);
x86_64.legalize_default(x86_narrow);
x86_64.legalize_type(B1, expand_flags);
x86_64.legalize_type(I8, widen);
x86_64.legalize_type(I16, widen);
Expand Down
6 changes: 6 additions & 0 deletions cranelift-codegen/src/ir/immediates.rs
Original file line number Diff line number Diff line change
Expand Up @@ -738,6 +738,12 @@ impl From<f64> for Ieee64 {
}
}

impl From<u64> for Ieee64 {
fn from(x: u64) -> Self {
Ieee64::with_float(f64::from_bits(x))
}
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
73 changes: 73 additions & 0 deletions filetests/isa/x86/legalize-splat.clif
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
test compile
set enable_simd=true
set probestack_enabled=false
target x86_64 haswell

; use baldrdash calling convention here for simplicity (avoids prologue, epilogue)
; 32-bit lanes; the directives after this function expect scalar_to_vector followed by x86_pshufd with mask 0
function %test_splat_i32() -> i32x4 baldrdash {
ebb0:
v0 = iconst.i32 42
v1 = splat.i32x4 v0
return v1
}

; sameln: function %test_splat_i32() -> i32x4 [%xmm0] baldrdash {
; nextln: ss0 = incoming_arg 0, offset 0
; nextln:
; nextln: ebb0:
; nextln: v0 = iconst.i32 42
; nextln: v2 = scalar_to_vector.i32x4 v0
; nextln: v1 = x86_pshufd v2, 0
; nextln: return v1
; nextln: }



; 64-bit lanes; the directives after this function expect scalar_to_vector followed by insertlane into lane 1
function %test_splat_i64() -> i64x2 baldrdash {
ebb0:
v0 = iconst.i64 42
v1 = splat.i64x2 v0
return v1
}

; check: ebb0:
; nextln: v0 = iconst.i64 42
; nextln: v2 = scalar_to_vector.i64x2 v0
; nextln: v1 = insertlane v2, 1, v0
; nextln: return v1



; 16-bit lanes (boolean); the directives after this function expect insertlane, a raw_bitcast to i32x4, x86_pshufd and a raw_bitcast back
function %test_splat_b16() -> b16x8 baldrdash {
ebb0:
v0 = bconst.b16 true
v1 = splat.b16x8 v0
return v1
}

; check: ebb0:
; nextln: v0 = bconst.b16 true
; nextln: v2 = scalar_to_vector.b16x8 v0
; nextln: v3 = insertlane v2, 1, v0
; nextln: v4 = raw_bitcast.i32x4 v3
; nextln: v5 = x86_pshufd v4, 0
; nextln: v1 = raw_bitcast.b16x8 v5
; nextln: return v1



; 8-bit lanes; the directives after this function expect a zero f64const bitcast to i8x16 and used as the x86_pshufb mask
function %test_splat_i8() -> i8x16 baldrdash {
ebb0:
v0 = iconst.i8 42
v1 = splat.i8x16 v0
return v1
}

; check: ebb0:
; nextln: v2 = iconst.i32 42
; nextln: v0 = ireduce.i8 v2
; nextln: v3 = scalar_to_vector.i8x16 v0
; nextln: v4 = f64const 0.0
; nextln: v5 = bitcast.i8x16 v4
; nextln: v1 = x86_pshufb v3, v5
; nextln: return v1

0 comments on commit 03c7803

Please sign in to comment.