Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

s390x: Implement full SIMD support #4427

Merged
merged 1 commit into from
Jul 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -171,9 +171,9 @@ fn write_testsuite_tests(
fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
match strategy {
"Cranelift" => match (testsuite, testname) {
// No simd support yet for s390x.
("simd", _) if platform_is_s390x() => return true,
_ if platform_is_s390x() && testname.starts_with("simd") => return true,
// FIXME: These tests fail under qemu due to a qemu bug.
(_, "simd_f32x4_pmin_pmax") if platform_is_s390x() => return true,
(_, "simd_f64x2_pmin_pmax") if platform_is_s390x() => return true,
_ => {}
},
_ => panic!("unrecognized strategy"),
Expand Down
4 changes: 2 additions & 2 deletions cranelift/codegen/src/data_value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ impl DataValue {
DataValue::I128(i) => dst[..16].copy_from_slice(&i.to_ne_bytes()[..]),
DataValue::F32(f) => dst[..4].copy_from_slice(&f.bits().to_ne_bytes()[..]),
DataValue::F64(f) => dst[..8].copy_from_slice(&f.bits().to_ne_bytes()[..]),
DataValue::V128(v) => dst[..16].copy_from_slice(&v[..]),
DataValue::V128(v) => dst[..16].copy_from_slice(&u128::from_le_bytes(*v).to_ne_bytes()),
_ => unimplemented!(),
};
}
Expand Down Expand Up @@ -120,7 +120,7 @@ impl DataValue {
DataValue::B(src[..size].iter().any(|&i| i != 0))
}
_ if ty.is_vector() && ty.bytes() == 16 => {
DataValue::V128(src[..16].try_into().unwrap())
DataValue::V128(u128::from_ne_bytes(src[..16].try_into().unwrap()).to_le_bytes())
}
_ => unimplemented!(),
}
Expand Down
98 changes: 90 additions & 8 deletions cranelift/codegen/src/isa/s390x/abi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,10 @@ fn in_flt_reg(ty: Type) -> bool {
}
}

/// Returns `true` if a value of type `ty` is assigned to a vector register:
/// the type must be a vector type and exactly 128 bits wide.
fn in_vec_reg(ty: Type) -> bool {
    // Non-vector types are never placed in vector registers.
    if !ty.is_vector() {
        return false;
    }
    ty.bits() == 128
}

fn get_intreg_for_arg(idx: usize) -> Option<Reg> {
match idx {
0 => Some(regs::gpr(2)),
Expand All @@ -118,6 +122,20 @@ fn get_fltreg_for_arg(idx: usize) -> Option<Reg> {
}
}

/// Returns the vector register used for the `idx`-th vector argument, or
/// `None` once the argument registers are exhausted.
///
/// Argument indices 0 through 7 map, in order, to vector registers 24
/// through 31.
fn get_vecreg_for_arg(idx: usize) -> Option<Reg> {
    // Select the register number first, then build the register once.
    let vr_num = match idx {
        0 => 24,
        1 => 25,
        2 => 26,
        3 => 27,
        4 => 28,
        5 => 29,
        6 => 30,
        7 => 31,
        _ => return None,
    };
    Some(regs::vr(vr_num))
}

fn get_intreg_for_ret(idx: usize) -> Option<Reg> {
match idx {
0 => Some(regs::gpr(2)),
Expand All @@ -140,6 +158,21 @@ fn get_fltreg_for_ret(idx: usize) -> Option<Reg> {
}
}

/// Returns the vector register used for the `idx`-th vector return value, or
/// `None` once the return registers are exhausted.
///
/// Index 0 maps to vector register 24. Indices 1 through 7 map to vector
/// registers 25 through 31 as an ABI extension to support multi-value
/// returns.
fn get_vecreg_for_ret(idx: usize) -> Option<Reg> {
    // Select the register number first, then build the register once.
    let vr_num = match idx {
        0 => 24,
        // ABI extension to support multi-value returns:
        1 => 25,
        2 => 26,
        3 => 27,
        4 => 28,
        5 => 29,
        6 => 30,
        7 => 31,
        _ => return None,
    };
    Some(regs::vr(vr_num))
}

/// This is the limit for the size of argument and return-value areas on the
/// stack. We place a reasonable limit here to avoid integer overflow issues
/// with 32-bit arithmetic: for now, 128 MB.
Expand Down Expand Up @@ -182,6 +215,7 @@ impl ABIMachineSpec for S390xMachineDeps {
) -> CodegenResult<(Vec<ABIArg>, i64, Option<usize>)> {
let mut next_gpr = 0;
let mut next_fpr = 0;
let mut next_vr = 0;
let mut next_stack: u64 = 0;
let mut ret = vec![];

Expand All @@ -206,21 +240,27 @@ impl ABIMachineSpec for S390xMachineDeps {

let intreg = in_int_reg(param.value_type);
let fltreg = in_flt_reg(param.value_type);
debug_assert!(intreg || fltreg);
debug_assert!(!(intreg && fltreg));
let vecreg = in_vec_reg(param.value_type);
debug_assert!(intreg as i32 + fltreg as i32 + vecreg as i32 == 1);

let (next_reg, candidate) = if intreg {
let candidate = match args_or_rets {
ArgsOrRets::Args => get_intreg_for_arg(next_gpr),
ArgsOrRets::Rets => get_intreg_for_ret(next_gpr),
};
(&mut next_gpr, candidate)
} else {
} else if fltreg {
let candidate = match args_or_rets {
ArgsOrRets::Args => get_fltreg_for_arg(next_fpr),
ArgsOrRets::Rets => get_fltreg_for_ret(next_fpr),
};
(&mut next_fpr, candidate)
} else {
let candidate = match args_or_rets {
ArgsOrRets::Args => get_vecreg_for_arg(next_vr),
ArgsOrRets::Rets => get_vecreg_for_ret(next_vr),
};
(&mut next_vr, candidate)
};

// In the Wasmtime ABI only the first return value can be in a register.
Expand Down Expand Up @@ -252,7 +292,8 @@ impl ABIMachineSpec for S390xMachineDeps {

// Align the stack slot.
debug_assert!(slot_size.is_power_of_two());
next_stack = align_to(next_stack, slot_size);
let slot_align = std::cmp::min(slot_size, 8);
next_stack = align_to(next_stack, slot_align);

// If the type is actually of smaller size (and the argument
// was not extended), it is passed right-aligned.
Expand Down Expand Up @@ -477,6 +518,13 @@ impl ABIMachineSpec for S390xMachineDeps {
RegClass::Float => clobbered_fpr.push(reg),
}
}
// We need to save the link register in non-leaf functions.
// FIXME: This should be included in the clobber list to begin with,
// but isn't because we have excluded call instructions via the
// is_included_in_clobbers callback.
if outgoing_args_size > 0 {
clobbered_gpr.push(Writable::from_reg(RealReg::from(gpr_preg(14))));
}

let mut first_clobbered_gpr = 16;
for reg in clobbered_gpr {
Expand Down Expand Up @@ -534,13 +582,15 @@ impl ABIMachineSpec for S390xMachineDeps {

// Save FPRs.
for (i, reg) in clobbered_fpr.iter().enumerate() {
insts.push(Inst::FpuStore64 {
insts.push(Inst::VecStoreLane {
size: 64,
rd: reg.to_reg().into(),
mem: MemArg::reg_plus_off(
stack_reg(),
(i * 8) as i64 + outgoing_args_size as i64 + fixed_frame_storage_size as i64,
MemFlags::trusted(),
),
lane_imm: 0,
});
if flags.unwind_info() {
insts.push(Inst::Unwind {
Expand All @@ -566,7 +616,14 @@ impl ABIMachineSpec for S390xMachineDeps {
let mut insts = SmallVec::new();

// Collect clobbered registers.
let (clobbered_gpr, clobbered_fpr) = get_regs_saved_in_prologue(call_conv, clobbers);
let (mut clobbered_gpr, clobbered_fpr) = get_regs_saved_in_prologue(call_conv, clobbers);
// We need to restore the link register in non-leaf functions.
// FIXME: This should be included in the clobber list to begin with,
// but isn't because we have excluded call instructions via the
// is_included_in_clobbers callback.
if outgoing_args_size > 0 {
clobbered_gpr.push(Writable::from_reg(RealReg::from(gpr_preg(14))));
}
let mut first_clobbered_gpr = 16;
for reg in clobbered_gpr {
let enc = reg.to_reg().hw_enc();
Expand All @@ -578,13 +635,15 @@ impl ABIMachineSpec for S390xMachineDeps {

// Restore FPRs.
for (i, reg) in clobbered_fpr.iter().enumerate() {
insts.push(Inst::FpuLoad64 {
insts.push(Inst::VecLoadLaneUndef {
size: 64,
rd: Writable::from_reg(reg.to_reg().into()),
mem: MemArg::reg_plus_off(
stack_reg(),
(i * 8) as i64 + outgoing_args_size as i64 + fixed_frame_storage_size as i64,
MemFlags::trusted(),
),
lane_imm: 0,
});
}

Expand Down Expand Up @@ -639,7 +698,7 @@ impl ABIMachineSpec for S390xMachineDeps {
// We allocate in terms of 8-byte slots.
match rc {
RegClass::Int => 1,
RegClass::Float => 1,
RegClass::Float => 2,
}
}

Expand Down Expand Up @@ -739,6 +798,21 @@ const fn clobbers() -> PRegSet {
.with(gpr_preg(3))
.with(gpr_preg(4))
.with(gpr_preg(5))
// v0 - v7 inclusive and v16 - v31 inclusive are
// caller-saves. The upper 64 bits of v8 - v15 inclusive are
// also caller-saves. However, because we cannot currently
// represent partial registers to regalloc2, we indicate here
// that every vector register is caller-save. Because this
// function is used at *callsites*, approximating in this
// direction (save more than necessary) is conservative and
// thus safe.
//
// Note that we exclude clobbers from a call instruction when
// a call instruction's callee has the same ABI as the caller
// (the current function body); this is safe (anything
// clobbered by callee can be clobbered by caller as well) and
// avoids unnecessary saves of v8-v15 in the prologue even
// though we include them as defs here.
.with(vr_preg(0))
.with(vr_preg(1))
.with(vr_preg(2))
Expand All @@ -747,6 +821,14 @@ const fn clobbers() -> PRegSet {
.with(vr_preg(5))
.with(vr_preg(6))
.with(vr_preg(7))
.with(vr_preg(8))
.with(vr_preg(9))
.with(vr_preg(10))
.with(vr_preg(11))
.with(vr_preg(12))
.with(vr_preg(13))
.with(vr_preg(14))
.with(vr_preg(15))
.with(vr_preg(16))
.with(vr_preg(17))
.with(vr_preg(18))
Expand Down
Loading