From fe6ddcb856dd2991abff7eb676751d574a536a57 Mon Sep 17 00:00:00 2001
From: Ulrich Weigand
Date: Sat, 9 Jul 2022 19:05:15 +0200
Subject: [PATCH] s390x: Implement full SIMD support

This adds full support for all Cranelift SIMD instructions to the
s390x target.  Everything is matched fully via ISLE.

In addition to adding support for many new instructions, and the
lower.isle code to match all SIMD IR patterns, this patch also adds
ABI support for vector types.  In particular, we now need to handle
the fact that vector registers 8 .. 15 are partially callee-saved,
i.e. the high parts of those registers (which correspond to the old
floating-point registers) are callee-saved, but the low parts are
not.  This is the exact same situation that we already have on
AArch64, and so this patch uses the same solution (the
is_included_in_clobbers callback).

The bulk of the changes are platform-specific, but there are a few
exceptions:

- Added ISLE extractors for the Immediate and Constant types, to
  enable matching the vconst and swizzle instructions.
- Added a missing accessor for call_conv to ABISig.
- Fixed endian conversion for vector types in data_value.rs to
  enable their use in runtests on the big-endian platforms.
- Enabled (nearly) all SIMD runtests on s390x.  [ Two test cases
  remain disabled due to vector shift count semantics, see below. ]
- Enabled all Wasmtime SIMD tests on s390x.

There are three minor issues, called out via FIXMEs below, which
should be addressed in the future, but should not be blockers to
getting this patch merged.  I've opened the following issues to
track them:

- Vector shift count semantics
  https://github.com/bytecodealliance/wasmtime/issues/4424
- is_included_in_clobbers vs. link register
  https://github.com/bytecodealliance/wasmtime/issues/4425
- gen_constant callback
  https://github.com/bytecodealliance/wasmtime/issues/4426

All tests, including all newly enabled SIMD tests, pass on both z14
and z15 architectures.
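
A note on the clobber handling, since the reasoning is easy to miss: as
on AArch64, a call whose callee uses the same ABI as the caller cannot
clobber anything that the caller's own prologue does not already handle,
so such calls are excluded from the clobber set via the
is_included_in_clobbers callback, and the pessimistically-listed v8-v15
clobbers do not force extra saves.  A minimal sketch of that predicate
(illustrative names only, not the exact Cranelift signatures):

    // Sketch under the assumption that the decision depends only on the
    // two calling conventions; the real hook is the is_included_in_clobbers
    // callback on the machine instruction.
    #[derive(Clone, Copy, PartialEq, Eq)]
    enum CallConv { SystemV, WasmtimeSystemV }

    // A call contributes its clobbers to the prologue computation only
    // when it crosses an ABI boundary; same-ABI calls are excluded, which
    // avoids unnecessary saves of v8-v15.
    fn call_included_in_clobbers(caller: CallConv, callee: CallConv) -> bool {
        caller != callee
    }
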
--- build.rs | 6 +- cranelift/codegen/src/data_value.rs | 4 +- cranelift/codegen/src/isa/s390x/abi.rs | 98 +- cranelift/codegen/src/isa/s390x/inst.isle | 1412 ++++- cranelift/codegen/src/isa/s390x/inst/emit.rs | 979 +++- .../codegen/src/isa/s390x/inst/emit_tests.rs | 4765 +++++++++++++++-- cranelift/codegen/src/isa/s390x/inst/mod.rs | 1057 +++- cranelift/codegen/src/isa/s390x/lower.isle | 1432 ++++- cranelift/codegen/src/isa/s390x/lower.rs | 86 +- cranelift/codegen/src/isa/s390x/lower/isle.rs | 131 +- cranelift/codegen/src/machinst/abi_impl.rs | 5 + cranelift/codegen/src/machinst/isle.rs | 15 +- cranelift/codegen/src/machinst/lower.rs | 10 +- cranelift/codegen/src/prelude.isle | 11 + .../filetests/isa/s390x/condops.clif | 15 + .../filetests/isa/s390x/floating-point.clif | 40 + .../filetests/isa/s390x/fpmem-arch13.clif | 8 +- .../filetests/isa/s390x/vec-arithmetic.clif | 824 +++ .../filetests/isa/s390x/vec-bitops.clif | 43 + .../filetests/isa/s390x/vec-bitwise.clif | 364 ++ .../filetests/isa/s390x/vec-constants.clif | 213 + .../filetests/isa/s390x/vec-conversions.clif | 222 + .../filetests/isa/s390x/vec-fcmp.clif | 309 ++ .../filetests/isa/s390x/vec-fp-arch13.clif | 90 + .../filetests/filetests/isa/s390x/vec-fp.clif | 533 ++ .../filetests/isa/s390x/vec-icmp.clif | 423 ++ .../filetests/isa/s390x/vec-lane-arch13.clif | 807 +++ .../filetests/isa/s390x/vec-lane.clif | 1964 +++++++ .../filetests/isa/s390x/vec-logical.clif | 675 +++ .../filetests/isa/s390x/vec-permute.clif | 493 ++ .../filetests/isa/s390x/vec-shift-rotate.clif | 427 ++ .../filetests/isa/s390x/vecmem-arch13.clif | 375 ++ .../filetests/filetests/isa/s390x/vecmem.clif | 463 ++ .../filetests/runtests/fmax-pseudo.clif | 1 + .../runtests/fmin-max-pseudo-vector.clif | 2 +- .../filetests/runtests/fmin-pseudo.clif | 1 + .../runtests/shifts-small-types.clif | 3 +- ...d-arithmetic-nondeterministic-aarch64.clif | 1 + .../filetests/runtests/simd-arithmetic.clif | 2 +- .../runtests/simd-bitselect-to-vselect.clif | 2 +- .../filetests/runtests/simd-bitselect.clif | 1 + .../filetests/runtests/simd-bitwise-run.clif | 1 + .../filetests/runtests/simd-bitwise.clif | 2 +- .../filetests/runtests/simd-comparison.clif | 2 +- .../filetests/runtests/simd-conversion.clif | 2 +- .../filetests/runtests/simd-extractlane.clif | 1 + .../filetests/runtests/simd-iabs.clif | 1 + .../filetests/runtests/simd-iaddpairwise.clif | 1 + .../filetests/runtests/simd-insertlane.clif | 3 +- .../filetests/runtests/simd-lane-access.clif | 2 +- .../filetests/runtests/simd-logical.clif | 2 +- .../filetests/runtests/simd-min-max.clif | 1 + .../filetests/runtests/simd-saddsat.clif | 1 + .../filetests/runtests/simd-shuffle.clif | 1 + .../filetests/runtests/simd-snarrow.clif | 1 + .../filetests/runtests/simd-splat.clif | 1 + .../runtests/simd-sqmulroundsat-aarch64.clif | 1 + .../runtests/simd-sqmulroundsat.clif | 1 + .../filetests/runtests/simd-ssubsat.clif | 1 + .../filetests/runtests/simd-swidenhigh.clif | 1 + .../filetests/runtests/simd-swidenlow.clif | 1 + .../filetests/runtests/simd-swizzle.clif | 1 + .../filetests/runtests/simd-uaddsat.clif | 1 + .../filetests/runtests/simd-unarrow.clif | 1 + .../filetests/runtests/simd-usubsat.clif | 1 + .../filetests/runtests/simd-uunarrow.clif | 1 + .../filetests/runtests/simd-uwidenhigh.clif | 1 + .../filetests/runtests/simd-uwidenlow.clif | 1 + .../filetests/runtests/simd-valltrue.clif | 1 + .../filetests/runtests/simd-vanytrue.clif | 1 + .../filetests/runtests/simd-vconst.clif | 2 +- .../filetests/runtests/simd-vhighbits.clif | 1 + 
.../filetests/runtests/simd-vselect.clif | 2 +- .../simd-wideningpairwisedotproducts.clif | 1 + .../filetests/runtests/simd_compare_zero.clif | 1 + 75 files changed, 17225 insertions(+), 1130 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/s390x/vec-arithmetic.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/vec-bitops.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/vec-bitwise.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/vec-constants.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/vec-conversions.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/vec-fcmp.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/vec-fp-arch13.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/vec-fp.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/vec-icmp.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/vec-lane-arch13.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/vec-lane.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/vec-logical.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/vec-permute.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/vec-shift-rotate.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/vecmem-arch13.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/vecmem.clif diff --git a/build.rs b/build.rs index 4918c3466e43..18e4c80e7ca4 100644 --- a/build.rs +++ b/build.rs @@ -171,9 +171,9 @@ fn write_testsuite_tests( fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { match strategy { "Cranelift" => match (testsuite, testname) { - // No simd support yet for s390x. - ("simd", _) if platform_is_s390x() => return true, - _ if platform_is_s390x() && testname.starts_with("simd") => return true, + // FIXME: These tests fail under qemu due to a qemu bug. 
+ (_, "simd_f32x4_pmin_pmax") if platform_is_s390x() => return true, + (_, "simd_f64x2_pmin_pmax") if platform_is_s390x() => return true, _ => {} }, _ => panic!("unrecognized strategy"), diff --git a/cranelift/codegen/src/data_value.rs b/cranelift/codegen/src/data_value.rs index 13aa23767499..e2b6d5aba395 100644 --- a/cranelift/codegen/src/data_value.rs +++ b/cranelift/codegen/src/data_value.rs @@ -89,7 +89,7 @@ impl DataValue { DataValue::I128(i) => dst[..16].copy_from_slice(&i.to_ne_bytes()[..]), DataValue::F32(f) => dst[..4].copy_from_slice(&f.bits().to_ne_bytes()[..]), DataValue::F64(f) => dst[..8].copy_from_slice(&f.bits().to_ne_bytes()[..]), - DataValue::V128(v) => dst[..16].copy_from_slice(&v[..]), + DataValue::V128(v) => dst[..16].copy_from_slice(&u128::from_le_bytes(*v).to_ne_bytes()), _ => unimplemented!(), }; } @@ -120,7 +120,7 @@ impl DataValue { DataValue::B(src[..size].iter().any(|&i| i != 0)) } _ if ty.is_vector() && ty.bytes() == 16 => { - DataValue::V128(src[..16].try_into().unwrap()) + DataValue::V128(u128::from_ne_bytes(src[..16].try_into().unwrap()).to_le_bytes()) } _ => unimplemented!(), } diff --git a/cranelift/codegen/src/isa/s390x/abi.rs b/cranelift/codegen/src/isa/s390x/abi.rs index 77dcc87e9421..52db56de435c 100644 --- a/cranelift/codegen/src/isa/s390x/abi.rs +++ b/cranelift/codegen/src/isa/s390x/abi.rs @@ -97,6 +97,10 @@ fn in_flt_reg(ty: Type) -> bool { } } +fn in_vec_reg(ty: Type) -> bool { + ty.is_vector() && ty.bits() == 128 +} + fn get_intreg_for_arg(idx: usize) -> Option { match idx { 0 => Some(regs::gpr(2)), @@ -118,6 +122,20 @@ fn get_fltreg_for_arg(idx: usize) -> Option { } } +fn get_vecreg_for_arg(idx: usize) -> Option { + match idx { + 0 => Some(regs::vr(24)), + 1 => Some(regs::vr(25)), + 2 => Some(regs::vr(26)), + 3 => Some(regs::vr(27)), + 4 => Some(regs::vr(28)), + 5 => Some(regs::vr(29)), + 6 => Some(regs::vr(30)), + 7 => Some(regs::vr(31)), + _ => None, + } +} + fn get_intreg_for_ret(idx: usize) -> Option { match idx { 0 => Some(regs::gpr(2)), @@ -140,6 +158,21 @@ fn get_fltreg_for_ret(idx: usize) -> Option { } } +fn get_vecreg_for_ret(idx: usize) -> Option { + match idx { + 0 => Some(regs::vr(24)), + // ABI extension to support multi-value returns: + 1 => Some(regs::vr(25)), + 2 => Some(regs::vr(26)), + 3 => Some(regs::vr(27)), + 4 => Some(regs::vr(28)), + 5 => Some(regs::vr(29)), + 6 => Some(regs::vr(30)), + 7 => Some(regs::vr(31)), + _ => None, + } +} + /// This is the limit for the size of argument and return-value areas on the /// stack. We place a reasonable limit here to avoid integer overflow issues /// with 32-bit arithmetic: for now, 128 MB. 
@@ -182,6 +215,7 @@ impl ABIMachineSpec for S390xMachineDeps { ) -> CodegenResult<(Vec, i64, Option)> { let mut next_gpr = 0; let mut next_fpr = 0; + let mut next_vr = 0; let mut next_stack: u64 = 0; let mut ret = vec![]; @@ -206,8 +240,8 @@ impl ABIMachineSpec for S390xMachineDeps { let intreg = in_int_reg(param.value_type); let fltreg = in_flt_reg(param.value_type); - debug_assert!(intreg || fltreg); - debug_assert!(!(intreg && fltreg)); + let vecreg = in_vec_reg(param.value_type); + debug_assert!(intreg as i32 + fltreg as i32 + vecreg as i32 == 1); let (next_reg, candidate) = if intreg { let candidate = match args_or_rets { @@ -215,12 +249,18 @@ impl ABIMachineSpec for S390xMachineDeps { ArgsOrRets::Rets => get_intreg_for_ret(next_gpr), }; (&mut next_gpr, candidate) - } else { + } else if fltreg { let candidate = match args_or_rets { ArgsOrRets::Args => get_fltreg_for_arg(next_fpr), ArgsOrRets::Rets => get_fltreg_for_ret(next_fpr), }; (&mut next_fpr, candidate) + } else { + let candidate = match args_or_rets { + ArgsOrRets::Args => get_vecreg_for_arg(next_vr), + ArgsOrRets::Rets => get_vecreg_for_ret(next_vr), + }; + (&mut next_vr, candidate) }; // In the Wasmtime ABI only the first return value can be in a register. @@ -252,7 +292,8 @@ impl ABIMachineSpec for S390xMachineDeps { // Align the stack slot. debug_assert!(slot_size.is_power_of_two()); - next_stack = align_to(next_stack, slot_size); + let slot_align = std::cmp::min(slot_size, 8); + next_stack = align_to(next_stack, slot_align); // If the type is actually of smaller size (and the argument // was not extended), it is passed right-aligned. @@ -477,6 +518,13 @@ impl ABIMachineSpec for S390xMachineDeps { RegClass::Float => clobbered_fpr.push(reg), } } + // We need to save the link register in non-leaf functions. + // FIXME: This should be included in the clobber list to begin with, + // but isn't because we have excluded call instructions via the + // is_included_in_clobbers callback. + if outgoing_args_size > 0 { + clobbered_gpr.push(Writable::from_reg(RealReg::from(gpr_preg(14)))); + } let mut first_clobbered_gpr = 16; for reg in clobbered_gpr { @@ -534,13 +582,15 @@ impl ABIMachineSpec for S390xMachineDeps { // Save FPRs. for (i, reg) in clobbered_fpr.iter().enumerate() { - insts.push(Inst::FpuStore64 { + insts.push(Inst::VecStoreLane { + size: 64, rd: reg.to_reg().into(), mem: MemArg::reg_plus_off( stack_reg(), (i * 8) as i64 + outgoing_args_size as i64 + fixed_frame_storage_size as i64, MemFlags::trusted(), ), + lane_imm: 0, }); if flags.unwind_info() { insts.push(Inst::Unwind { @@ -566,7 +616,14 @@ impl ABIMachineSpec for S390xMachineDeps { let mut insts = SmallVec::new(); // Collect clobbered registers. - let (clobbered_gpr, clobbered_fpr) = get_regs_saved_in_prologue(call_conv, clobbers); + let (mut clobbered_gpr, clobbered_fpr) = get_regs_saved_in_prologue(call_conv, clobbers); + // We need to restore the link register in non-leaf functions. + // FIXME: This should be included in the clobber list to begin with, + // but isn't because we have excluded call instructions via the + // is_included_in_clobbers callback. + if outgoing_args_size > 0 { + clobbered_gpr.push(Writable::from_reg(RealReg::from(gpr_preg(14)))); + } let mut first_clobbered_gpr = 16; for reg in clobbered_gpr { let enc = reg.to_reg().hw_enc(); @@ -578,13 +635,15 @@ impl ABIMachineSpec for S390xMachineDeps { // Restore FPRs.
for (i, reg) in clobbered_fpr.iter().enumerate() { - insts.push(Inst::FpuLoad64 { + insts.push(Inst::VecLoadLaneUndef { + size: 64, rd: Writable::from_reg(reg.to_reg().into()), mem: MemArg::reg_plus_off( stack_reg(), (i * 8) as i64 + outgoing_args_size as i64 + fixed_frame_storage_size as i64, MemFlags::trusted(), ), + lane_imm: 0, }); } @@ -639,7 +698,7 @@ impl ABIMachineSpec for S390xMachineDeps { // We allocate in terms of 8-byte slots. match rc { RegClass::Int => 1, - RegClass::Float => 1, + RegClass::Float => 2, } } @@ -739,6 +798,21 @@ const fn clobbers() -> PRegSet { .with(gpr_preg(3)) .with(gpr_preg(4)) .with(gpr_preg(5)) + // v0 - v7 inclusive and v16 - v31 inclusive are + // caller-saves. The upper 64 bits of v8 - v15 inclusive are + // also caller-saves. However, because we cannot currently + // represent partial registers to regalloc2, we indicate here + // that every vector register is caller-save. Because this + // function is used at *callsites*, approximating in this + // direction (save more than necessary) is conservative and + // thus safe. + // + // Note that we exclude clobbers from a call instruction when + // a call instruction's callee has the same ABI as the caller + // (the current function body); this is safe (anything + // clobbered by callee can be clobbered by caller as well) and + // avoids unnecessary saves of v8-v15 in the prologue even + // though we include them as defs here. .with(vr_preg(0)) .with(vr_preg(1)) .with(vr_preg(2)) @@ -747,6 +821,14 @@ const fn clobbers() -> PRegSet { .with(vr_preg(5)) .with(vr_preg(6)) .with(vr_preg(7)) + .with(vr_preg(8)) + .with(vr_preg(9)) + .with(vr_preg(10)) + .with(vr_preg(11)) + .with(vr_preg(12)) + .with(vr_preg(13)) + .with(vr_preg(14)) + .with(vr_preg(15)) .with(vr_preg(16)) .with(vr_preg(17)) .with(vr_preg(18)) diff --git a/cranelift/codegen/src/isa/s390x/inst.isle b/cranelift/codegen/src/isa/s390x/inst.isle index 00868224c544..38c90e4711da 100644 --- a/cranelift/codegen/src/isa/s390x/inst.isle +++ b/cranelift/codegen/src/isa/s390x/inst.isle @@ -467,26 +467,6 @@ (cond Cond) (rm Reg)) - ;; A 32-bit move instruction from GPR to FPR or vector element. - (MovToFpr32 - (rd WritableReg) - (rn Reg)) - - ;; A 64-bit move instruction from GPR to FPR or vector element. - (MovToFpr64 - (rd WritableReg) - (rn Reg)) - - ;; A 32-bit move instruction from FPR or vector element to GPR. - (MovFromFpr32 - (rd WritableReg) - (rn Reg)) - - ;; A 64-bit move instruction from FPR or vector element to GPR. - (MovFromFpr64 - (rd WritableReg) - (rn Reg)) - ;; 1-op FPU instruction implemented as vector instruction with the W bit. (FpuRR (fpu_op FPUOp1) @@ -508,6 +488,13 @@ (rm Reg) (ra Reg)) + ;; 1-op FPU instruction with rounding mode. + (FpuRound + (op FpuRoundOp) + (mode FpuRoundMode) + (rd WritableReg) + (rn Reg)) + ;; FPU comparison, single-precision (32 bit). (FpuCmp32 (rn Reg) @@ -518,69 +505,255 @@ (rn Reg) (rm Reg)) - ;; Floating-point load, single-precision (32 bit). - (FpuLoad32 + ;; Load floating-point constant, single-precision (32 bit). + (LoadFpuConst32 (rd WritableReg) - (mem MemArg)) + (const_data u32)) - ;; Floating-point store, single-precision (32 bit). - (FpuStore32 - (rd Reg) - (mem MemArg)) + ;; Load floating-point constant, double-precision (64 bit). + (LoadFpuConst64 + (rd WritableReg) + (const_data u64)) - ;; Floating-point load, double-precision (64 bit). - (FpuLoad64 + ;; A binary vector operation with two vector register sources. 
+ (VecRRR + (op VecBinaryOp) (rd WritableReg) - (mem MemArg)) + (rn Reg) + (rm Reg)) - ;; Floating-point store, double-precision (64 bit). - (FpuStore64 - (rd Reg) + ;; A unary vector operation with a vector register source. + (VecRR + (op VecUnaryOp) + (rd WritableReg) + (rn Reg)) + + ;; Vector shift instruction with a register source, a register destination, + ;; and an immediate plus an optional register as shift count. + (VecShiftRR + (shift_op VecShiftOp) + (rd WritableReg) + (rn Reg) + (shift_imm u8) + (shift_reg Reg)) + + ;; Vector select instruction. + (VecSelect + (rd WritableReg) + (rn Reg) + (rm Reg) + (ra Reg)) + + ;; Vector permute instruction. + (VecPermute + (rd WritableReg) + (rn Reg) + (rm Reg) + (ra Reg)) + + ;; Vector permute doubleword immediate instruction. + (VecPermuteDWImm + (rd WritableReg) + (rn Reg) + (rm Reg) + (idx1 u8) + (idx2 u8)) + + ;; Vector integer comparison with two register sources and a register + ;; destination. + (VecIntCmp + (op VecIntCmpOp) + (rd WritableReg) + (rn Reg) + (rm Reg)) + + ;; Same, but also set the condition code. + (VecIntCmpS + (op VecIntCmpOp) + (rd WritableReg) + (rn Reg) + (rm Reg)) + + ;; Vector floating-point comparison with two register sources and a register + ;; destination. + (VecFloatCmp + (op VecFloatCmpOp) + (rd WritableReg) + (rn Reg) + (rm Reg)) + + ;; Same, but also set the condition code. + (VecFloatCmpS + (op VecFloatCmpOp) + (rd WritableReg) + (rn Reg) + (rm Reg)) + + ;; 128-bit vector load instruction. + (VecLoad + (rd WritableReg) (mem MemArg)) - ;; Floating-point byte-reversed load, single-precision (32 bit). - (FpuLoadRev32 + ;; 128-bit byte-reversed vector load instruction. + (VecLoadRev (rd WritableReg) (mem MemArg)) - ;; Floating-point byte-reversed store, single-precision (32 bit). - (FpuStoreRev32 + ;; 128-bit vector store instruction. + (VecStore + (rd Reg) + (mem MemArg)) + + ;; 128-bit byte-reversed vector store instruction. + (VecStoreRev (rd Reg) (mem MemArg)) - ;; Floating-point byte-reversed load, double-precision (64 bit). - (FpuLoadRev64 + ;; 128-bit vector load replicated element instruction. + (VecLoadReplicate + (size u32) (rd WritableReg) (mem MemArg)) - ;; Floating-point byte-reversed store, double-precision (64 bit). - (FpuStoreRev64 - (rd Reg) + ;; 128-bit byte-reversed vector load replicated element instruction. + (VecLoadReplicateRev + (size u32) + (rd WritableReg) (mem MemArg)) - ;; Load floating-point constant, single-precision (32 bit). - (LoadFpuConst32 + ;; Vector move instruction. + (VecMov (rd WritableReg) - (const_data u32)) + (rn Reg)) - ;; Load floating-point constant, double-precision (64 bit). - (LoadFpuConst64 + ;; Conditional vector move instruction. + (VecCMov + (rd WritableReg) + (cond Cond) + (rm Reg)) + + ;; A 128-bit move instruction from two GPRs to a VR. + (MovToVec128 + (rd WritableReg) + (rn Reg) + (rm Reg)) + + ;; Load 128-bit (big-endian) vector constant. + (VecLoadConst + (rd WritableReg) + (const_data u128)) + + ;; Load 128-bit (big-endian) replicated vector constant. + (VecLoadConstReplicate + (size u32) (rd WritableReg) (const_data u64)) - ;; 1-op FPU instruction with rounding mode. - (FpuRound - (op FpuRoundOp) - (mode FpuRoundMode) + ;; Load vector immediate generated via byte mask. + (VecImmByteMask (rd WritableReg) - (rn Reg)) + (mask u16)) - ;; Vector select instruction. - (VecSelect + ;; Load vector replicated contiguous bit mask. + (VecImmBitMask + (size u32) + (rd WritableReg) + (start_bit u8) + (end_bit u8)) + + ;; Load vector replicated immediate. 
+ (VecImmReplicate + (size u32) + (rd WritableReg) + (imm i16)) + + ;; Vector lane insertion with an in/out VR, a memory source, + ;; and an immediate as lane index. + (VecLoadLane + (size u32) + (rd WritableReg) + (mem MemArg) + (lane_imm u8)) + + ;; Same as VecLoadLane, but allow undefined input VR. + (VecLoadLaneUndef + (size u32) + (rd WritableReg) + (mem MemArg) + (lane_imm u8)) + + ;; Byte-reversed vector lane insertion with an in/out VR, a memory source, + ;; and an immediate as lane index. + (VecLoadLaneRev + (size u32) + (rd WritableReg) + (mem MemArg) + (lane_imm u8)) + + ;; Same as VecLoadLaneRev, but allow undefined input VR. + (VecLoadLaneRevUndef + (size u32) + (rd WritableReg) + (mem MemArg) + (lane_imm u8)) + + ;; Vector lane extraction with a memory destination, a VR source, + ;; and an immediate as lane index. + (VecStoreLane + (size u32) + (rd Reg) + (mem MemArg) + (lane_imm u8)) + + ;; Byte-reversed vector lane extraction with a memory destination, a VR source, + ;; and an immediate as lane index. + (VecStoreLaneRev + (size u32) + (rd Reg) + (mem MemArg) + (lane_imm u8)) + + ;; Vector lane insertion with an in/out VR, a GPR source, + ;; and an immediate plus an optional register as lane index. + (VecInsertLane + (size u32) (rd WritableReg) (rn Reg) - (rm Reg) - (ra Reg)) + (lane_imm u8) + (lane_reg Reg)) + + ;; Same as VecInsertLane, but allow undefined input VR. + (VecInsertLaneUndef + (size u32) + (rd WritableReg) + (rn Reg) + (lane_imm u8) + (lane_reg Reg)) + + ;; Vector lane extraction with a VR source, a GPR destination, + ;; and an immediate plus an optional register as lane index. + (VecExtractLane + (size u32) + (rd WritableReg) + (rn Reg) + (lane_imm u8) + (lane_reg Reg)) + + ;; Vector lane insertion with an in/out VR, an immediate source, + ;; and an immediate as lane index. + (VecInsertLaneImm + (size u32) + (rd WritableReg) + (imm i16) + (lane_imm u8)) + + ;; Vector lane replication with a VR source, a VR destination, + ;; and an immediate as lane index. + (VecReplicateLane + (size u32) + (rd WritableReg) + (rn Reg) + (lane_imm u8)) ;; A machine call instruction. (Call @@ -807,18 +980,208 @@ (CmpL64Ext32) )) +;; A binary vector operation. 
+(type VecBinaryOp + (enum + ;; Addition and subtraction + (Add8x16) + (Add16x8) + (Add32x4) + (Add64x2) + (Sub8x16) + (Sub16x8) + (Sub32x4) + (Sub64x2) + ;; Multiplication (64-bit not supported) + (Mul8x16) + (Mul16x8) + (Mul32x4) + (UMulHi8x16) + (UMulHi16x8) + (UMulHi32x4) + (SMulHi8x16) + (SMulHi16x8) + (SMulHi32x4) + (UMulEven8x16) + (UMulEven16x8) + (UMulEven32x4) + (SMulEven8x16) + (SMulEven16x8) + (SMulEven32x4) + (UMulOdd8x16) + (UMulOdd16x8) + (UMulOdd32x4) + (SMulOdd8x16) + (SMulOdd16x8) + (SMulOdd32x4) + ;; Minimum, maximum, and average + (UMax8x16) + (UMax16x8) + (UMax32x4) + (UMax64x2) + (SMax8x16) + (SMax16x8) + (SMax32x4) + (SMax64x2) + (UMin8x16) + (UMin16x8) + (UMin32x4) + (UMin64x2) + (SMin8x16) + (SMin16x8) + (SMin32x4) + (SMin64x2) + (UAvg8x16) + (UAvg16x8) + (UAvg32x4) + (UAvg64x2) + (SAvg8x16) + (SAvg16x8) + (SAvg32x4) + (SAvg64x2) + ;; Bitwise operations + (And128) + (Orr128) + (Xor128) + (NotAnd128) + (NotOrr128) + (NotXor128) + (AndNot128) + (OrrNot128) + ;; Bit permute + (BitPermute128) + ;; Full vector shift operations + (LShLByByte128) + (LShRByByte128) + (AShRByByte128) + (LShLByBit128) + (LShRByBit128) + (AShRByBit128) + ;; Pack + (Pack16x8) + (Pack32x4) + (Pack64x2) + ;; Pack saturate (unsigned) + (PackUSat16x8) + (PackUSat32x4) + (PackUSat64x2) + ;; Pack saturate (signed) + (PackSSat16x8) + (PackSSat32x4) + (PackSSat64x2) + ;; Merge + (MergeLow8x16) + (MergeLow16x8) + (MergeLow32x4) + (MergeLow64x2) + (MergeHigh8x16) + (MergeHigh16x8) + (MergeHigh32x4) + (MergeHigh64x2) +)) + +;; A vector unary operation. +(type VecUnaryOp + (enum + ;; Sign operations + (Abs8x16) + (Abs16x8) + (Abs32x4) + (Abs64x2) + (Neg8x16) + (Neg16x8) + (Neg32x4) + (Neg64x2) + ;; Population count + (Popcnt8x16) + (Popcnt16x8) + (Popcnt32x4) + (Popcnt64x2) + ;; Unpack + (UnpackULow8x16) + (UnpackULow16x8) + (UnpackULow32x4) + (UnpackUHigh8x16) + (UnpackUHigh16x8) + (UnpackUHigh32x4) + (UnpackSLow8x16) + (UnpackSLow16x8) + (UnpackSLow32x4) + (UnpackSHigh8x16) + (UnpackSHigh16x8) + (UnpackSHigh32x4) +)) + +;; A vector shift operation. +(type VecShiftOp + (enum + (RotL8x16) + (RotL16x8) + (RotL32x4) + (RotL64x2) + (LShL8x16) + (LShL16x8) + (LShL32x4) + (LShL64x2) + (LShR8x16) + (LShR16x8) + (LShR32x4) + (LShR64x2) + (AShR8x16) + (AShR16x8) + (AShR32x4) + (AShR64x2) +)) + +;; An integer vector comparison operation. +(type VecIntCmpOp + (enum + (CmpEq8x16) + (CmpEq16x8) + (CmpEq32x4) + (CmpEq64x2) + (SCmpHi8x16) + (SCmpHi16x8) + (SCmpHi32x4) + (SCmpHi64x2) + (UCmpHi8x16) + (UCmpHi16x8) + (UCmpHi32x4) + (UCmpHi64x2) +)) + +;; A floating-point vector comparison operation. +(type VecFloatCmpOp + (enum + (CmpEq32x4) + (CmpEq64x2) + (CmpHi32x4) + (CmpHi64x2) + (CmpHiEq32x4) + (CmpHiEq64x2) +)) + ;; A floating-point unit (FPU) operation with one arg. (type FPUOp1 (enum (Abs32) (Abs64) + (Abs32x4) + (Abs64x2) (Neg32) (Neg64) + (Neg32x4) + (Neg64x2) (NegAbs32) (NegAbs64) + (NegAbs32x4) + (NegAbs64x2) (Sqrt32) (Sqrt64) + (Sqrt32x4) + (Sqrt64x2) (Cvt32To64) + (Cvt32x4To64x2) )) ;; A floating-point unit (FPU) operation with two args. @@ -826,16 +1189,36 @@ (enum (Add32) (Add64) + (Add32x4) + (Add64x2) (Sub32) (Sub64) + (Sub32x4) + (Sub64x2) (Mul32) (Mul64) + (Mul32x4) + (Mul64x2) (Div32) (Div64) + (Div32x4) + (Div64x2) (Max32) (Max64) + (Max32x4) + (Max64x2) (Min32) (Min64) + (Min32x4) + (Min64x2) + (MaxPseudo32) + (MaxPseudo64) + (MaxPseudo32x4) + (MaxPseudo64x2) + (MinPseudo32) + (MinPseudo64) + (MinPseudo32x4) + (MinPseudo64x2) )) ;; A floating-point unit (FPU) operation with three args.
@@ -843,24 +1226,39 @@ (enum (MAdd32) (MAdd64) + (MAdd32x4) + (MAdd64x2) (MSub32) (MSub64) + (MSub32x4) + (MSub64x2) )) ;; A floating-point unit (FPU) operation with one arg, and rounding mode. (type FpuRoundOp (enum (Cvt64To32) + (Cvt64x2To32x4) (Round32) (Round64) + (Round32x4) + (Round64x2) (ToSInt32) (ToSInt64) (ToUInt32) (ToUInt64) + (ToSInt32x4) + (ToSInt64x2) + (ToUInt32x4) + (ToUInt64x2) (FromSInt32) (FromSInt64) (FromUInt32) (FromUInt64) + (FromSInt32x4) + (FromSInt64x2) + (FromUInt32x4) + (FromUInt64x2) )) ;; Rounding modes for floating-point ops. @@ -949,6 +1347,55 @@ (decl u64_as_i16 (u64) i16) (extern constructor u64_as_i16 u64_as_i16) +;; Construct and extract immediate vector constants. + +(decl u64_pair (u64 u64) u128) +(extern constructor u64_pair u64_pair_concat) +(extern extractor infallible u64_pair u64_pair_split) + +(decl u32_pair (u32 u32) u64) +(extern constructor u32_pair u32_pair_concat) +(extern extractor infallible u32_pair u32_pair_split) + +(decl u16_pair (u16 u16) u32) +(extern constructor u16_pair u16_pair_concat) +(extern extractor infallible u16_pair u16_pair_split) + +(decl u8_pair (u8 u8) u16) +(extern constructor u8_pair u8_pair_concat) +(extern extractor infallible u8_pair u8_pair_split) + +(decl imm8x16 (u8 u8 u8 u8 u8 u8 u8 u8 u8 u8 u8 u8 u8 u8 u8 u8) u128) +(extractor (imm8x16 a b c d e f g h i j k l m n o p) + (u64_pair (u32_pair (u16_pair (u8_pair a b) (u8_pair c d)) + (u16_pair (u8_pair e f) (u8_pair g h))) + (u32_pair (u16_pair (u8_pair i j) (u8_pair k l)) + (u16_pair (u8_pair m n) (u8_pair o p))))) +(rule (imm8x16 a b c d e f g h i j k l m n o p) + (u64_pair (u32_pair (u16_pair (u8_pair a b) (u8_pair c d)) + (u16_pair (u8_pair e f) (u8_pair g h))) + (u32_pair (u16_pair (u8_pair i j) (u8_pair k l)) + (u16_pair (u8_pair m n) (u8_pair o p))))) + +;; Convert a little-endian lane index to a big-endian lane index. + +(decl be_lane_idx (Type u8) u8) +(extern constructor be_lane_idx be_lane_idx) + +;; Construct a VGBM mask to set all bits in one lane of a vector. + +(decl lane_byte_mask (Type u8) u16) +(extern constructor lane_byte_mask lane_byte_mask) + +;; Extract "permute" and "and" masks from a shuffle constant + +(decl shuffle_mask_from_u128 (u128 u16) u128) +(extern extractor infallible shuffle_mask_from_u128 shuffle_mask_from_u128) + +(decl shuffle_mask (u128 u16) Immediate) +(extractor (shuffle_mask permute_mask and_mask) + (u128_from_immediate (shuffle_mask_from_u128 permute_mask and_mask))) + ;; Split an u64 into high and low parts. 
(decl u64_nonzero_hipart (u64) u64) @@ -965,6 +1412,9 @@ (decl i16_from_u64 (i16) u64) (extern extractor i16_from_u64 i16_from_u64) +(decl i16_from_u32 (i16) u32) +(extern extractor i16_from_u32 i16_from_u32) + (decl uimm32shifted_from_u64 (UImm32Shifted) u64) (extern extractor uimm32shifted_from_u64 uimm32shifted_from_u64) @@ -985,6 +1435,9 @@ (decl u64_from_signed_value (u64) Value) (extern extractor u64_from_signed_value u64_from_signed_value) +(decl u64_from_inverted_value (u64) Value) +(extern extractor u64_from_inverted_value u64_from_inverted_value) + (decl i64_from_value (i64) Value) (extern extractor i64_from_value i64_from_value) @@ -1097,10 +1550,10 @@ (type MemArg extern (enum)) -(decl memarg_reg_plus_reg (Reg Reg MemFlags) MemArg) +(decl memarg_reg_plus_reg (Reg Reg u8 MemFlags) MemArg) (extern constructor memarg_reg_plus_reg memarg_reg_plus_reg) -(decl memarg_reg_plus_off (Reg i64 MemFlags) MemArg) +(decl memarg_reg_plus_off (Reg i64 u8 MemFlags) MemArg) (extern constructor memarg_reg_plus_off memarg_reg_plus_off) (decl memarg_symbol (ExternalName i32 MemFlags) MemArg) @@ -1126,10 +1579,10 @@ (decl lower_address (MemFlags Value Offset32) MemArg) (rule (lower_address flags addr (i64_from_offset offset)) - (memarg_reg_plus_off addr offset flags)) + (memarg_reg_plus_off addr offset 0 flags)) (rule (lower_address flags (iadd x y) (i64_from_offset 0)) - (memarg_reg_plus_reg x y flags)) + (memarg_reg_plus_reg x y 0 flags)) (rule (lower_address flags (symbol_value (symbol_value_data name (reloc_distance_near) sym_offset)) @@ -1138,6 +1591,17 @@ (memarg_symbol name final_offset flags)) +;; Lower an address plus a small bias into a `MemArg`. + +(decl lower_address_bias (MemFlags Value Offset32 u8) MemArg) + +(rule (lower_address_bias flags addr (i64_from_offset offset) bias) + (memarg_reg_plus_off addr offset bias flags)) + +(rule (lower_address_bias flags (iadd x y) (i64_from_offset 0) bias) + (memarg_reg_plus_reg x y bias flags)) + + ;; Test whether a `load` address will be lowered to a `MemArg::Symbol`. (decl pure load_sym (Inst) Inst) @@ -1206,6 +1670,11 @@ (extractor (sinkable_load_16 inst) (and (value_type $I16) (sinkable_load inst))) +;; Sinkable little-endian load instruction. +(decl sinkable_load_little (Inst) Value) +(extractor (sinkable_load_little inst) + (sinkable_inst (and inst (load (littleendian) _addr _offset)))) + ;; Sinkable big-endian sload16 instruction. (decl sinkable_sload16 (Inst) Value) (extractor (sinkable_sload16 inst) @@ -1615,87 +2084,225 @@ (_ Unit (emit (MInst.FpuRound op mode dst src)))) dst)) -;; Helper for emitting `MInst.MovToFpr32` instructions. -(decl mov_to_fpr32 (Reg) Reg) -(rule (mov_to_fpr32 src) - (let ((dst WritableReg (temp_writable_reg $F32)) - (_ Unit (emit (MInst.MovToFpr32 dst src)))) +;; Helper for emitting `MInst.VecRRR` instructions. +(decl vec_rrr (Type VecBinaryOp Reg Reg) Reg) +(rule (vec_rrr ty op src1 src2) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecRRR op dst src1 src2)))) dst)) -;; Helper for emitting `MInst.MovToFpr64` instructions. -(decl mov_to_fpr64 (Reg) Reg) -(rule (mov_to_fpr64 src) - (let ((dst WritableReg (temp_writable_reg $F64)) - (_ Unit (emit (MInst.MovToFpr64 dst src)))) +;; Helper for emitting `MInst.VecRR` instructions. +(decl vec_rr (Type VecUnaryOp Reg) Reg) +(rule (vec_rr ty op src) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecRR op dst src)))) dst)) -;; Helper for emitting `MInst.MovFromFpr32` instructions. 
-(decl mov_from_fpr32 (Reg) Reg) -(rule (mov_from_fpr32 src) - (let ((dst WritableReg (temp_writable_reg $I32)) - (_ Unit (emit (MInst.MovFromFpr32 dst src)))) +;; Helper for emitting `MInst.VecShiftRR` instructions. +(decl vec_shift_rr (Type VecShiftOp Reg u8 Reg) Reg) +(rule (vec_shift_rr ty op src shift_imm shift_reg) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecShiftRR op dst src shift_imm shift_reg)))) dst)) -;; Helper for emitting `MInst.MovFromFpr64` instructions. -(decl mov_from_fpr64 (Reg) Reg) -(rule (mov_from_fpr64 src) - (let ((dst WritableReg (temp_writable_reg $I64)) - (_ Unit (emit (MInst.MovFromFpr64 dst src)))) +;; Helper for emitting `MInst.VecSelect` instructions. +(decl vec_select (Type Reg Reg Reg) Reg) +(rule (vec_select ty src1 src2 src3) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecSelect dst src1 src2 src3)))) + dst)) + +;; Helper for emitting `MInst.VecPermute` instructions. +(decl vec_permute (Type Reg Reg Reg) Reg) +(rule (vec_permute ty src1 src2 src3) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecPermute dst src1 src2 src3)))) + dst)) + +;; Helper for emitting `MInst.VecPermuteDWImm` instructions. +(decl vec_permute_dw_imm (Type Reg u8 Reg u8) Reg) +(rule (vec_permute_dw_imm ty src1 idx1 src2 idx2) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecPermuteDWImm dst src1 src2 idx1 idx2)))) dst)) -;; Helper for emitting `MInst.FpuLoad32` instructions. -(decl fpu_load32 (MemArg) Reg) -(rule (fpu_load32 addr) - (let ((dst WritableReg (temp_writable_reg $F32)) - (_ Unit (emit (MInst.FpuLoad32 dst addr)))) +;; Helper for emitting `MInst.VecIntCmp` instructions. +(decl vec_int_cmp (Type VecIntCmpOp Reg Reg) Reg) +(rule (vec_int_cmp ty op src1 src2) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecIntCmp op dst src1 src2)))) dst)) -;; Helper for emitting `MInst.FpuLoad64` instructions. -(decl fpu_load64 (MemArg) Reg) -(rule (fpu_load64 addr) - (let ((dst WritableReg (temp_writable_reg $F64)) - (_ Unit (emit (MInst.FpuLoad64 dst addr)))) +;; Helper for emitting `MInst.VecIntCmpS` instructions. +(decl vec_int_cmps (Type VecIntCmpOp Reg Reg) ProducesFlags) +(rule (vec_int_cmps ty op src1 src2) + (let ((tmp WritableReg (temp_writable_reg ty))) + (ProducesFlags.ProducesFlagsSideEffect (MInst.VecIntCmpS op tmp src1 src2)))) + +;; Helper for emitting `MInst.VecFloatCmp` instructions. +(decl vec_float_cmp (Type VecFloatCmpOp Reg Reg) Reg) +(rule (vec_float_cmp ty op src1 src2) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecFloatCmp op dst src1 src2)))) dst)) -;; Helper for emitting `MInst.FpuLoadRev32` instructions. -(decl fpu_loadrev32 (MemArg) Reg) -(rule (fpu_loadrev32 addr) - (let ((dst WritableReg (temp_writable_reg $F32)) - (_ Unit (emit (MInst.FpuLoadRev32 dst addr)))) +;; Helper for emitting `MInst.VecFloatCmpS` instructions. +(decl vec_float_cmps (Type VecFloatCmpOp Reg Reg) ProducesFlags) +(rule (vec_float_cmps ty op src1 src2) + (let ((tmp WritableReg (temp_writable_reg ty))) + (ProducesFlags.ProducesFlagsSideEffect (MInst.VecFloatCmpS op tmp src1 src2)))) + +;; Helper for emitting `MInst.VecLoad` instructions. +(decl vec_load (Type MemArg) Reg) +(rule (vec_load ty addr) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecLoad dst addr)))) dst)) -;; Helper for emitting `MInst.FpuLoadRev64` instructions. 
-(decl fpu_loadrev64 (MemArg) Reg) -(rule (fpu_loadrev64 addr) - (let ((dst WritableReg (temp_writable_reg $F64)) - (_ Unit (emit (MInst.FpuLoadRev64 dst addr)))) +;; Helper for emitting `MInst.VecLoadRev` instructions. +(decl vec_loadrev (Type MemArg) Reg) +(rule (vec_loadrev ty addr) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecLoadRev dst addr)))) dst)) -;; Helper for emitting `MInst.FpuStore32` instructions. -(decl fpu_store32 (Reg MemArg) SideEffectNoResult) -(rule (fpu_store32 src addr) - (SideEffectNoResult.Inst (MInst.FpuStore32 src addr))) +;; Helper for emitting `MInst.VecStore` instructions. +(decl vec_store (Reg MemArg) SideEffectNoResult) +(rule (vec_store src addr) + (SideEffectNoResult.Inst (MInst.VecStore src addr))) -;; Helper for emitting `MInst.FpuStore64` instructions. -(decl fpu_store64 (Reg MemArg) SideEffectNoResult) -(rule (fpu_store64 src addr) - (SideEffectNoResult.Inst (MInst.FpuStore64 src addr))) +;; Helper for emitting `MInst.VecStoreRev` instructions. +(decl vec_storerev (Reg MemArg) SideEffectNoResult) +(rule (vec_storerev src addr) + (SideEffectNoResult.Inst (MInst.VecStoreRev src addr))) -;; Helper for emitting `MInst.FpuStoreRev32` instructions. -(decl fpu_storerev32 (Reg MemArg) SideEffectNoResult) -(rule (fpu_storerev32 src addr) - (SideEffectNoResult.Inst (MInst.FpuStoreRev32 src addr))) +;; Helper for emitting `MInst.VecLoadReplicate` instructions. +(decl vec_load_replicate (Type MemArg) Reg) +(rule (vec_load_replicate (ty_vec128 ty @ (multi_lane size _)) addr) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecLoadReplicate size dst addr)))) + dst)) -;; Helper for emitting `MInst.FpuStoreRev64` instructions. -(decl fpu_storerev64 (Reg MemArg) SideEffectNoResult) -(rule (fpu_storerev64 src addr) - (SideEffectNoResult.Inst (MInst.FpuStoreRev64 src addr))) +;; Helper for emitting `MInst.VecLoadReplicateRev` instructions. +(decl vec_load_replicate_rev (Type MemArg) Reg) +(rule (vec_load_replicate_rev (ty_vec128 ty @ (multi_lane size _)) addr) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecLoadReplicateRev size dst addr)))) + dst)) -;; Helper for emitting `MInst.VecSelect` instructions. -(decl vec_select (Type Reg Reg Reg) Reg) -(rule (vec_select ty src1 src2 src3) +;; Helper for emitting `MInst.MovToVec128` instructions. +(decl mov_to_vec128 (Type Reg Reg) Reg) +(rule (mov_to_vec128 ty src1 src2) (let ((dst WritableReg (temp_writable_reg ty)) - (_ Unit (emit (MInst.VecSelect dst src1 src2 src3)))) + (_ Unit (emit (MInst.MovToVec128 dst src1 src2)))) + dst)) + +;; Helper for emitting `MInst.VecLoadConst` instructions. +(decl vec_load_const (Type u128) Reg) +(rule (vec_load_const (ty_vec128 ty) n) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecLoadConst dst n)))) + dst)) + +;; Helper for emitting `MInst.VecLoadConstReplicate` instructions. +(decl vec_load_const_replicate (Type u64) Reg) +(rule (vec_load_const_replicate ty @ (multi_lane size _) n) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecLoadConstReplicate size dst n)))) + dst)) + +;; Helper for emitting `MInst.VecImmByteMask` instructions. +(decl vec_imm_byte_mask (Type u16) Reg) +(rule (vec_imm_byte_mask (ty_vec128 ty) n) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecImmByteMask dst n)))) + dst)) + +;; Helper for emitting `MInst.VecImmBitMask` instructions. 
+(decl vec_imm_bit_mask (Type u8 u8) Reg) +(rule (vec_imm_bit_mask (ty_vec128 ty @ (multi_lane size _)) start_bit end_bit) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecImmBitMask size dst start_bit end_bit)))) + dst)) + +;; Helper for emitting `MInst.VecImmReplicate` instructions. +(decl vec_imm_replicate (Type i16) Reg) +(rule (vec_imm_replicate (ty_vec128 ty @ (multi_lane size _)) n) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecImmReplicate size dst n)))) + dst)) + +;; Helper for emitting `MInst.VecLoadLane` instructions. +(decl vec_load_lane (Type Reg MemArg u8) Reg) +(rule (vec_load_lane ty @ (multi_lane size _) src addr lane_imm) + (let ((dst WritableReg (copy_writable_reg ty src)) + (_ Unit (emit (MInst.VecLoadLane size dst addr lane_imm)))) + dst)) + +;; Helper for emitting `MInst.VecLoadLaneUndef` instructions. +(decl vec_load_lane_undef (Type MemArg u8) Reg) +(rule (vec_load_lane_undef ty @ (multi_lane size _) addr lane_imm) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecLoadLaneUndef size dst addr lane_imm)))) + dst)) + +;; Helper for emitting `MInst.VecLoadLaneRev` instructions. +(decl vec_load_lane_rev (Type Reg MemArg u8) Reg) +(rule (vec_load_lane_rev ty @ (multi_lane size _) src addr lane_imm) + (let ((dst WritableReg (copy_writable_reg ty src)) + (_ Unit (emit (MInst.VecLoadLaneRev size dst addr lane_imm)))) + dst)) + +;; Helper for emitting `MInst.VecLoadLaneRevUndef` instructions. +(decl vec_load_lane_rev_undef (Type MemArg u8) Reg) +(rule (vec_load_lane_rev_undef ty @ (multi_lane size _) addr lane_imm) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecLoadLaneRevUndef size dst addr lane_imm)))) + dst)) + +;; Helper for emitting `MInst.VecStoreLane` instructions. +(decl vec_store_lane (Type Reg MemArg u8) SideEffectNoResult) +(rule (vec_store_lane ty @ (multi_lane size _) src addr lane_imm) + (SideEffectNoResult.Inst (MInst.VecStoreLane size src addr lane_imm))) + +;; Helper for emitting `MInst.VecStoreLaneRev` instructions. +(decl vec_store_lane_rev (Type Reg MemArg u8) SideEffectNoResult) +(rule (vec_store_lane_rev ty @ (multi_lane size _) src addr lane_imm) + (SideEffectNoResult.Inst (MInst.VecStoreLaneRev size src addr lane_imm))) + +;; Helper for emitting `MInst.VecInsertLane` instructions. +(decl vec_insert_lane (Type Reg Reg u8 Reg) Reg) +(rule (vec_insert_lane ty @ (multi_lane size _) src1 src2 lane_imm lane_reg) + (let ((dst WritableReg (copy_writable_reg ty src1)) + (_ Unit (emit (MInst.VecInsertLane size dst src2 lane_imm lane_reg)))) + dst)) + +;; Helper for emitting `MInst.VecInsertLaneUndef` instructions. +(decl vec_insert_lane_undef (Type Reg u8 Reg) Reg) +(rule (vec_insert_lane_undef ty @ (multi_lane size _) src lane_imm lane_reg) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecInsertLaneUndef size dst src lane_imm lane_reg)))) + dst)) + +;; Helper for emitting `MInst.VecExtractLane` instructions. +(decl vec_extract_lane (Type Reg u8 Reg) Reg) +(rule (vec_extract_lane (multi_lane size _) src lane_imm lane_reg) + (let ((dst WritableReg (temp_writable_reg $I64)) + (_ Unit (emit (MInst.VecExtractLane size dst src lane_imm lane_reg)))) + dst)) + +;; Helper for emitting `MInst.VecInsertLaneImm` instructions. 
+(decl vec_insert_lane_imm (Type Reg i16 u8) Reg) +(rule (vec_insert_lane_imm ty @ (multi_lane size _) src imm lane_imm) + (let ((dst WritableReg (copy_writable_reg ty src)) + (_ Unit (emit (MInst.VecInsertLaneImm size dst imm lane_imm)))) + dst)) + +;; Helper for emitting `MInst.VecReplicateLane` instructions. +(decl vec_replicate_lane (Type Reg u8) Reg) +(rule (vec_replicate_lane ty @ (multi_lane size _) src lane_imm) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecReplicateLane size dst src lane_imm)))) dst)) ;; Helper for emitting `MInst.LoadExtNameFar` instructions. @@ -1858,6 +2465,9 @@ (rule (emit_mov $F64 dst src) (emit (MInst.FpuMove64 dst src))) +(rule (emit_mov (ty_vec128 ty) dst src) + (emit (MInst.VecMov dst src))) + ;; Allocate a temporary (writable) register, initialized as a copy of the input. (decl copy_writable_reg (Type Reg) WritableReg) (rule (copy_writable_reg ty src) @@ -1888,8 +2498,12 @@ (rule (emit_arg_store $I32 reg mem) (emit_side_effect (store32 reg mem))) (rule (emit_arg_store $I64 reg mem) (emit_side_effect (store64 reg mem))) (rule (emit_arg_store $R64 reg mem) (emit_side_effect (store64 reg mem))) -(rule (emit_arg_store $F32 reg mem) (emit_side_effect (fpu_store32 reg mem))) -(rule (emit_arg_store $F64 reg mem) (emit_side_effect (fpu_store64 reg mem))) +(rule (emit_arg_store $F32 reg mem) + (emit_side_effect (vec_store_lane $F32X4 reg mem 0))) +(rule (emit_arg_store $F64 reg mem) + (emit_side_effect (vec_store_lane $F64X2 reg mem 0))) +(rule (emit_arg_store (ty_vec128 ty) reg mem) + (emit_side_effect (vec_store reg mem))) (decl emit_arg_load (Type MemArg) Reg) (rule (emit_arg_load $I8 mem) (zext32_mem $I8 mem)) @@ -1897,8 +2511,9 @@ (rule (emit_arg_load $I32 mem) (load32 mem)) (rule (emit_arg_load $I64 mem) (load64 mem)) (rule (emit_arg_load $R64 mem) (load64 mem)) -(rule (emit_arg_load $F32 mem) (fpu_load64 mem)) -(rule (emit_arg_load $F64 mem) (fpu_load64 mem)) +(rule (emit_arg_load $F32 mem) (vec_load_lane_undef $F32X4 mem 0)) +(rule (emit_arg_load $F64 mem) (vec_load_lane_undef $F64X2 mem 0)) +(rule (emit_arg_load (ty_vec128 ty) mem) (vec_load ty mem)) ;; Copy a single argument/return value to its slots. (decl copy_to_arg (i64 ABIArg Value) Unit) @@ -2026,6 +2641,36 @@ (_ Unit (emit (MInst.Mov64SImm32 dst n)))) (writable_reg_to_reg dst))) +;; Allocate a temporary register, initialized with a vector immediate. +(decl vec_imm (Type u128) Reg) +(rule (vec_imm (ty_vec128 ty) 0) + (vec_imm_byte_mask ty 0)) +(rule (vec_imm (ty_vec128 ty) (u64_pair n n)) + (vec_imm_splat $I64X2 n)) +(rule (vec_imm (ty_vec128 ty) n) + (vec_load_const ty n)) + +;; Variant with replicated immediate. 
+(decl vec_imm_splat (Type u64) Reg) +(rule (vec_imm_splat (ty_vec128 ty) 0) + (vec_imm_byte_mask ty 0)) +(rule (vec_imm_splat ty @ (multi_lane 8 _) n) + (vec_imm_replicate ty (u64_as_i16 n))) +(rule (vec_imm_splat ty @ (multi_lane 16 _) n) + (vec_imm_replicate ty (u64_as_i16 n))) +(rule (vec_imm_splat ty @ (multi_lane 32 _) (u32_pair _ (i16_from_u32 n))) + (vec_imm_replicate ty n)) +(rule (vec_imm_splat ty @ (multi_lane 64 _) (i16_from_u64 n)) + (vec_imm_replicate ty n)) +(rule (vec_imm_splat (multi_lane 16 _) (u32_pair _ (u16_pair _ (u8_pair n n)))) + (vec_imm_splat $I8X16 (u8_as_u64 n))) +(rule (vec_imm_splat (multi_lane 32 _) (u32_pair _ (u16_pair n n))) + (vec_imm_splat $I16X8 (u16_as_u64 n))) +(rule (vec_imm_splat (multi_lane 64 _) (u32_pair n n)) + (vec_imm_splat $I32X4 (u32_as_u64 n))) +(rule (vec_imm_splat (ty_vec128 ty) n) + (vec_load_const_replicate ty n)) + ;; Place an immediate into the low half of a register pair. ;; The high half is taken from the input. (decl imm_regpair_lo (Type u64 RegPair) RegPair) @@ -2337,6 +2982,10 @@ (rule (emit_cmov_reg $F64 dst cond src) (ConsumesFlags.ConsumesFlagsReturnsReg (MInst.FpuCMov64 dst cond src) dst)) +(rule (emit_cmov_reg (ty_vec128 ty) dst cond src) + (ConsumesFlags.ConsumesFlagsReturnsReg (MInst.VecCMov dst cond src) + dst)) + ;; Conditionally select between two source registers. (decl cmov_reg (Type Cond Reg Reg) ConsumesFlags) @@ -2488,7 +3137,7 @@ (decl casloop_emit (VecMInstBuilder Type MemFlags Reg Reg) Reg) (rule (casloop_emit ib ty flags aligned_addr val) (let (;; Construct a memory argument for the aligned word. - (aligned_mem MemArg (memarg_reg_plus_off aligned_addr 0 flags)) + (aligned_mem MemArg (memarg_reg_plus_off aligned_addr 0 0 flags)) ;; Add the compare-and-swap instruction to the builder. 
(result Reg (push_atomic_cas ib (ty_ext32 ty) (casloop_val_reg) val aligned_mem)) @@ -2607,6 +3256,91 @@ (extern constructor abi_accumulate_outgoing_args_size abi_accumulate_outgoing_args_size) +;; Helpers for generating vector pack and unpack instructions ;;;;;;;;;;;;;;;;;; + +(decl vec_widen_type (Type) Type) +(rule (vec_widen_type $I8X16) $I16X8) +(rule (vec_widen_type $I16X8) $I32X4) +(rule (vec_widen_type $I32X4) $I64X2) + +(decl vecop_pack (Type) VecBinaryOp) +(rule (vecop_pack $I16X8) (VecBinaryOp.Pack16x8)) +(rule (vecop_pack $I32X4) (VecBinaryOp.Pack32x4)) +(rule (vecop_pack $I64X2) (VecBinaryOp.Pack64x2)) + +(decl vec_pack (Type Reg Reg) Reg) +(rule (vec_pack ty x y) (vec_rrr ty (vecop_pack ty) x y)) + +(decl vecop_pack_ssat (Type) VecBinaryOp) +(rule (vecop_pack_ssat $I16X8) (VecBinaryOp.PackSSat16x8)) +(rule (vecop_pack_ssat $I32X4) (VecBinaryOp.PackSSat32x4)) +(rule (vecop_pack_ssat $I64X2) (VecBinaryOp.PackSSat64x2)) + +(decl vec_pack_ssat (Type Reg Reg) Reg) +(rule (vec_pack_ssat ty x y) (vec_rrr ty (vecop_pack_ssat ty) x y)) + +(decl vecop_pack_usat (Type) VecBinaryOp) +(rule (vecop_pack_usat $I16X8) (VecBinaryOp.PackUSat16x8)) +(rule (vecop_pack_usat $I32X4) (VecBinaryOp.PackUSat32x4)) +(rule (vecop_pack_usat $I64X2) (VecBinaryOp.PackUSat64x2)) + +(decl vec_pack_usat (Type Reg Reg) Reg) +(rule (vec_pack_usat ty x y) (vec_rrr ty (vecop_pack_usat ty) x y)) + +(decl vecop_unpacks_low (Type) VecUnaryOp) +(rule (vecop_unpacks_low $I8X16) (VecUnaryOp.UnpackSLow8x16)) +(rule (vecop_unpacks_low $I16X8) (VecUnaryOp.UnpackSLow16x8)) +(rule (vecop_unpacks_low $I32X4) (VecUnaryOp.UnpackSLow32x4)) + +(decl vec_unpacks_low (Type Reg) Reg) +(rule (vec_unpacks_low ty x) (vec_rr ty (vecop_unpacks_low ty) x)) + +(decl vecop_unpacks_high (Type) VecUnaryOp) +(rule (vecop_unpacks_high $I8X16) (VecUnaryOp.UnpackSHigh8x16)) +(rule (vecop_unpacks_high $I16X8) (VecUnaryOp.UnpackSHigh16x8)) +(rule (vecop_unpacks_high $I32X4) (VecUnaryOp.UnpackSHigh32x4)) + +(decl vec_unpacks_high (Type Reg) Reg) +(rule (vec_unpacks_high ty x) (vec_rr ty (vecop_unpacks_high ty) x)) + +(decl vecop_unpacku_low (Type) VecUnaryOp) +(rule (vecop_unpacku_low $I8X16) (VecUnaryOp.UnpackULow8x16)) +(rule (vecop_unpacku_low $I16X8) (VecUnaryOp.UnpackULow16x8)) +(rule (vecop_unpacku_low $I32X4) (VecUnaryOp.UnpackULow32x4)) + +(decl vec_unpacku_low (Type Reg) Reg) +(rule (vec_unpacku_low ty x) (vec_rr ty (vecop_unpacku_low ty) x)) + +(decl vecop_unpacku_high (Type) VecUnaryOp) +(rule (vecop_unpacku_high $I8X16) (VecUnaryOp.UnpackUHigh8x16)) +(rule (vecop_unpacku_high $I16X8) (VecUnaryOp.UnpackUHigh16x8)) +(rule (vecop_unpacku_high $I32X4) (VecUnaryOp.UnpackUHigh32x4)) + +(decl vec_unpacku_high (Type Reg) Reg) +(rule (vec_unpacku_high ty x) (vec_rr ty (vecop_unpacku_high ty) x)) + + +;; Helpers for generating vector merge instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl vecop_merge_low (Type) VecBinaryOp) +(rule (vecop_merge_low $I8X16) (VecBinaryOp.MergeLow8x16)) +(rule (vecop_merge_low $I16X8) (VecBinaryOp.MergeLow16x8)) +(rule (vecop_merge_low $I32X4) (VecBinaryOp.MergeLow32x4)) +(rule (vecop_merge_low $I64X2) (VecBinaryOp.MergeLow64x2)) + +(decl vec_merge_low (Type Reg Reg) Reg) +(rule (vec_merge_low ty x y) (vec_rrr ty (vecop_merge_low ty) x y)) + +(decl vecop_merge_high (Type) VecBinaryOp) +(rule (vecop_merge_high $I8X16) (VecBinaryOp.MergeHigh8x16)) +(rule (vecop_merge_high $I16X8) (VecBinaryOp.MergeHigh16x8)) +(rule (vecop_merge_high $I32X4) (VecBinaryOp.MergeHigh32x4)) +(rule (vecop_merge_high $I64X2) 
(VecBinaryOp.MergeHigh64x2)) + +(decl vec_merge_high (Type Reg Reg) Reg) +(rule (vec_merge_high ty x y) (vec_rrr ty (vecop_merge_high ty) x y)) + + ;; Helpers for generating `clz` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Count leading zeroes. For a zero input, return the specified value. @@ -2711,6 +3445,15 @@ (decl add_mem_sext32 (Type Reg MemArg) Reg) (rule (add_mem_sext32 ty x y) (alu_rx ty (aluop_add_sext32 ty) x y)) +(decl vecop_add (Type) VecBinaryOp) +(rule (vecop_add $I8X16) (VecBinaryOp.Add8x16)) +(rule (vecop_add $I16X8) (VecBinaryOp.Add16x8)) +(rule (vecop_add $I32X4) (VecBinaryOp.Add32x4)) +(rule (vecop_add $I64X2) (VecBinaryOp.Add64x2)) + +(decl vec_add (Type Reg Reg) Reg) +(rule (vec_add ty x y) (vec_rrr ty (vecop_add ty) x y)) + ;; Helpers for generating `add_logical` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2768,6 +3511,15 @@ (decl sub_mem_sext32 (Type Reg MemArg) Reg) (rule (sub_mem_sext32 ty x y) (alu_rx ty (aluop_sub_sext32 ty) x y)) +(decl vecop_sub (Type) VecBinaryOp) +(rule (vecop_sub $I8X16) (VecBinaryOp.Sub8x16)) +(rule (vecop_sub $I16X8) (VecBinaryOp.Sub16x8)) +(rule (vecop_sub $I32X4) (VecBinaryOp.Sub32x4)) +(rule (vecop_sub $I64X2) (VecBinaryOp.Sub64x2)) + +(decl vec_sub (Type Reg Reg) Reg) +(rule (vec_sub ty x y) (vec_rrr ty (vecop_sub ty) x y)) + ;; Helpers for generating `sub_logical` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2831,6 +3583,69 @@ (decl mul_mem_sext32 (Type Reg MemArg) Reg) (rule (mul_mem_sext32 ty x y) (alu_rx ty (aluop_mul_sext32 ty) x y)) +(decl vecop_mul (Type) VecBinaryOp) +(rule (vecop_mul $I8X16) (VecBinaryOp.Mul8x16)) +(rule (vecop_mul $I16X8) (VecBinaryOp.Mul16x8)) +(rule (vecop_mul $I32X4) (VecBinaryOp.Mul32x4)) +;; No support for $I64X2 multiplication. + +(decl vec_mul (Type Reg Reg) Reg) +(rule (vec_mul ty x y) (vec_rrr ty (vecop_mul ty) x y)) + +(decl vecop_umulhi (Type) VecBinaryOp) +(rule (vecop_umulhi $I8X16) (VecBinaryOp.UMulHi8x16)) +(rule (vecop_umulhi $I16X8) (VecBinaryOp.UMulHi16x8)) +(rule (vecop_umulhi $I32X4) (VecBinaryOp.UMulHi32x4)) +;; No support for $I64X2 multiplication. + +(decl vec_umulhi (Type Reg Reg) Reg) +(rule (vec_umulhi ty x y) (vec_rrr ty (vecop_umulhi ty) x y)) + +(decl vecop_smulhi (Type) VecBinaryOp) +(rule (vecop_smulhi $I8X16) (VecBinaryOp.SMulHi8x16)) +(rule (vecop_smulhi $I16X8) (VecBinaryOp.SMulHi16x8)) +(rule (vecop_smulhi $I32X4) (VecBinaryOp.SMulHi32x4)) +;; No support for $I64X2 multiplication. + +(decl vec_smulhi (Type Reg Reg) Reg) +(rule (vec_smulhi ty x y) (vec_rrr ty (vecop_smulhi ty) x y)) + +(decl vecop_umul_even (Type) VecBinaryOp) +(rule (vecop_umul_even $I8X16) (VecBinaryOp.UMulEven8x16)) +(rule (vecop_umul_even $I16X8) (VecBinaryOp.UMulEven16x8)) +(rule (vecop_umul_even $I32X4) (VecBinaryOp.UMulEven32x4)) +;; No support for $I64X2 multiplication. + +(decl vec_umul_even (Type Reg Reg) Reg) +(rule (vec_umul_even ty x y) (vec_rrr ty (vecop_umul_even ty) x y)) + +(decl vecop_smul_even (Type) VecBinaryOp) +(rule (vecop_smul_even $I8X16) (VecBinaryOp.SMulEven8x16)) +(rule (vecop_smul_even $I16X8) (VecBinaryOp.SMulEven16x8)) +(rule (vecop_smul_even $I32X4) (VecBinaryOp.SMulEven32x4)) +;; No support for $I64X2 multiplication. 
+ +(decl vec_smul_even (Type Reg Reg) Reg) +(rule (vec_smul_even ty x y) (vec_rrr ty (vecop_smul_even ty) x y)) + +(decl vecop_umul_odd (Type) VecBinaryOp) +(rule (vecop_umul_odd $I8X16) (VecBinaryOp.UMulOdd8x16)) +(rule (vecop_umul_odd $I16X8) (VecBinaryOp.UMulOdd16x8)) +(rule (vecop_umul_odd $I32X4) (VecBinaryOp.UMulOdd32x4)) +;; No support for $I64X2 multiplication. + +(decl vec_umul_odd (Type Reg Reg) Reg) +(rule (vec_umul_odd ty x y) (vec_rrr ty (vecop_umul_odd ty) x y)) + +(decl vecop_smul_odd (Type) VecBinaryOp) +(rule (vecop_smul_odd $I8X16) (VecBinaryOp.SMulOdd8x16)) +(rule (vecop_smul_odd $I16X8) (VecBinaryOp.SMulOdd16x8)) +(rule (vecop_smul_odd $I32X4) (VecBinaryOp.SMulOdd32x4)) +;; No support for $I64X2 multiplication. + +(decl vec_smul_odd (Type Reg Reg) Reg) +(rule (vec_smul_odd ty x y) (vec_rrr ty (vecop_smul_odd ty) x y)) + ;; Helpers for generating `udivmod` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2846,6 +3661,66 @@ (rule (sdivmod $I64 x y) (sdivmod64 x y)) +;; Helpers for generating `umax` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl vecop_umax (Type) VecBinaryOp) +(rule (vecop_umax $I8X16) (VecBinaryOp.UMax8x16)) +(rule (vecop_umax $I16X8) (VecBinaryOp.UMax16x8)) +(rule (vecop_umax $I32X4) (VecBinaryOp.UMax32x4)) +(rule (vecop_umax $I64X2) (VecBinaryOp.UMax64x2)) + +(decl vec_umax (Type Reg Reg) Reg) +(rule (vec_umax ty x y) (vec_rrr ty (vecop_umax ty) x y)) + + +;; Helpers for generating `imax` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl vecop_smax (Type) VecBinaryOp) +(rule (vecop_smax $I8X16) (VecBinaryOp.SMax8x16)) +(rule (vecop_smax $I16X8) (VecBinaryOp.SMax16x8)) +(rule (vecop_smax $I32X4) (VecBinaryOp.SMax32x4)) +(rule (vecop_smax $I64X2) (VecBinaryOp.SMax64x2)) + +(decl vec_smax (Type Reg Reg) Reg) +(rule (vec_smax ty x y) (vec_rrr ty (vecop_smax ty) x y)) + + +;; Helpers for generating `umin` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl vecop_umin (Type) VecBinaryOp) +(rule (vecop_umin $I8X16) (VecBinaryOp.UMin8x16)) +(rule (vecop_umin $I16X8) (VecBinaryOp.UMin16x8)) +(rule (vecop_umin $I32X4) (VecBinaryOp.UMin32x4)) +(rule (vecop_umin $I64X2) (VecBinaryOp.UMin64x2)) + +(decl vec_umin (Type Reg Reg) Reg) +(rule (vec_umin ty x y) (vec_rrr ty (vecop_umin ty) x y)) + + +;; Helpers for generating `imin` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl vecop_smin (Type) VecBinaryOp) +(rule (vecop_smin $I8X16) (VecBinaryOp.SMin8x16)) +(rule (vecop_smin $I16X8) (VecBinaryOp.SMin16x8)) +(rule (vecop_smin $I32X4) (VecBinaryOp.SMin32x4)) +(rule (vecop_smin $I64X2) (VecBinaryOp.SMin64x2)) + +(decl vec_smin (Type Reg Reg) Reg) +(rule (vec_smin ty x y) (vec_rrr ty (vecop_smin ty) x y)) + + +;; Helpers for generating `avg_round` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl vecop_uavg (Type) VecBinaryOp) +(rule (vecop_uavg $I8X16) (VecBinaryOp.UAvg8x16)) +(rule (vecop_uavg $I16X8) (VecBinaryOp.UAvg16x8)) +(rule (vecop_uavg $I32X4) (VecBinaryOp.UAvg32x4)) +(rule (vecop_uavg $I64X2) (VecBinaryOp.UAvg64x2)) + +(decl vec_uavg (Type Reg Reg) Reg) +(rule (vec_uavg ty x y) (vec_rrr ty (vecop_uavg ty) x y)) + + ;; Helpers for generating `and` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (decl aluop_and (Type) ALUOp) @@ -2864,6 +3739,9 @@ (decl and_mem (Type Reg MemArg) Reg) (rule (and_mem ty x y) (alu_rx ty (aluop_and ty) x y)) +(decl vec_and (Type Reg Reg) Reg) +(rule (vec_and (ty_vec128 ty) x y) (vec_rrr ty (VecBinaryOp.And128) x y)) + ;; Helpers for generating `or` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ 
-2883,6 +3761,9 @@ (decl or_mem (Type Reg MemArg) Reg) (rule (or_mem ty x y) (alu_rx ty (aluop_or ty) x y)) +(decl vec_or (Type Reg Reg) Reg) +(rule (vec_or (ty_vec128 ty) x y) (vec_rrr ty (VecBinaryOp.Orr128) x y)) + ;; Helpers for generating `xor` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2903,6 +3784,10 @@ (rule (push_xor_uimm32shifted ib ty dst src imm) (push_alu_uimm32shifted ib (aluop_xor ty) dst src imm)) +(decl vec_xor (Type Reg Reg) Reg) +(rule (vec_xor (ty_vec128 ty) x y) (vec_rrr ty (VecBinaryOp.Xor128) x y)) + + ;; Helpers for generating `not` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (decl not_reg (Type Reg) Reg) @@ -2920,6 +3805,9 @@ (let ((val Reg (push_xor_uimm32shifted ib ty dst src (uimm32shifted 0xffffffff 0)))) (push_xor_uimm32shifted ib ty dst val (uimm32shifted 0xffffffff 32)))) +(decl vec_not (Type Reg) Reg) +(rule (vec_not ty x) (vec_not_or ty x x)) + ;; Helpers for generating `not_and` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2930,6 +3818,9 @@ (decl not_and_reg (Type Reg Reg) Reg) (rule (not_and_reg ty x y) (alu_rrr ty (aluop_not_and ty) x y)) +(decl vec_not_and (Type Reg Reg) Reg) +(rule (vec_not_and (ty_vec128 ty) x y) (vec_rrr ty (VecBinaryOp.NotAnd128) x y)) + ;; Helpers for generating `not_or` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2940,6 +3831,9 @@ (decl not_or_reg (Type Reg Reg) Reg) (rule (not_or_reg ty x y) (alu_rrr ty (aluop_not_or ty) x y)) +(decl vec_not_or (Type Reg Reg) Reg) +(rule (vec_not_or (ty_vec128 ty) x y) (vec_rrr ty (VecBinaryOp.NotOrr128) x y)) + ;; Helpers for generating `not_xor` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2950,6 +3844,9 @@ (decl not_xor_reg (Type Reg Reg) Reg) (rule (not_xor_reg ty x y) (alu_rrr ty (aluop_not_xor ty) x y)) +(decl vec_not_xor (Type Reg Reg) Reg) +(rule (vec_not_xor (ty_vec128 ty) x y) (vec_rrr ty (VecBinaryOp.NotXor128) x y)) + ;; Helpers for generating `and_not` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2960,6 +3857,9 @@ (decl and_not_reg (Type Reg Reg) Reg) (rule (and_not_reg ty x y) (alu_rrr ty (aluop_and_not ty) x y)) +(decl vec_and_not (Type Reg Reg) Reg) +(rule (vec_and_not (ty_vec128 ty) x y) (vec_rrr ty (VecBinaryOp.AndNot128) x y)) + ;; Helpers for generating `or_not` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2970,6 +3870,15 @@ (decl or_not_reg (Type Reg Reg) Reg) (rule (or_not_reg ty x y) (alu_rrr ty (aluop_or_not ty) x y)) +(decl vec_or_not (Type Reg Reg) Reg) +(rule (vec_or_not (ty_vec128 ty) x y) (vec_rrr ty (VecBinaryOp.OrrNot128) x y)) + + +;; Helpers for generating `bitpermute` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl vec_bitpermute (Reg Reg) Reg) +(rule (vec_bitpermute x y) (vec_rrr $I64X2 (VecBinaryOp.BitPermute128) x y)) + ;; Helpers for generating `abs` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2986,6 +3895,15 @@ (decl abs_reg_sext32 (Type Reg) Reg) (rule (abs_reg_sext32 ty x) (unary_rr ty (unaryop_abs_sext32 ty) x)) +(decl vecop_abs (Type) VecUnaryOp) +(rule (vecop_abs $I8X16) (VecUnaryOp.Abs8x16)) +(rule (vecop_abs $I16X8) (VecUnaryOp.Abs16x8)) +(rule (vecop_abs $I32X4) (VecUnaryOp.Abs32x4)) +(rule (vecop_abs $I64X2) (VecUnaryOp.Abs64x2)) + +(decl vec_abs (Type Reg) Reg) +(rule (vec_abs ty x) (vec_rr ty (vecop_abs ty) x)) + ;; Helpers for generating `neg` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3004,6 +3922,15 @@ (decl neg_reg_sext32 (Type Reg) Reg) (rule (neg_reg_sext32 ty x) (unary_rr ty (unaryop_neg_sext32 ty) x)) +(decl vecop_neg (Type) VecUnaryOp) +(rule (vecop_neg $I8X16) 
(VecUnaryOp.Neg8x16)) +(rule (vecop_neg $I16X8) (VecUnaryOp.Neg16x8)) +(rule (vecop_neg $I32X4) (VecUnaryOp.Neg32x4)) +(rule (vecop_neg $I64X2) (VecUnaryOp.Neg64x2)) + +(decl vec_neg (Type Reg) Reg) +(rule (vec_neg ty x) (vec_rr ty (vecop_neg ty) x)) + ;; Helpers for generating `bswap` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3040,6 +3967,20 @@ (rule (push_rot_imm_reg ib ty dst src shift_imm shift_reg) (push_shift ib (shiftop_rot ty) dst src shift_imm shift_reg)) +(decl vec_shiftop_rot (Type) VecShiftOp) +(rule (vec_shiftop_rot $I8X16) (VecShiftOp.RotL8x16)) +(rule (vec_shiftop_rot $I16X8) (VecShiftOp.RotL16x8)) +(rule (vec_shiftop_rot $I32X4) (VecShiftOp.RotL32x4)) +(rule (vec_shiftop_rot $I64X2) (VecShiftOp.RotL64x2)) + +(decl vec_rot_reg (Type Reg Reg) Reg) +(rule (vec_rot_reg ty x shift_reg) + (vec_shift_rr ty (vec_shiftop_rot ty) x 0 shift_reg)) + +(decl vec_rot_imm (Type Reg u8) Reg) +(rule (vec_rot_imm ty x shift_imm) + (vec_shift_rr ty (vec_shiftop_rot ty) x shift_imm (zero_reg))) + ;; Helpers for generating `lshl` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3057,6 +3998,23 @@ (rule (lshl_imm ty x shift_imm) (shift_rr ty (shiftop_lshl ty) x shift_imm (zero_reg))) +(decl vec_shiftop_lshl (Type) VecShiftOp) +(rule (vec_shiftop_lshl $I8X16) (VecShiftOp.LShL8x16)) +(rule (vec_shiftop_lshl $I16X8) (VecShiftOp.LShL16x8)) +(rule (vec_shiftop_lshl $I32X4) (VecShiftOp.LShL32x4)) +(rule (vec_shiftop_lshl $I64X2) (VecShiftOp.LShL64x2)) + +(decl vec_lshl_reg (Type Reg Reg) Reg) +(rule (vec_lshl_reg ty x shift_reg) + (vec_shift_rr ty (vec_shiftop_lshl ty) x 0 shift_reg)) + +(decl vec_lshl_imm (Type Reg u8) Reg) +(rule (vec_lshl_imm ty x shift_imm) + (vec_shift_rr ty (vec_shiftop_lshl ty) x shift_imm (zero_reg))) + +(decl vec_lshl_by_byte (Reg Reg) Reg) +(rule (vec_lshl_by_byte x y) (vec_rrr $I8X16 (VecBinaryOp.LShLByByte128) x y)) + ;; Helpers for generating `lshr` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3072,6 +4030,23 @@ (rule (lshr_imm ty x shift_imm) (shift_rr ty (shiftop_lshr ty) x shift_imm (zero_reg))) +(decl vec_shiftop_lshr (Type) VecShiftOp) +(rule (vec_shiftop_lshr $I8X16) (VecShiftOp.LShR8x16)) +(rule (vec_shiftop_lshr $I16X8) (VecShiftOp.LShR16x8)) +(rule (vec_shiftop_lshr $I32X4) (VecShiftOp.LShR32x4)) +(rule (vec_shiftop_lshr $I64X2) (VecShiftOp.LShR64x2)) + +(decl vec_lshr_reg (Type Reg Reg) Reg) +(rule (vec_lshr_reg ty x shift_reg) + (vec_shift_rr ty (vec_shiftop_lshr ty) x 0 shift_reg)) + +(decl vec_lshr_imm (Type Reg u8) Reg) +(rule (vec_lshr_imm ty x shift_imm) + (vec_shift_rr ty (vec_shiftop_lshr ty) x shift_imm (zero_reg))) + +(decl vec_lshr_by_byte (Reg Reg) Reg) +(rule (vec_lshr_by_byte x y) (vec_rrr $I8X16 (VecBinaryOp.LShRByByte128) x y)) + ;; Helpers for generating `ashr` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3087,6 +4062,23 @@ (rule (ashr_imm ty x shift_imm) (shift_rr ty (shiftop_ashr ty) x shift_imm (zero_reg))) +(decl vec_shiftop_ashr (Type) VecShiftOp) +(rule (vec_shiftop_ashr $I8X16) (VecShiftOp.AShR8x16)) +(rule (vec_shiftop_ashr $I16X8) (VecShiftOp.AShR16x8)) +(rule (vec_shiftop_ashr $I32X4) (VecShiftOp.AShR32x4)) +(rule (vec_shiftop_ashr $I64X2) (VecShiftOp.AShR64x2)) + +(decl vec_ashr_reg (Type Reg Reg) Reg) +(rule (vec_ashr_reg ty x shift_reg) + (vec_shift_rr ty (vec_shiftop_ashr ty) x 0 shift_reg)) + +(decl vec_ashr_imm (Type Reg u8) Reg) +(rule (vec_ashr_imm ty x shift_imm) + (vec_shift_rr ty (vec_shiftop_ashr ty) x shift_imm (zero_reg))) + +(decl vec_ashr_by_byte (Reg Reg) Reg) +(rule (vec_ashr_by_byte x y) 
(vec_rrr $I8X16 (VecBinaryOp.AShRByByte128) x y)) + ;; Helpers for generating `popcnt` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3096,6 +4088,15 @@ (decl popcnt_reg (Reg) Reg) (rule (popcnt_reg x) (unary_rr $I64 (UnaryOp.PopcntReg) x)) +(decl vecop_popcnt (Type) VecUnaryOp) +(rule (vecop_popcnt $I8X16) (VecUnaryOp.Popcnt8x16)) +(rule (vecop_popcnt $I16X8) (VecUnaryOp.Popcnt16x8)) +(rule (vecop_popcnt $I32X4) (VecUnaryOp.Popcnt32x4)) +(rule (vecop_popcnt $I64X2) (VecUnaryOp.Popcnt64x2)) + +(decl vec_popcnt (Type Reg) Reg) +(rule (vec_popcnt ty x) (vec_rr ty (vecop_popcnt ty) x)) + ;; Helpers for generating `atomic_rmw` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3132,6 +4133,8 @@ (decl fpuop2_add (Type) FPUOp2) (rule (fpuop2_add $F32) (FPUOp2.Add32)) (rule (fpuop2_add $F64) (FPUOp2.Add64)) +(rule (fpuop2_add $F32X4) (FPUOp2.Add32x4)) +(rule (fpuop2_add $F64X2) (FPUOp2.Add64x2)) (decl fadd_reg (Type Reg Reg) Reg) (rule (fadd_reg ty x y) (fpu_rrr ty (fpuop2_add ty) x y)) @@ -3142,6 +4145,8 @@ (decl fpuop2_sub (Type) FPUOp2) (rule (fpuop2_sub $F32) (FPUOp2.Sub32)) (rule (fpuop2_sub $F64) (FPUOp2.Sub64)) +(rule (fpuop2_sub $F32X4) (FPUOp2.Sub32x4)) +(rule (fpuop2_sub $F64X2) (FPUOp2.Sub64x2)) (decl fsub_reg (Type Reg Reg) Reg) (rule (fsub_reg ty x y) (fpu_rrr ty (fpuop2_sub ty) x y)) @@ -3152,6 +4157,8 @@ (decl fpuop2_mul (Type) FPUOp2) (rule (fpuop2_mul $F32) (FPUOp2.Mul32)) (rule (fpuop2_mul $F64) (FPUOp2.Mul64)) +(rule (fpuop2_mul $F32X4) (FPUOp2.Mul32x4)) +(rule (fpuop2_mul $F64X2) (FPUOp2.Mul64x2)) (decl fmul_reg (Type Reg Reg) Reg) (rule (fmul_reg ty x y) (fpu_rrr ty (fpuop2_mul ty) x y)) @@ -3162,6 +4169,8 @@ (decl fpuop2_div (Type) FPUOp2) (rule (fpuop2_div $F32) (FPUOp2.Div32)) (rule (fpuop2_div $F64) (FPUOp2.Div64)) +(rule (fpuop2_div $F32X4) (FPUOp2.Div32x4)) +(rule (fpuop2_div $F64X2) (FPUOp2.Div64x2)) (decl fdiv_reg (Type Reg Reg) Reg) (rule (fdiv_reg ty x y) (fpu_rrr ty (fpuop2_div ty) x y)) @@ -3172,6 +4181,8 @@ (decl fpuop2_min (Type) FPUOp2) (rule (fpuop2_min $F32) (FPUOp2.Min32)) (rule (fpuop2_min $F64) (FPUOp2.Min64)) +(rule (fpuop2_min $F32X4) (FPUOp2.Min32x4)) +(rule (fpuop2_min $F64X2) (FPUOp2.Min64x2)) (decl fmin_reg (Type Reg Reg) Reg) (rule (fmin_reg ty x y) (fpu_rrr ty (fpuop2_min ty) x y)) @@ -3182,16 +4193,44 @@ (decl fpuop2_max (Type) FPUOp2) (rule (fpuop2_max $F32) (FPUOp2.Max32)) (rule (fpuop2_max $F64) (FPUOp2.Max64)) +(rule (fpuop2_max $F32X4) (FPUOp2.Max32x4)) +(rule (fpuop2_max $F64X2) (FPUOp2.Max64x2)) (decl fmax_reg (Type Reg Reg) Reg) (rule (fmax_reg ty x y) (fpu_rrr ty (fpuop2_max ty) x y)) +;; Helpers for generating `fmin_pseudo` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl fpuop2_min_pseudo (Type) FPUOp2) +(rule (fpuop2_min_pseudo $F32) (FPUOp2.MinPseudo32)) +(rule (fpuop2_min_pseudo $F64) (FPUOp2.MinPseudo64)) +(rule (fpuop2_min_pseudo $F32X4) (FPUOp2.MinPseudo32x4)) +(rule (fpuop2_min_pseudo $F64X2) (FPUOp2.MinPseudo64x2)) + +(decl fmin_pseudo_reg (Type Reg Reg) Reg) +(rule (fmin_pseudo_reg ty x y) (fpu_rrr ty (fpuop2_min_pseudo ty) x y)) + + +;; Helpers for generating `fmax_pseudo` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl fpuop2_max_pseudo (Type) FPUOp2) +(rule (fpuop2_max_pseudo $F32) (FPUOp2.MaxPseudo32)) +(rule (fpuop2_max_pseudo $F64) (FPUOp2.MaxPseudo64)) +(rule (fpuop2_max_pseudo $F32X4) (FPUOp2.MaxPseudo32x4)) +(rule (fpuop2_max_pseudo $F64X2) (FPUOp2.MaxPseudo64x2)) + +(decl fmax_pseudo_reg (Type Reg Reg) Reg) +(rule (fmax_pseudo_reg ty x y) (fpu_rrr ty (fpuop2_max_pseudo ty) x y)) + + ;; Helpers for generating `fma` 
instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (decl fpuop3_fma (Type) FPUOp3) (rule (fpuop3_fma $F32) (FPUOp3.MAdd32)) (rule (fpuop3_fma $F64) (FPUOp3.MAdd64)) +(rule (fpuop3_fma $F32X4) (FPUOp3.MAdd32x4)) +(rule (fpuop3_fma $F64X2) (FPUOp3.MAdd64x2)) (decl fma_reg (Type Reg Reg Reg) Reg) (rule (fma_reg ty x y acc) (fpu_rrrr ty (fpuop3_fma ty) x y acc)) @@ -3202,6 +4241,8 @@ (decl fpuop1_sqrt (Type) FPUOp1) (rule (fpuop1_sqrt $F32) (FPUOp1.Sqrt32)) (rule (fpuop1_sqrt $F64) (FPUOp1.Sqrt64)) +(rule (fpuop1_sqrt $F32X4) (FPUOp1.Sqrt32x4)) +(rule (fpuop1_sqrt $F64X2) (FPUOp1.Sqrt64x2)) (decl sqrt_reg (Type Reg) Reg) (rule (sqrt_reg ty x) (fpu_rr ty (fpuop1_sqrt ty) x)) @@ -3212,6 +4253,8 @@ (decl fpuop1_neg (Type) FPUOp1) (rule (fpuop1_neg $F32) (FPUOp1.Neg32)) (rule (fpuop1_neg $F64) (FPUOp1.Neg64)) +(rule (fpuop1_neg $F32X4) (FPUOp1.Neg32x4)) +(rule (fpuop1_neg $F64X2) (FPUOp1.Neg64x2)) (decl fneg_reg (Type Reg) Reg) (rule (fneg_reg ty x) (fpu_rr ty (fpuop1_neg ty) x)) @@ -3222,6 +4265,8 @@ (decl fpuop1_abs (Type) FPUOp1) (rule (fpuop1_abs $F32) (FPUOp1.Abs32)) (rule (fpuop1_abs $F64) (FPUOp1.Abs64)) +(rule (fpuop1_abs $F32X4) (FPUOp1.Abs32x4)) +(rule (fpuop1_abs $F64X2) (FPUOp1.Abs64x2)) (decl fabs_reg (Type Reg) Reg) (rule (fabs_reg ty x) (fpu_rr ty (fpuop1_abs ty) x)) @@ -3232,6 +4277,8 @@ (decl fpuroundop_round (Type) FpuRoundOp) (rule (fpuroundop_round $F32) (FpuRoundOp.Round32)) (rule (fpuroundop_round $F64) (FpuRoundOp.Round64)) +(rule (fpuroundop_round $F32X4) (FpuRoundOp.Round32x4)) +(rule (fpuroundop_round $F64X2) (FpuRoundOp.Round64x2)) (decl ceil_reg (Type Reg) Reg) (rule (ceil_reg ty x) (fpu_round ty (fpuroundop_round ty) @@ -3256,6 +4303,8 @@ (rule (fpromote_reg ty ty x) x) (rule (fpromote_reg $F64 $F32 x) (fpu_rr $F64 (FPUOp1.Cvt32To64) x)) +(rule (fpromote_reg $F64X2 $F32X4 x) + (fpu_rr $F64 (FPUOp1.Cvt32x4To64x2) x)) ;; Helpers for generating `fdemote` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3264,28 +4313,34 @@ (rule (fdemote_reg ty ty mode x) x) (rule (fdemote_reg $F32 $F64 mode x) (fpu_round $F32 (FpuRoundOp.Cvt64To32) mode x)) +(rule (fdemote_reg $F32X4 $F64X2 mode x) + (fpu_round $F32X4 (FpuRoundOp.Cvt64x2To32x4) mode x)) ;; Helpers for generating `fcvt_from_uint` instructions ;;;;;;;;;;;;;;;;;;;;;;;; -(decl uint_to_fpu_op (Type) FpuRoundOp) -(rule (uint_to_fpu_op $F32) (FpuRoundOp.FromUInt32)) -(rule (uint_to_fpu_op $F64) (FpuRoundOp.FromUInt64)) - (decl fcvt_from_uint_reg (Type FpuRoundMode Reg) Reg) -(rule (fcvt_from_uint_reg ty mode x) - (fpu_round ty (uint_to_fpu_op ty) mode x)) +(rule (fcvt_from_uint_reg $F32 mode x) + (fpu_round $F32 (FpuRoundOp.FromUInt32) mode (vec_insert_lane_undef $I32X4 x 0 (zero_reg)))) +(rule (fcvt_from_uint_reg $F64 mode x) + (fpu_round $F64 (FpuRoundOp.FromUInt64) mode (vec_insert_lane_undef $I64X2 x 0 (zero_reg)))) +(rule (fcvt_from_uint_reg $F32X4 mode x) + (fpu_round $F32X4 (FpuRoundOp.FromUInt32x4) mode x)) +(rule (fcvt_from_uint_reg $F64X2 mode x) + (fpu_round $F64X2 (FpuRoundOp.FromUInt64x2) mode x)) ;; Helpers for generating `fcvt_from_sint` instructions ;;;;;;;;;;;;;;;;;;;;;;;; -(decl sint_to_fpu_op (Type) FpuRoundOp) -(rule (sint_to_fpu_op $F32) (FpuRoundOp.FromSInt32)) -(rule (sint_to_fpu_op $F64) (FpuRoundOp.FromSInt64)) - (decl fcvt_from_sint_reg (Type FpuRoundMode Reg) Reg) -(rule (fcvt_from_sint_reg ty mode x) - (fpu_round ty (sint_to_fpu_op ty) mode x)) +(rule (fcvt_from_sint_reg $F32 mode x) + (fpu_round $F32 (FpuRoundOp.FromSInt32) mode (vec_insert_lane_undef $I32X4 x 0 (zero_reg)))) +(rule (fcvt_from_sint_reg 
$F64 mode x) + (fpu_round $F64 (FpuRoundOp.FromSInt64) mode (vec_insert_lane_undef $I64X2 x 0 (zero_reg)))) +(rule (fcvt_from_sint_reg $F32X4 mode x) + (fpu_round $F32X4 (FpuRoundOp.FromSInt32x4) mode x)) +(rule (fcvt_from_sint_reg $F64X2 mode x) + (fpu_round $F64X2 (FpuRoundOp.FromSInt64x2) mode x)) ;; Helpers for generating `fcvt_to_[us]int` instructions ;;;;;;;;;;;;;;;;;;;;;;; @@ -3305,9 +4360,13 @@ (decl fcvt_to_uint_reg (Type FpuRoundMode Reg) Reg) (rule (fcvt_to_uint_reg $F32 mode x) - (mov_from_fpr32 (fpu_round $F32 (FpuRoundOp.ToUInt32) mode x))) + (vec_extract_lane $I32X4 (fpu_round $F32 (FpuRoundOp.ToUInt32) mode x) 0 (zero_reg))) (rule (fcvt_to_uint_reg $F64 mode x) - (mov_from_fpr64 (fpu_round $F64 (FpuRoundOp.ToUInt64) mode x))) + (vec_extract_lane $I64X2 (fpu_round $F64 (FpuRoundOp.ToUInt64) mode x) 0 (zero_reg))) +(rule (fcvt_to_uint_reg $F32X4 mode x) + (fpu_round $F32X4 (FpuRoundOp.ToUInt32x4) mode x)) +(rule (fcvt_to_uint_reg $F64X2 mode x) + (fpu_round $F64X2 (FpuRoundOp.ToUInt64x2) mode x)) (decl fcvt_to_uint_ub (Type Type) Reg) (rule (fcvt_to_uint_ub $F32 dst_ty) @@ -3333,9 +4392,13 @@ (decl fcvt_to_sint_reg (Type FpuRoundMode Reg) Reg) (rule (fcvt_to_sint_reg $F32 mode x) - (mov_from_fpr32 (fpu_round $F32 (FpuRoundOp.ToSInt32) mode x))) + (vec_extract_lane $F32X4 (fpu_round $F32 (FpuRoundOp.ToSInt32) mode x) 0 (zero_reg))) (rule (fcvt_to_sint_reg $F64 mode x) - (mov_from_fpr64 (fpu_round $F64 (FpuRoundOp.ToSInt64) mode x))) + (vec_extract_lane $F64X2 (fpu_round $F64 (FpuRoundOp.ToSInt64) mode x) 0 (zero_reg))) +(rule (fcvt_to_sint_reg $F32X4 mode x) + (fpu_round $F32X4 (FpuRoundOp.ToSInt32x4) mode x)) +(rule (fcvt_to_sint_reg $F64X2 mode x) + (fpu_round $F64X2 (FpuRoundOp.ToSInt64x2) mode x)) (decl fcvt_to_sint_ub (Type Type) Reg) (rule (fcvt_to_sint_ub $F32 dst_ty) @@ -3426,12 +4489,79 @@ (rule (icmpu_mem_zext32 ty src mem) (cmp_rx (cmpop_cmpu_zext32 ty) src mem)) +;; Helpers for generating vector `icmp` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl vecop_int_cmpeq (Type) VecIntCmpOp) +(rule (vecop_int_cmpeq (multi_lane 8 16)) (VecIntCmpOp.CmpEq8x16)) +(rule (vecop_int_cmpeq (multi_lane 16 8)) (VecIntCmpOp.CmpEq16x8)) +(rule (vecop_int_cmpeq (multi_lane 32 4)) (VecIntCmpOp.CmpEq32x4)) +(rule (vecop_int_cmpeq (multi_lane 64 2)) (VecIntCmpOp.CmpEq64x2)) + +(decl vec_cmpeq (Type Reg Reg) Reg) +(rule (vec_cmpeq (ty_vec128 ty) x y) (vec_int_cmp ty (vecop_int_cmpeq ty) x y)) +(decl vec_cmpeqs (Type Reg Reg) ProducesFlags) +(rule (vec_cmpeqs (ty_vec128 ty) x y) (vec_int_cmps ty (vecop_int_cmpeq ty) x y)) + +(decl vecop_int_cmph (Type) VecIntCmpOp) +(rule (vecop_int_cmph (multi_lane 8 16)) (VecIntCmpOp.SCmpHi8x16)) +(rule (vecop_int_cmph (multi_lane 16 8)) (VecIntCmpOp.SCmpHi16x8)) +(rule (vecop_int_cmph (multi_lane 32 4)) (VecIntCmpOp.SCmpHi32x4)) +(rule (vecop_int_cmph (multi_lane 64 2)) (VecIntCmpOp.SCmpHi64x2)) + +(decl vec_cmph (Type Reg Reg) Reg) +(rule (vec_cmph (ty_vec128 ty) x y) (vec_int_cmp ty (vecop_int_cmph ty) x y)) +(decl vec_cmphs (Type Reg Reg) ProducesFlags) +(rule (vec_cmphs (ty_vec128 ty) x y) (vec_int_cmps ty (vecop_int_cmph ty) x y)) + +(decl vecop_int_cmphl (Type) VecIntCmpOp) +(rule (vecop_int_cmphl (multi_lane 8 16)) (VecIntCmpOp.UCmpHi8x16)) +(rule (vecop_int_cmphl (multi_lane 16 8)) (VecIntCmpOp.UCmpHi16x8)) +(rule (vecop_int_cmphl (multi_lane 32 4)) (VecIntCmpOp.UCmpHi32x4)) +(rule (vecop_int_cmphl (multi_lane 64 2)) (VecIntCmpOp.UCmpHi64x2)) + +(decl vec_cmphl (Type Reg Reg) Reg) +(rule (vec_cmphl (ty_vec128 ty) x y) (vec_int_cmp ty 
(vecop_int_cmphl ty) x y)) +(decl vec_cmphls (Type Reg Reg) ProducesFlags) +(rule (vec_cmphls (ty_vec128 ty) x y) (vec_int_cmps ty (vecop_int_cmphl ty) x y)) + + ;; Helpers for generating `fcmp` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (decl fcmp_reg (Type Reg Reg) ProducesFlags) (rule (fcmp_reg $F32 src1 src2) (fpu_cmp32 src1 src2)) (rule (fcmp_reg $F64 src1 src2) (fpu_cmp64 src1 src2)) + +;; Helpers for generating vector `fcmp` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl vecop_float_cmpeq (Type) VecFloatCmpOp) +(rule (vecop_float_cmpeq (multi_lane 32 4)) (VecFloatCmpOp.CmpEq32x4)) +(rule (vecop_float_cmpeq (multi_lane 64 2)) (VecFloatCmpOp.CmpEq64x2)) + +(decl vec_fcmpeq (Type Reg Reg) Reg) +(rule (vec_fcmpeq (ty_vec128 ty) x y) (vec_float_cmp ty (vecop_float_cmpeq ty) x y)) +(decl vec_fcmpeqs (Type Reg Reg) ProducesFlags) +(rule (vec_fcmpeqs (ty_vec128 ty) x y) (vec_float_cmps ty (vecop_float_cmpeq ty) x y)) + +(decl vecop_float_cmph (Type) VecFloatCmpOp) +(rule (vecop_float_cmph (multi_lane 32 4)) (VecFloatCmpOp.CmpHi32x4)) +(rule (vecop_float_cmph (multi_lane 64 2)) (VecFloatCmpOp.CmpHi64x2)) + +(decl vec_fcmph (Type Reg Reg) Reg) +(rule (vec_fcmph (ty_vec128 ty) x y) (vec_float_cmp ty (vecop_float_cmph ty) x y)) +(decl vec_fcmphs (Type Reg Reg) ProducesFlags) +(rule (vec_fcmphs (ty_vec128 ty) x y) (vec_float_cmps ty (vecop_float_cmph ty) x y)) + +(decl vecop_float_cmphe (Type) VecFloatCmpOp) +(rule (vecop_float_cmphe (multi_lane 32 4)) (VecFloatCmpOp.CmpHiEq32x4)) +(rule (vecop_float_cmphe (multi_lane 64 2)) (VecFloatCmpOp.CmpHiEq64x2)) + +(decl vec_fcmphe (Type Reg Reg) Reg) +(rule (vec_fcmphe (ty_vec128 ty) x y) (vec_float_cmp ty (vecop_float_cmphe ty) x y)) +(decl vec_fcmphes (Type Reg Reg) ProducesFlags) +(rule (vec_fcmphes (ty_vec128 ty) x y) (vec_float_cmps ty (vecop_float_cmphe ty) x y)) + + ;; Implicit conversions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (convert WritableRegPair RegPair writable_regpair_to_regpair) diff --git a/cranelift/codegen/src/isa/s390x/inst/emit.rs b/cranelift/codegen/src/isa/s390x/inst/emit.rs index a75e6ffaf334..3b7832b7ea58 100644 --- a/cranelift/codegen/src/isa/s390x/inst/emit.rs +++ b/cranelift/codegen/src/isa/s390x/inst/emit.rs @@ -852,6 +852,74 @@ fn enc_siy(opcode: u16, b1: Reg, d1: u32, i2: u8) -> [u8; 6] { enc } +/// VRIa-type instructions. +/// +/// 47 39 35 31 15 11 7 +/// opcode1 v1 - i2 m3 rxb opcode2 +/// 40 36 32 16 12 8 0 +/// +fn enc_vri_a(opcode: u16, v1: Reg, i2: u16, m3: u8) -> [u8; 6] { + let opcode1 = ((opcode >> 8) & 0xff) as u8; + let opcode2 = (opcode & 0xff) as u8; + let rxb = rxb(Some(v1), None, None, None); + let v1 = machreg_to_vr(v1) & 0x0f; + let m3 = m3 & 0x0f; + + let mut enc: [u8; 6] = [0; 6]; + enc[0] = opcode1; + enc[1] = v1 << 4; + enc[2..4].copy_from_slice(&i2.to_be_bytes()); + enc[4] = m3 << 4 | rxb; + enc[5] = opcode2; + enc +} + +/// VRIb-type instructions. +/// +/// 47 39 35 31 23 15 11 7 +/// opcode1 v1 - i2 i3 m4 rxb opcode2 +/// 40 36 32 24 16 12 8 0 +/// +fn enc_vri_b(opcode: u16, v1: Reg, i2: u8, i3: u8, m4: u8) -> [u8; 6] { + let opcode1 = ((opcode >> 8) & 0xff) as u8; + let opcode2 = (opcode & 0xff) as u8; + let rxb = rxb(Some(v1), None, None, None); + let v1 = machreg_to_vr(v1) & 0x0f; + let m4 = m4 & 0x0f; + + let mut enc: [u8; 6] = [0; 6]; + enc[0] = opcode1; + enc[1] = v1 << 4; + enc[2] = i2; + enc[3] = i3; + enc[4] = m4 << 4 | rxb; + enc[5] = opcode2; + enc +} + +/// VRIc-type instructions. 
+/// +/// 47 39 35 31 15 11 7 +/// opcode1 v1 v3 i2 m4 rxb opcode2 +/// 40 36 32 16 12 8 0 +/// +fn enc_vri_c(opcode: u16, v1: Reg, i2: u16, v3: Reg, m4: u8) -> [u8; 6] { + let opcode1 = ((opcode >> 8) & 0xff) as u8; + let opcode2 = (opcode & 0xff) as u8; + let rxb = rxb(Some(v1), Some(v3), None, None); + let v1 = machreg_to_vr(v1) & 0x0f; + let v3 = machreg_to_vr(v3) & 0x0f; + let m4 = m4 & 0x0f; + + let mut enc: [u8; 6] = [0; 6]; + enc[0] = opcode1; + enc[1] = v1 << 4 | v3; + enc[2..4].copy_from_slice(&i2.to_be_bytes()); + enc[4] = m4 << 4 | rxb; + enc[5] = opcode2; + enc +} + /// VRRa-type instructions. /// /// 47 39 35 31 23 19 15 11 7 @@ -878,6 +946,32 @@ fn enc_vrr_a(opcode: u16, v1: Reg, v2: Reg, m3: u8, m4: u8, m5: u8) -> [u8; 6] { enc } +/// VRRb-type instructions. +/// +/// 47 39 35 31 27 23 19 15 11 7 +/// opcode1 v1 v2 v3 - m5 - m4 rxb opcode2 +/// 40 36 32 28 24 20 16 12 8 0 +/// +fn enc_vrr_b(opcode: u16, v1: Reg, v2: Reg, v3: Reg, m4: u8, m5: u8) -> [u8; 6] { + let opcode1 = ((opcode >> 8) & 0xff) as u8; + let opcode2 = (opcode & 0xff) as u8; + let rxb = rxb(Some(v1), Some(v2), Some(v3), None); + let v1 = machreg_to_vr(v1) & 0x0f; + let v2 = machreg_to_vr(v2) & 0x0f; + let v3 = machreg_to_vr(v3) & 0x0f; + let m4 = m4 & 0x0f; + let m5 = m5 & 0x0f; + + let mut enc: [u8; 6] = [0; 6]; + enc[0] = opcode1; + enc[1] = v1 << 4 | v2; + enc[2] = v3 << 4; + enc[3] = m5 << 4; + enc[4] = m4 << 4 | rxb; + enc[5] = opcode2; + enc +} + /// VRRc-type instructions. /// /// 47 39 35 31 27 23 19 15 11 7 @@ -932,6 +1026,56 @@ fn enc_vrr_e(opcode: u16, v1: Reg, v2: Reg, v3: Reg, v4: Reg, m5: u8, m6: u8) -> enc } +/// VRRf-type instructions. +/// +/// 47 39 35 31 27 11 7 +/// opcode1 v1 r2 r3 - rxb opcode2 +/// 40 36 32 28 12 8 0 +/// +fn enc_vrr_f(opcode: u16, v1: Reg, r2: Reg, r3: Reg) -> [u8; 6] { + let opcode1 = ((opcode >> 8) & 0xff) as u8; + let opcode2 = (opcode & 0xff) as u8; + let rxb = rxb(Some(v1), None, None, None); + let v1 = machreg_to_vr(v1) & 0x0f; + let r2 = machreg_to_gpr(r2) & 0x0f; + let r3 = machreg_to_gpr(r3) & 0x0f; + + let mut enc: [u8; 6] = [0; 6]; + enc[0] = opcode1; + enc[1] = v1 << 4 | r2; + enc[2] = r3 << 4; + enc[4] = rxb; + enc[5] = opcode2; + enc +} + +/// VRSa-type instructions. +/// +/// 47 39 35 31 27 15 11 7 +/// opcode1 v1 v3 b2 d2 m4 rxb opcode2 +/// 40 36 32 28 16 12 8 0 +/// +fn enc_vrs_a(opcode: u16, v1: Reg, b2: Reg, d2: u32, v3: Reg, m4: u8) -> [u8; 6] { + let opcode1 = ((opcode >> 8) & 0xff) as u8; + let opcode2 = (opcode & 0xff) as u8; + let rxb = rxb(Some(v1), Some(v3), None, None); + let v1 = machreg_to_vr(v1) & 0x0f; + let b2 = machreg_to_gpr(b2) & 0x0f; + let v3 = machreg_to_vr(v3) & 0x0f; + let d2_lo = (d2 & 0xff) as u8; + let d2_hi = ((d2 >> 8) & 0x0f) as u8; + let m4 = m4 & 0x0f; + + let mut enc: [u8; 6] = [0; 6]; + enc[0] = opcode1; + enc[1] = v1 << 4 | v3; + enc[2] = b2 << 4 | d2_hi; + enc[3] = d2_lo; + enc[4] = m4 << 4 | rxb; + enc[5] = opcode2; + enc +} + /// VRSb-type instructions. /// /// 47 39 35 31 27 15 11 7 @@ -1834,29 +1978,6 @@ impl MachInstEmit for Inst { rd, &mem, opcode_rx, opcode_rxy, opcode_ril, true, sink, emit_info, state, ); } - &Inst::FpuLoad32 { rd, ref mem } - | &Inst::FpuLoad64 { rd, ref mem } - | &Inst::FpuLoadRev32 { rd, ref mem } - | &Inst::FpuLoadRev64 { rd, ref mem } => { - let rd = allocs.next_writable(rd); - let mem = mem.with_allocs(&mut allocs); - - let (opcode_rx, opcode_rxy, opcode_vrx) = match self { - &Inst::FpuLoad32 { .. } => (Some(0x78), Some(0xed64), 0xe703), // LE(Y), VLEF - &Inst::FpuLoad64 { .. 
} => (Some(0x68), Some(0xed65), 0xe702), // LD(Y), VLEG - &Inst::FpuLoadRev32 { .. } => (None, None, 0xe603), // VLEBRF - &Inst::FpuLoadRev64 { .. } => (None, None, 0xe602), // VLEBRG - _ => unreachable!(), - }; - let rd = rd.to_reg(); - if is_fpr(rd) && opcode_rx.is_some() { - mem_emit( - rd, &mem, opcode_rx, opcode_rxy, None, true, sink, emit_info, state, - ); - } else { - mem_vrx_emit(rd, &mem, opcode_vrx, 0, true, sink, emit_info, state); - } - } &Inst::Store8 { rd, ref mem } | &Inst::Store16 { rd, ref mem } @@ -1904,28 +2025,6 @@ impl MachInstEmit for Inst { }; mem_imm16_emit(imm, &mem, opcode, true, sink, emit_info, state); } - &Inst::FpuStore32 { rd, ref mem } - | &Inst::FpuStore64 { rd, ref mem } - | &Inst::FpuStoreRev32 { rd, ref mem } - | &Inst::FpuStoreRev64 { rd, ref mem } => { - let rd = allocs.next(rd); - let mem = mem.with_allocs(&mut allocs); - - let (opcode_rx, opcode_rxy, opcode_vrx) = match self { - &Inst::FpuStore32 { .. } => (Some(0x70), Some(0xed66), 0xe70b), // STE(Y), VSTEF - &Inst::FpuStore64 { .. } => (Some(0x60), Some(0xed67), 0xe70a), // STD(Y), VSTEG - &Inst::FpuStoreRev32 { .. } => (None, None, 0xe60b), // VSTEBRF - &Inst::FpuStoreRev64 { .. } => (None, None, 0xe60a), // VSTEBRG - _ => unreachable!(), - }; - if is_fpr(rd) && opcode_rx.is_some() { - mem_emit( - rd, &mem, opcode_rx, opcode_rxy, None, true, sink, emit_info, state, - ); - } else { - mem_vrx_emit(rd, &mem, opcode_vrx, 0, true, sink, emit_info, state); - } - } &Inst::LoadMultiple64 { rt, rt2, ref mem } => { let mem = mem.with_allocs(&mut allocs); @@ -2168,44 +2267,6 @@ impl MachInstEmit for Inst { put(sink, &enc_vrr_a(opcode, rd.to_reg(), rm, 0, 0, 0)); } } - &Inst::MovToFpr32 { rd, rn } => { - let rd = allocs.next_writable(rd); - let rn = allocs.next(rn); - - let (opcode, m4) = (0xe722, 2); // VLVG - put(sink, &enc_vrs_b(opcode, rd.to_reg(), zero_reg(), 0, rn, m4)); - } - &Inst::MovToFpr64 { rd, rn } => { - let rd = allocs.next_writable(rd); - let rn = allocs.next(rn); - - if is_fpr(rd.to_reg()) { - let opcode = 0xb3c1; // LDGR - put(sink, &enc_rre(opcode, rd.to_reg(), rn)); - } else { - let (opcode, m4) = (0xe722, 3); // VLVG - put(sink, &enc_vrs_b(opcode, rd.to_reg(), zero_reg(), 0, rn, m4)); - } - } - &Inst::MovFromFpr32 { rd, rn } => { - let rd = allocs.next_writable(rd); - let rn = allocs.next(rn); - - let (opcode, m4) = (0xe721, 2); // VLGV - put(sink, &enc_vrs_c(opcode, rd.to_reg(), zero_reg(), 0, rn, m4)); - } - &Inst::MovFromFpr64 { rd, rn } => { - let rd = allocs.next_writable(rd); - let rn = allocs.next(rn); - - if is_fpr(rn) { - let opcode = 0xb3cd; // LGDR - put(sink, &enc_rre(opcode, rd.to_reg(), rn)); - } else { - let (opcode, m4) = (0xe721, 3); // VLVG - put(sink, &enc_vrs_c(opcode, rd.to_reg(), zero_reg(), 0, rn, m4)); - } - } &Inst::LoadFpuConst32 { rd, const_data } => { let rd = allocs.next_writable(rd); @@ -2213,9 +2274,11 @@ impl MachInstEmit for Inst { let reg = writable_spilltmp_reg().to_reg(); put(sink, &enc_ri_b(opcode, reg, 8)); sink.put4(const_data.swap_bytes()); - let inst = Inst::FpuLoad32 { + let inst = Inst::VecLoadLaneUndef { + size: 32, rd, mem: MemArg::reg(reg, MemFlags::trusted()), + lane_imm: 0, }; inst.emit(&[], sink, emit_info, state); } @@ -2226,9 +2289,11 @@ impl MachInstEmit for Inst { let reg = writable_spilltmp_reg().to_reg(); put(sink, &enc_ri_b(opcode, reg, 12)); sink.put8(const_data.swap_bytes()); - let inst = Inst::FpuLoad64 { + let inst = Inst::VecLoadLaneUndef { + size: 64, rd, mem: MemArg::reg(reg, MemFlags::trusted()), + lane_imm: 0, }; 
inst.emit(&[], sink, emit_info, state); } @@ -2236,21 +2301,30 @@ impl MachInstEmit for Inst { let rd = allocs.next_writable(rd); let rn = allocs.next(rn); - let (opcode, m3, m5, opcode_fpr) = match fpu_op { - FPUOp1::Abs32 => (0xe7cc, 2, 2, 0xb300), // VFPSO, LPEBR - FPUOp1::Abs64 => (0xe7cc, 3, 2, 0xb310), // VFPSO, LPDBR - FPUOp1::Neg32 => (0xe7cc, 2, 0, 0xb303), // VFPSO, LCEBR - FPUOp1::Neg64 => (0xe7cc, 3, 0, 0xb313), // VFPSO, LCDBR - FPUOp1::NegAbs32 => (0xe7cc, 2, 1, 0xb301), // VFPSO, LNEBR - FPUOp1::NegAbs64 => (0xe7cc, 3, 1, 0xb311), // VFPSO, LNDBR - FPUOp1::Sqrt32 => (0xe7ce, 2, 0, 0xb314), // VFSQ, SQEBR - FPUOp1::Sqrt64 => (0xe7ce, 3, 0, 0xb315), // VFSQ, SQDBR - FPUOp1::Cvt32To64 => (0xe7c4, 2, 0, 0xb304), // VFLL, LDEBR + let (opcode, m3, m4, m5, opcode_fpr) = match fpu_op { + FPUOp1::Abs32 => (0xe7cc, 2, 8, 2, Some(0xb300)), // WFPSO, LPEBR + FPUOp1::Abs64 => (0xe7cc, 3, 8, 2, Some(0xb310)), // WFPSO, LPDBR + FPUOp1::Abs32x4 => (0xe7cc, 2, 0, 2, None), // VFPSO + FPUOp1::Abs64x2 => (0xe7cc, 3, 0, 2, None), // VFPSO + FPUOp1::Neg32 => (0xe7cc, 2, 8, 0, Some(0xb303)), // WFPSO, LCEBR + FPUOp1::Neg64 => (0xe7cc, 3, 8, 0, Some(0xb313)), // WFPSO, LCDBR + FPUOp1::Neg32x4 => (0xe7cc, 2, 0, 0, None), // VFPSO + FPUOp1::Neg64x2 => (0xe7cc, 3, 0, 0, None), // VFPSO + FPUOp1::NegAbs32 => (0xe7cc, 2, 8, 1, Some(0xb301)), // WFPSO, LNEBR + FPUOp1::NegAbs64 => (0xe7cc, 3, 8, 1, Some(0xb311)), // WFPSO, LNDBR + FPUOp1::NegAbs32x4 => (0xe7cc, 2, 0, 1, None), // VFPSO + FPUOp1::NegAbs64x2 => (0xe7cc, 3, 0, 1, None), // VFPSO + FPUOp1::Sqrt32 => (0xe7ce, 2, 8, 0, Some(0xb314)), // WFSQ, SQEBR + FPUOp1::Sqrt64 => (0xe7ce, 3, 8, 0, Some(0xb315)), // WFSQ, SQDBR + FPUOp1::Sqrt32x4 => (0xe7ce, 2, 0, 0, None), // VFSQ + FPUOp1::Sqrt64x2 => (0xe7ce, 3, 0, 0, None), // VFSQ + FPUOp1::Cvt32To64 => (0xe7c4, 2, 8, 0, Some(0xb304)), // WFLL, LDEBR + FPUOp1::Cvt32x4To64x2 => (0xe7c4, 2, 0, 0, None), // VFLL }; - if is_fpr(rd.to_reg()) && is_fpr(rn) { - put(sink, &enc_rre(opcode_fpr, rd.to_reg(), rn)); + if m4 == 8 && is_fpr(rd.to_reg()) && is_fpr(rn) { + put(sink, &enc_rre(opcode_fpr.unwrap(), rd.to_reg(), rn)); } else { - put(sink, &enc_vrr_a(opcode, rd.to_reg(), rn, m3, 8, m5)); + put(sink, &enc_vrr_a(opcode, rd.to_reg(), rn, m3, m4, m5)); } } &Inst::FpuRRR { fpu_op, rd, rn, rm } => { @@ -2258,24 +2332,45 @@ impl MachInstEmit for Inst { let rn = allocs.next(rn); let rm = allocs.next(rm); - let (opcode, m4, m6, opcode_fpr) = match fpu_op { - FPUOp2::Add32 => (0xe7e3, 2, 0, Some(0xb30a)), // VFA, AEBR - FPUOp2::Add64 => (0xe7e3, 3, 0, Some(0xb31a)), // VFA, ADBR - FPUOp2::Sub32 => (0xe7e2, 2, 0, Some(0xb30b)), // VFS, SEBR - FPUOp2::Sub64 => (0xe7e2, 3, 0, Some(0xb31b)), // VFS, SDBR - FPUOp2::Mul32 => (0xe7e7, 2, 0, Some(0xb317)), // VFM, MEEBR - FPUOp2::Mul64 => (0xe7e7, 3, 0, Some(0xb31c)), // VFM, MDBR - FPUOp2::Div32 => (0xe7e5, 2, 0, Some(0xb30d)), // VFD, DEBR - FPUOp2::Div64 => (0xe7e5, 3, 0, Some(0xb31d)), // VFD, DDBR - FPUOp2::Max32 => (0xe7ef, 2, 1, None), // VFMAX - FPUOp2::Max64 => (0xe7ef, 3, 1, None), // VFMAX - FPUOp2::Min32 => (0xe7ee, 2, 1, None), // VFMIN - FPUOp2::Min64 => (0xe7ee, 3, 1, None), // VFMIN + let (opcode, m4, m5, m6, opcode_fpr) = match fpu_op { + FPUOp2::Add32 => (0xe7e3, 2, 8, 0, Some(0xb30a)), // WFA, AEBR + FPUOp2::Add64 => (0xe7e3, 3, 8, 0, Some(0xb31a)), // WFA, ADBR + FPUOp2::Add32x4 => (0xe7e3, 2, 0, 0, None), // VFA + FPUOp2::Add64x2 => (0xe7e3, 3, 0, 0, None), // VFA + FPUOp2::Sub32 => (0xe7e2, 2, 8, 0, Some(0xb30b)), // WFS, SEBR + FPUOp2::Sub64 => 
(0xe7e2, 3, 8, 0, Some(0xb31b)), // WFS, SDBR + FPUOp2::Sub32x4 => (0xe7e2, 2, 0, 0, None), // VFS + FPUOp2::Sub64x2 => (0xe7e2, 3, 0, 0, None), // VFS + FPUOp2::Mul32 => (0xe7e7, 2, 8, 0, Some(0xb317)), // WFM, MEEBR + FPUOp2::Mul64 => (0xe7e7, 3, 8, 0, Some(0xb31c)), // WFM, MDBR + FPUOp2::Mul32x4 => (0xe7e7, 2, 0, 0, None), // VFM + FPUOp2::Mul64x2 => (0xe7e7, 3, 0, 0, None), // VFM + FPUOp2::Div32 => (0xe7e5, 2, 8, 0, Some(0xb30d)), // WFD, DEBR + FPUOp2::Div64 => (0xe7e5, 3, 8, 0, Some(0xb31d)), // WFD, DDBR + FPUOp2::Div32x4 => (0xe7e5, 2, 0, 0, None), // VFD + FPUOp2::Div64x2 => (0xe7e5, 3, 0, 0, None), // VFD + FPUOp2::Max32 => (0xe7ef, 2, 8, 1, None), // WFMAX + FPUOp2::Max64 => (0xe7ef, 3, 8, 1, None), // WFMAX + FPUOp2::Max32x4 => (0xe7ef, 2, 0, 1, None), // VFMAX + FPUOp2::Max64x2 => (0xe7ef, 3, 0, 1, None), // VFMAX + FPUOp2::Min32 => (0xe7ee, 2, 8, 1, None), // WFMIN + FPUOp2::Min64 => (0xe7ee, 3, 8, 1, None), // WFMIN + FPUOp2::Min32x4 => (0xe7ee, 2, 0, 1, None), // VFMIN + FPUOp2::Min64x2 => (0xe7ee, 3, 0, 1, None), // VFMIN + FPUOp2::MaxPseudo32 => (0xe7ef, 2, 8, 3, None), // WFMAX + FPUOp2::MaxPseudo64 => (0xe7ef, 3, 8, 3, None), // WFMAX + FPUOp2::MaxPseudo32x4 => (0xe7ef, 2, 0, 3, None), // VFMAX + FPUOp2::MaxPseudo64x2 => (0xe7ef, 3, 0, 3, None), // VFMAX + FPUOp2::MinPseudo32 => (0xe7ee, 2, 8, 3, None), // WFMIN + FPUOp2::MinPseudo64 => (0xe7ee, 3, 8, 3, None), // WFMIN + FPUOp2::MinPseudo32x4 => (0xe7ee, 2, 0, 3, None), // VFMIN + FPUOp2::MinPseudo64x2 => (0xe7ee, 3, 0, 3, None), // VFMIN }; - if opcode_fpr.is_some() && rd.to_reg() == rn && is_fpr(rn) && is_fpr(rm) { + if m5 == 8 && opcode_fpr.is_some() && rd.to_reg() == rn && is_fpr(rn) && is_fpr(rm) + { put(sink, &enc_rre(opcode_fpr.unwrap(), rd.to_reg(), rm)); } else { - put(sink, &enc_vrr_c(opcode, rd.to_reg(), rn, rm, m4, 8, m6)); + put(sink, &enc_vrr_c(opcode, rd.to_reg(), rn, rm, m4, m5, m6)); } } &Inst::FpuRRRR { @@ -2290,16 +2385,20 @@ impl MachInstEmit for Inst { let rm = allocs.next(rm); let ra = allocs.next(ra); - let (opcode, m6, opcode_fpr) = match fpu_op { - FPUOp3::MAdd32 => (0xe78f, 2, 0xb30e), // VFMA, MAEBR - FPUOp3::MAdd64 => (0xe78f, 3, 0xb31e), // VFMA, MADBR - FPUOp3::MSub32 => (0xe78e, 2, 0xb30f), // VFMS, MSEBR - FPUOp3::MSub64 => (0xe78e, 3, 0xb31f), // VFMS, MSDBR + let (opcode, m5, m6, opcode_fpr) = match fpu_op { + FPUOp3::MAdd32 => (0xe78f, 8, 2, Some(0xb30e)), // WFMA, MAEBR + FPUOp3::MAdd64 => (0xe78f, 8, 3, Some(0xb31e)), // WFMA, MADBR + FPUOp3::MAdd32x4 => (0xe78f, 0, 2, None), // VFMA + FPUOp3::MAdd64x2 => (0xe78f, 0, 3, None), // VFMA + FPUOp3::MSub32 => (0xe78e, 8, 2, Some(0xb30f)), // WFMS, MSEBR + FPUOp3::MSub64 => (0xe78e, 8, 3, Some(0xb31f)), // WFMS, MSDBR + FPUOp3::MSub32x4 => (0xe78e, 0, 2, None), // VFMS + FPUOp3::MSub64x2 => (0xe78e, 0, 3, None), // VFMS }; - if rd.to_reg() == ra && is_fpr(rn) && is_fpr(rm) && is_fpr(ra) { - put(sink, &enc_rrd(opcode_fpr, rd.to_reg(), rm, rn)); + if m5 == 8 && rd.to_reg() == ra && is_fpr(rn) && is_fpr(rm) && is_fpr(ra) { + put(sink, &enc_rrd(opcode_fpr.unwrap(), rd.to_reg(), rm, rn)); } else { - put(sink, &enc_vrr_e(opcode, rd.to_reg(), rn, rm, ra, 8, m6)); + put(sink, &enc_vrr_e(opcode, rd.to_reg(), rn, rm, ra, m5, m6)); } } &Inst::FpuRound { op, mode, rd, rn } => { @@ -2315,26 +2414,37 @@ impl MachInstEmit for Inst { FpuRoundMode::ToPosInfinity => 6, FpuRoundMode::ToNegInfinity => 7, }; - let (opcode, m3, opcode_fpr) = match op { - FpuRoundOp::Cvt64To32 => (0xe7c5, 3, Some(0xb344)), // VFLR, LEDBR(A) - FpuRoundOp::Round32 => (0xe7c7, 
2, Some(0xb357)), // VFI, FIEBR - FpuRoundOp::Round64 => (0xe7c7, 3, Some(0xb35f)), // VFI, FIDBR - FpuRoundOp::ToSInt32 => (0xe7c2, 2, None), // VCSFP - FpuRoundOp::ToSInt64 => (0xe7c2, 3, None), // VCSFP - FpuRoundOp::ToUInt32 => (0xe7c0, 2, None), // VCLFP - FpuRoundOp::ToUInt64 => (0xe7c0, 3, None), // VCLFP - FpuRoundOp::FromSInt32 => (0xe7c3, 2, None), // VCFPS - FpuRoundOp::FromSInt64 => (0xe7c3, 3, None), // VCFPS - FpuRoundOp::FromUInt32 => (0xe7c1, 2, None), // VCFPL - FpuRoundOp::FromUInt64 => (0xe7c1, 3, None), // VCFPL + let (opcode, m3, m4, opcode_fpr) = match op { + FpuRoundOp::Cvt64To32 => (0xe7c5, 3, 8, Some(0xb344)), // WFLR, LEDBR(A) + FpuRoundOp::Cvt64x2To32x4 => (0xe7c5, 3, 0, None), // VFLR + FpuRoundOp::Round32 => (0xe7c7, 2, 8, Some(0xb357)), // WFI, FIEBR + FpuRoundOp::Round64 => (0xe7c7, 3, 8, Some(0xb35f)), // WFI, FIDBR + FpuRoundOp::Round32x4 => (0xe7c7, 2, 0, None), // VFI + FpuRoundOp::Round64x2 => (0xe7c7, 3, 0, None), // VFI + FpuRoundOp::ToSInt32 => (0xe7c2, 2, 8, None), // WCSFP + FpuRoundOp::ToSInt64 => (0xe7c2, 3, 8, None), // WCSFP + FpuRoundOp::ToUInt32 => (0xe7c0, 2, 8, None), // WCLFP + FpuRoundOp::ToUInt64 => (0xe7c0, 3, 8, None), // WCLFP + FpuRoundOp::ToSInt32x4 => (0xe7c2, 2, 0, None), // VCSFP + FpuRoundOp::ToSInt64x2 => (0xe7c2, 3, 0, None), // VCSFP + FpuRoundOp::ToUInt32x4 => (0xe7c0, 2, 0, None), // VCLFP + FpuRoundOp::ToUInt64x2 => (0xe7c0, 3, 0, None), // VCLFP + FpuRoundOp::FromSInt32 => (0xe7c3, 2, 8, None), // WCFPS + FpuRoundOp::FromSInt64 => (0xe7c3, 3, 8, None), // WCFPS + FpuRoundOp::FromUInt32 => (0xe7c1, 2, 8, None), // WCFPL + FpuRoundOp::FromUInt64 => (0xe7c1, 3, 8, None), // WCFPL + FpuRoundOp::FromSInt32x4 => (0xe7c3, 2, 0, None), // VCFPS + FpuRoundOp::FromSInt64x2 => (0xe7c3, 3, 0, None), // VCFPS + FpuRoundOp::FromUInt32x4 => (0xe7c1, 2, 0, None), // VCFPL + FpuRoundOp::FromUInt64x2 => (0xe7c1, 3, 0, None), // VCFPL }; - if opcode_fpr.is_some() && is_fpr(rd.to_reg()) && is_fpr(rn) { + if m4 == 8 && opcode_fpr.is_some() && is_fpr(rd.to_reg()) && is_fpr(rn) { put( sink, &enc_rrf_cde(opcode_fpr.unwrap(), rd.to_reg(), rn, mode, 0), ); } else { - put(sink, &enc_vrr_a(opcode, rd.to_reg(), rn, m3, 8, mode)); + put(sink, &enc_vrr_a(opcode, rd.to_reg(), rn, m3, m4, mode)); } } &Inst::FpuCmp32 { rn, rm } => { @@ -2361,6 +2471,169 @@ impl MachInstEmit for Inst { put(sink, &enc_vrr_a(opcode, rn, rm, 3, 0, 0)); } } + + &Inst::VecRRR { op, rd, rn, rm } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + let rm = allocs.next(rm); + + let (opcode, m4) = match op { + VecBinaryOp::Add8x16 => (0xe7f3, 0), // VAB + VecBinaryOp::Add16x8 => (0xe7f3, 1), // VAH + VecBinaryOp::Add32x4 => (0xe7f3, 2), // VAF + VecBinaryOp::Add64x2 => (0xe7f3, 3), // VAG + VecBinaryOp::Sub8x16 => (0xe7f7, 0), // VSB + VecBinaryOp::Sub16x8 => (0xe7f7, 1), // VSH + VecBinaryOp::Sub32x4 => (0xe7f7, 2), // VSF + VecBinaryOp::Sub64x2 => (0xe7f7, 3), // VSG + VecBinaryOp::Mul8x16 => (0xe7a2, 0), // VMLB + VecBinaryOp::Mul16x8 => (0xe7a2, 1), // VMLHW + VecBinaryOp::Mul32x4 => (0xe7a2, 2), // VMLF + VecBinaryOp::UMulHi8x16 => (0xe7a1, 0), // VMLHB + VecBinaryOp::UMulHi16x8 => (0xe7a1, 1), // VMLHH + VecBinaryOp::UMulHi32x4 => (0xe7a1, 2), // VMLHF + VecBinaryOp::SMulHi8x16 => (0xe7a3, 0), // VMHB + VecBinaryOp::SMulHi16x8 => (0xe7a3, 1), // VMHH + VecBinaryOp::SMulHi32x4 => (0xe7a3, 2), // VMHF + VecBinaryOp::UMulEven8x16 => (0xe7a4, 0), // VMLEB + VecBinaryOp::UMulEven16x8 => (0xe7a4, 1), // VMLEH + VecBinaryOp::UMulEven32x4 => (0xe7a4, 2), // VMLEF + 
VecBinaryOp::SMulEven8x16 => (0xe7a6, 0), // VMEB + VecBinaryOp::SMulEven16x8 => (0xe7a6, 1), // VMEH + VecBinaryOp::SMulEven32x4 => (0xe7a6, 2), // VMEF + VecBinaryOp::UMulOdd8x16 => (0xe7a5, 0), // VMLOB + VecBinaryOp::UMulOdd16x8 => (0xe7a5, 1), // VMLOH + VecBinaryOp::UMulOdd32x4 => (0xe7a5, 2), // VMLOF + VecBinaryOp::SMulOdd8x16 => (0xe7a7, 0), // VMOB + VecBinaryOp::SMulOdd16x8 => (0xe7a7, 1), // VMOH + VecBinaryOp::SMulOdd32x4 => (0xe7a7, 2), // VMOF + VecBinaryOp::UMax8x16 => (0xe7fd, 0), // VMXLB + VecBinaryOp::UMax16x8 => (0xe7fd, 1), // VMXLH + VecBinaryOp::UMax32x4 => (0xe7fd, 2), // VMXLF + VecBinaryOp::UMax64x2 => (0xe7fd, 3), // VMXLG + VecBinaryOp::SMax8x16 => (0xe7ff, 0), // VMXB + VecBinaryOp::SMax16x8 => (0xe7ff, 1), // VMXH + VecBinaryOp::SMax32x4 => (0xe7ff, 2), // VMXF + VecBinaryOp::SMax64x2 => (0xe7ff, 3), // VMXG + VecBinaryOp::UMin8x16 => (0xe7fc, 0), // VMNLB + VecBinaryOp::UMin16x8 => (0xe7fc, 1), // VMNLH + VecBinaryOp::UMin32x4 => (0xe7fc, 2), // VMNLF + VecBinaryOp::UMin64x2 => (0xe7fc, 3), // VMNLG + VecBinaryOp::SMin8x16 => (0xe7fe, 0), // VMNB + VecBinaryOp::SMin16x8 => (0xe7fe, 1), // VMNH + VecBinaryOp::SMin32x4 => (0xe7fe, 2), // VMNF + VecBinaryOp::SMin64x2 => (0xe7fe, 3), // VMNG + VecBinaryOp::UAvg8x16 => (0xe7f0, 0), // VAVGLB + VecBinaryOp::UAvg16x8 => (0xe7f0, 1), // VAVGLH + VecBinaryOp::UAvg32x4 => (0xe7f0, 2), // VAVGLF + VecBinaryOp::UAvg64x2 => (0xe7f0, 3), // VAVGLG + VecBinaryOp::SAvg8x16 => (0xe7f2, 0), // VAVGB + VecBinaryOp::SAvg16x8 => (0xe7f2, 1), // VAVGH + VecBinaryOp::SAvg32x4 => (0xe7f2, 2), // VAVGF + VecBinaryOp::SAvg64x2 => (0xe7f2, 3), // VAVGG + VecBinaryOp::And128 => (0xe768, 0), // VN + VecBinaryOp::Orr128 => (0xe76a, 0), // VO + VecBinaryOp::Xor128 => (0xe76d, 0), // VX + VecBinaryOp::NotAnd128 => (0xe76e, 0), // VNN + VecBinaryOp::NotOrr128 => (0xe76b, 0), // VNO + VecBinaryOp::NotXor128 => (0xe76c, 0), // VNX + VecBinaryOp::AndNot128 => (0xe769, 0), // VNC + VecBinaryOp::OrrNot128 => (0xe76f, 0), // VOC + VecBinaryOp::BitPermute128 => (0xe785, 0), // VBPERM + VecBinaryOp::LShLByByte128 => (0xe775, 0), // VSLB + VecBinaryOp::LShRByByte128 => (0xe77d, 0), // VSRLB + VecBinaryOp::AShRByByte128 => (0xe77f, 0), // VSRAB + VecBinaryOp::LShLByBit128 => (0xe774, 0), // VSL + VecBinaryOp::LShRByBit128 => (0xe77c, 0), // VSRL + VecBinaryOp::AShRByBit128 => (0xe77e, 0), // VSRA + VecBinaryOp::Pack16x8 => (0xe794, 1), // VPKH + VecBinaryOp::Pack32x4 => (0xe794, 2), // VPKF + VecBinaryOp::Pack64x2 => (0xe794, 3), // VPKG + VecBinaryOp::PackUSat16x8 => (0xe795, 1), // VPKLSH + VecBinaryOp::PackUSat32x4 => (0xe795, 2), // VPKLSF + VecBinaryOp::PackUSat64x2 => (0xe795, 3), // VPKLSG + VecBinaryOp::PackSSat16x8 => (0xe797, 1), // VPKSH + VecBinaryOp::PackSSat32x4 => (0xe797, 2), // VPKSF + VecBinaryOp::PackSSat64x2 => (0xe797, 3), // VPKSG + VecBinaryOp::MergeLow8x16 => (0xe760, 0), // VMRLB + VecBinaryOp::MergeLow16x8 => (0xe760, 1), // VMRLH + VecBinaryOp::MergeLow32x4 => (0xe760, 2), // VMRLF + VecBinaryOp::MergeLow64x2 => (0xe760, 3), // VMRLG + VecBinaryOp::MergeHigh8x16 => (0xe761, 0), // VMRHB + VecBinaryOp::MergeHigh16x8 => (0xe761, 1), // VMRHH + VecBinaryOp::MergeHigh32x4 => (0xe761, 2), // VMRHF + VecBinaryOp::MergeHigh64x2 => (0xe761, 3), // VMRHG + }; + + put(sink, &enc_vrr_c(opcode, rd.to_reg(), rn, rm, m4, 0, 0)); + } + &Inst::VecRR { op, rd, rn } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + + let (opcode, m3) = match op { + VecUnaryOp::Abs8x16 => (0xe7df, 0), // VLPB + VecUnaryOp::Abs16x8 => 
(0xe7df, 1), // VLPH + VecUnaryOp::Abs32x4 => (0xe7df, 2), // VLPF + VecUnaryOp::Abs64x2 => (0xe7df, 3), // VLPG + VecUnaryOp::Neg8x16 => (0xe7de, 0), // VLCB + VecUnaryOp::Neg16x8 => (0xe7de, 1), // VLCH + VecUnaryOp::Neg32x4 => (0xe7de, 2), // VLCF + VecUnaryOp::Neg64x2 => (0xe7de, 3), // VLCG + VecUnaryOp::Popcnt8x16 => (0xe750, 0), // VPOPCTB + VecUnaryOp::Popcnt16x8 => (0xe750, 1), // VPOPCTH + VecUnaryOp::Popcnt32x4 => (0xe750, 2), // VPOPCTF + VecUnaryOp::Popcnt64x2 => (0xe750, 3), // VPOPCTG + VecUnaryOp::UnpackULow8x16 => (0xe7d4, 0), // VUPLLB + VecUnaryOp::UnpackULow16x8 => (0xe7d4, 1), // VUPLLH + VecUnaryOp::UnpackULow32x4 => (0xe7d4, 2), // VUPLLF + VecUnaryOp::UnpackUHigh8x16 => (0xe7d5, 0), // VUPLHB + VecUnaryOp::UnpackUHigh16x8 => (0xe7d5, 1), // VUPLHH + VecUnaryOp::UnpackUHigh32x4 => (0xe7d5, 2), // VUPLHF + VecUnaryOp::UnpackSLow8x16 => (0xe7d6, 0), // VUPLB + VecUnaryOp::UnpackSLow16x8 => (0xe7d6, 1), // VUPLH + VecUnaryOp::UnpackSLow32x4 => (0xe7d6, 2), // VUPLF + VecUnaryOp::UnpackSHigh8x16 => (0xe7d7, 0), // VUPHB + VecUnaryOp::UnpackSHigh16x8 => (0xe7d7, 1), // VUPHH + VecUnaryOp::UnpackSHigh32x4 => (0xe7d7, 2), // VUPHF + }; + + put(sink, &enc_vrr_a(opcode, rd.to_reg(), rn, m3, 0, 0)); + } + &Inst::VecShiftRR { + shift_op, + rd, + rn, + shift_imm, + shift_reg, + } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + let shift_reg = allocs.next(shift_reg); + + let (opcode, m4) = match shift_op { + VecShiftOp::RotL8x16 => (0xe733, 0), // VERLLB + VecShiftOp::RotL16x8 => (0xe733, 1), // VERLLH + VecShiftOp::RotL32x4 => (0xe733, 2), // VERLLF + VecShiftOp::RotL64x2 => (0xe733, 3), // VERLLG + VecShiftOp::LShL8x16 => (0xe730, 0), // VESLB + VecShiftOp::LShL16x8 => (0xe730, 1), // VESLH + VecShiftOp::LShL32x4 => (0xe730, 2), // VESLF + VecShiftOp::LShL64x2 => (0xe730, 3), // VESLG + VecShiftOp::LShR8x16 => (0xe738, 0), // VESRLB + VecShiftOp::LShR16x8 => (0xe738, 1), // VESRLH + VecShiftOp::LShR32x4 => (0xe738, 2), // VESRLF + VecShiftOp::LShR64x2 => (0xe738, 3), // VESRLG + VecShiftOp::AShR8x16 => (0xe73a, 0), // VESRAB + VecShiftOp::AShR16x8 => (0xe73a, 1), // VESRAH + VecShiftOp::AShR32x4 => (0xe73a, 2), // VESRAF + VecShiftOp::AShR64x2 => (0xe73a, 3), // VESRAG + }; + put( + sink, + &enc_vrs_a(opcode, rd.to_reg(), shift_reg, shift_imm.into(), rn, m4), + ); + } &Inst::VecSelect { rd, rn, rm, ra } => { let rd = allocs.next_writable(rd); let rn = allocs.next(rn); @@ -2370,6 +2643,442 @@ impl MachInstEmit for Inst { let opcode = 0xe78d; // VSEL put(sink, &enc_vrr_e(opcode, rd.to_reg(), rn, rm, ra, 0, 0)); } + &Inst::VecPermute { rd, rn, rm, ra } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + let rm = allocs.next(rm); + let ra = allocs.next(ra); + + let opcode = 0xe78c; // VPERM + put(sink, &enc_vrr_e(opcode, rd.to_reg(), rn, rm, ra, 0, 0)); + } + &Inst::VecPermuteDWImm { + rd, + rn, + rm, + idx1, + idx2, + } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + let rm = allocs.next(rm); + let m4 = (idx1 & 1) * 4 + (idx2 & 1); + + let opcode = 0xe784; // VPDI + put(sink, &enc_vrr_c(opcode, rd.to_reg(), rn, rm, m4, 0, 0)); + } + &Inst::VecIntCmp { op, rd, rn, rm } | &Inst::VecIntCmpS { op, rd, rn, rm } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + let rm = allocs.next(rm); + + let (opcode, m4) = match op { + VecIntCmpOp::CmpEq8x16 => (0xe7f8, 0), // VCEQB + VecIntCmpOp::CmpEq16x8 => (0xe7f8, 1), // VCEQH + VecIntCmpOp::CmpEq32x4 => (0xe7f8, 2), // VCEQF + VecIntCmpOp::CmpEq64x2 => 
(0xe7f8, 3), // VCEQG + VecIntCmpOp::SCmpHi8x16 => (0xe7fb, 0), // VCHB + VecIntCmpOp::SCmpHi16x8 => (0xe7fb, 1), // VCHH + VecIntCmpOp::SCmpHi32x4 => (0xe7fb, 2), // VCHG + VecIntCmpOp::SCmpHi64x2 => (0xe7fb, 3), // VCHG + VecIntCmpOp::UCmpHi8x16 => (0xe7f9, 0), // VCHLB + VecIntCmpOp::UCmpHi16x8 => (0xe7f9, 1), // VCHLH + VecIntCmpOp::UCmpHi32x4 => (0xe7f9, 2), // VCHLG + VecIntCmpOp::UCmpHi64x2 => (0xe7f9, 3), // VCHLG + }; + let m5 = match self { + &Inst::VecIntCmp { .. } => 0, + &Inst::VecIntCmpS { .. } => 1, + _ => unreachable!(), + }; + + put(sink, &enc_vrr_b(opcode, rd.to_reg(), rn, rm, m4, m5)); + } + &Inst::VecFloatCmp { op, rd, rn, rm } | &Inst::VecFloatCmpS { op, rd, rn, rm } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + let rm = allocs.next(rm); + + let (opcode, m4) = match op { + VecFloatCmpOp::CmpEq32x4 => (0xe7e8, 2), // VFCESB + VecFloatCmpOp::CmpEq64x2 => (0xe7e8, 3), // VFCEDB + VecFloatCmpOp::CmpHi32x4 => (0xe7eb, 2), // VFCHSB + VecFloatCmpOp::CmpHi64x2 => (0xe7eb, 3), // VFCHDB + VecFloatCmpOp::CmpHiEq32x4 => (0xe7ea, 2), // VFCHESB + VecFloatCmpOp::CmpHiEq64x2 => (0xe7ea, 3), // VFCHEDB + }; + let m6 = match self { + &Inst::VecFloatCmp { .. } => 0, + &Inst::VecFloatCmpS { .. } => 1, + _ => unreachable!(), + }; + + put(sink, &enc_vrr_c(opcode, rd.to_reg(), rn, rm, m4, 0, m6)); + } + + &Inst::VecLoad { rd, ref mem } | &Inst::VecLoadRev { rd, ref mem } => { + let rd = allocs.next_writable(rd); + let mem = mem.with_allocs(&mut allocs); + + let (opcode, m3) = match self { + &Inst::VecLoad { .. } => (0xe706, 0), // VL + &Inst::VecLoadRev { .. } => (0xe606, 4), // VLBRQ + _ => unreachable!(), + }; + mem_vrx_emit(rd.to_reg(), &mem, opcode, m3, true, sink, emit_info, state); + } + &Inst::VecStore { rd, ref mem } | &Inst::VecStoreRev { rd, ref mem } => { + let rd = allocs.next(rd); + let mem = mem.with_allocs(&mut allocs); + + let (opcode, m3) = match self { + &Inst::VecStore { .. } => (0xe70e, 0), // VST + &Inst::VecStoreRev { .. } => (0xe60e, 4), // VSTBRQ + _ => unreachable!(), + }; + mem_vrx_emit(rd, &mem, opcode, m3, true, sink, emit_info, state); + } + &Inst::VecLoadReplicate { size, rd, ref mem } + | &Inst::VecLoadReplicateRev { size, rd, ref mem } => { + let rd = allocs.next_writable(rd); + let mem = mem.with_allocs(&mut allocs); + + let (opcode, m3) = match (self, size) { + (&Inst::VecLoadReplicate { .. }, 8) => (0xe705, 0), // VLREPB + (&Inst::VecLoadReplicate { .. }, 16) => (0xe705, 1), // VLREPH + (&Inst::VecLoadReplicate { .. }, 32) => (0xe705, 2), // VLREPF + (&Inst::VecLoadReplicate { .. }, 64) => (0xe705, 3), // VLREPG + (&Inst::VecLoadReplicateRev { .. }, 16) => (0xe605, 1), // VLREPBRH + (&Inst::VecLoadReplicateRev { .. }, 32) => (0xe605, 2), // VLREPBRF + (&Inst::VecLoadReplicateRev { .. 
}, 64) => (0xe605, 3), // VLREPBRG + _ => unreachable!(), + }; + mem_vrx_emit(rd.to_reg(), &mem, opcode, m3, true, sink, emit_info, state); + } + + &Inst::VecMov { rd, rn } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + + let opcode = 0xe756; // VLR + put(sink, &enc_vrr_a(opcode, rd.to_reg(), rn, 0, 0, 0)); + } + &Inst::VecCMov { rd, cond, rm } => { + let rd = allocs.next_writable(rd); + let rm = allocs.next(rm); + + let opcode = 0xa74; // BCR + put(sink, &enc_ri_c(opcode, cond.invert().bits(), 4 + 6)); + let opcode = 0xe756; // VLR + put(sink, &enc_vrr_a(opcode, rd.to_reg(), rm, 0, 0, 0)); + } + &Inst::MovToVec128 { rd, rn, rm } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + let rm = allocs.next(rm); + + let opcode = 0xe762; // VLVGP + put(sink, &enc_vrr_f(opcode, rd.to_reg(), rn, rm)); + } + &Inst::VecLoadConst { rd, const_data } => { + let rd = allocs.next_writable(rd); + + let opcode = 0xa75; // BRAS + let reg = writable_spilltmp_reg().to_reg(); + put(sink, &enc_ri_b(opcode, reg, 20)); + for i in const_data.to_be_bytes().iter() { + sink.put1(*i); + } + let inst = Inst::VecLoad { + rd, + mem: MemArg::reg(reg, MemFlags::trusted()), + }; + inst.emit(&[], sink, emit_info, state); + } + &Inst::VecLoadConstReplicate { + size, + rd, + const_data, + } => { + let rd = allocs.next_writable(rd); + + let opcode = 0xa75; // BRAS + let reg = writable_spilltmp_reg().to_reg(); + put(sink, &enc_ri_b(opcode, reg, (4 + size / 8) as i32)); + for i in 0..size / 8 { + sink.put1((const_data >> (size - 8 - 8 * i)) as u8); + } + let inst = Inst::VecLoadReplicate { + size, + rd, + mem: MemArg::reg(reg, MemFlags::trusted()), + }; + inst.emit(&[], sink, emit_info, state); + } + &Inst::VecImmByteMask { rd, mask } => { + let rd = allocs.next_writable(rd); + let opcode = 0xe744; // VGBM + put(sink, &enc_vri_a(opcode, rd.to_reg(), mask, 0)); + } + &Inst::VecImmBitMask { + size, + rd, + start_bit, + end_bit, + } => { + let rd = allocs.next_writable(rd); + let (opcode, m4) = match size { + 8 => (0xe746, 0), // VGMB + 16 => (0xe746, 1), // VGMH + 32 => (0xe746, 2), // VGMF + 64 => (0xe746, 3), // VGMG + _ => unreachable!(), + }; + put( + sink, + &enc_vri_b(opcode, rd.to_reg(), start_bit, end_bit, m4), + ); + } + &Inst::VecImmReplicate { size, rd, imm } => { + let rd = allocs.next_writable(rd); + let (opcode, m3) = match size { + 8 => (0xe745, 0), // VREPIB + 16 => (0xe745, 1), // VREPIH + 32 => (0xe745, 2), // VREPIF + 64 => (0xe745, 3), // VREPIG + _ => unreachable!(), + }; + put(sink, &enc_vri_a(opcode, rd.to_reg(), imm as u16, m3)); + } + + &Inst::VecLoadLane { + size, + rd, + ref mem, + lane_imm, + } + | &Inst::VecLoadLaneUndef { + size, + rd, + ref mem, + lane_imm, + } + | &Inst::VecLoadLaneRev { + size, + rd, + ref mem, + lane_imm, + } + | &Inst::VecLoadLaneRevUndef { + size, + rd, + ref mem, + lane_imm, + } => { + let rd = allocs.next_writable(rd); + let mem = mem.with_allocs(&mut allocs); + + let (opcode_vrx, opcode_rx, opcode_rxy) = match (self, size) { + (&Inst::VecLoadLane { .. }, 8) => (0xe700, None, None), // VLEB + (&Inst::VecLoadLane { .. }, 16) => (0xe701, None, None), // VLEH + (&Inst::VecLoadLane { .. }, 32) => (0xe703, None, None), // VLEF + (&Inst::VecLoadLane { .. }, 64) => (0xe702, None, None), // VLEG + (&Inst::VecLoadLaneUndef { .. }, 8) => (0xe700, None, None), // VLEB + (&Inst::VecLoadLaneUndef { .. }, 16) => (0xe701, None, None), // VLEH + (&Inst::VecLoadLaneUndef { .. 
}, 32) => (0xe703, Some(0x78), Some(0xed64)), // VLEF, LE(Y) + (&Inst::VecLoadLaneUndef { .. }, 64) => (0xe702, Some(0x68), Some(0xed65)), // VLEG, LD(Y) + (&Inst::VecLoadLaneRev { .. }, 16) => (0xe601, None, None), // VLEBRH + (&Inst::VecLoadLaneRev { .. }, 32) => (0xe603, None, None), // VLEBRF + (&Inst::VecLoadLaneRev { .. }, 64) => (0xe602, None, None), // VLEBRG + (&Inst::VecLoadLaneRevUndef { .. }, 16) => (0xe601, None, None), // VLEBRH + (&Inst::VecLoadLaneRevUndef { .. }, 32) => (0xe603, None, None), // VLEBRF + (&Inst::VecLoadLaneRevUndef { .. }, 64) => (0xe602, None, None), // VLEBRG + _ => unreachable!(), + }; + + let rd = rd.to_reg(); + if lane_imm == 0 && is_fpr(rd) && opcode_rx.is_some() { + mem_emit( + rd, &mem, opcode_rx, opcode_rxy, None, true, sink, emit_info, state, + ); + } else { + mem_vrx_emit( + rd, + &mem, + opcode_vrx, + lane_imm.into(), + true, + sink, + emit_info, + state, + ); + } + } + &Inst::VecStoreLane { + size, + rd, + ref mem, + lane_imm, + } + | &Inst::VecStoreLaneRev { + size, + rd, + ref mem, + lane_imm, + } => { + let rd = allocs.next(rd); + let mem = mem.with_allocs(&mut allocs); + + let (opcode_vrx, opcode_rx, opcode_rxy) = match (self, size) { + (&Inst::VecStoreLane { .. }, 8) => (0xe708, None, None), // VSTEB + (&Inst::VecStoreLane { .. }, 16) => (0xe709, None, None), // VSTEH + (&Inst::VecStoreLane { .. }, 32) => (0xe70b, Some(0x70), Some(0xed66)), // VSTEF, STE(Y) + (&Inst::VecStoreLane { .. }, 64) => (0xe70a, Some(0x60), Some(0xed67)), // VSTEG, STD(Y) + (&Inst::VecStoreLaneRev { .. }, 16) => (0xe609, None, None), // VSTEBRH + (&Inst::VecStoreLaneRev { .. }, 32) => (0xe60b, None, None), // VSTEBRF + (&Inst::VecStoreLaneRev { .. }, 64) => (0xe60a, None, None), // VSTEBRG + _ => unreachable!(), + }; + + if lane_imm == 0 && is_fpr(rd) && opcode_rx.is_some() { + mem_emit( + rd, &mem, opcode_rx, opcode_rxy, None, true, sink, emit_info, state, + ); + } else { + mem_vrx_emit( + rd, + &mem, + opcode_vrx, + lane_imm.into(), + true, + sink, + emit_info, + state, + ); + } + } + &Inst::VecInsertLane { + size, + rd, + rn, + lane_imm, + lane_reg, + } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + let lane_reg = allocs.next(lane_reg); + + let (opcode_vrs, m4) = match size { + 8 => (0xe722, 0), // VLVGB + 16 => (0xe722, 1), // VLVGH + 32 => (0xe722, 2), // VLVGF + 64 => (0xe722, 3), // VLVGG + _ => unreachable!(), + }; + put( + sink, + &enc_vrs_b(opcode_vrs, rd.to_reg(), lane_reg, lane_imm.into(), rn, m4), + ); + } + &Inst::VecInsertLaneUndef { + size, + rd, + rn, + lane_imm, + lane_reg, + } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + let lane_reg = allocs.next(lane_reg); + + let (opcode_vrs, m4, opcode_rre) = match size { + 8 => (0xe722, 0, None), // VLVGB + 16 => (0xe722, 1, None), // VLVGH + 32 => (0xe722, 2, None), // VLVGF + 64 => (0xe722, 3, Some(0xb3c1)), // VLVGG, LDGR + _ => unreachable!(), + }; + if opcode_rre.is_some() + && lane_imm == 0 + && lane_reg == zero_reg() + && is_fpr(rd.to_reg()) + { + put(sink, &enc_rre(opcode_rre.unwrap(), rd.to_reg(), rn)); + } else { + put( + sink, + &enc_vrs_b(opcode_vrs, rd.to_reg(), lane_reg, lane_imm.into(), rn, m4), + ); + } + } + &Inst::VecExtractLane { + size, + rd, + rn, + lane_imm, + lane_reg, + } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + let lane_reg = allocs.next(lane_reg); + + let (opcode_vrs, m4, opcode_rre) = match size { + 8 => (0xe721, 0, None), // VLGVB + 16 => (0xe721, 1, None), // VLGVH + 32 => (0xe721, 2, None), // 
VLGVF + 64 => (0xe721, 3, Some(0xb3cd)), // VLGVG, LGDR + _ => unreachable!(), + }; + if opcode_rre.is_some() && lane_imm == 0 && lane_reg == zero_reg() && is_fpr(rn) { + put(sink, &enc_rre(opcode_rre.unwrap(), rd.to_reg(), rn)); + } else { + put( + sink, + &enc_vrs_c(opcode_vrs, rd.to_reg(), lane_reg, lane_imm.into(), rn, m4), + ); + } + } + &Inst::VecInsertLaneImm { + size, + rd, + imm, + lane_imm, + } => { + let rd = allocs.next_writable(rd); + + let opcode = match size { + 8 => 0xe740, // VLEIB + 16 => 0xe741, // LEIVH + 32 => 0xe743, // VLEIF + 64 => 0xe742, // VLEIG + _ => unreachable!(), + }; + put( + sink, + &enc_vri_a(opcode, rd.to_reg(), imm as u16, lane_imm.into()), + ); + } + &Inst::VecReplicateLane { + size, + rd, + rn, + lane_imm, + } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + + let (opcode, m4) = match size { + 8 => (0xe74d, 0), // VREPB + 16 => (0xe74d, 1), // VREPH + 32 => (0xe74d, 2), // VREPF + 64 => (0xe74d, 3), // VREPG + _ => unreachable!(), + }; + put( + sink, + &enc_vri_c(opcode, rd.to_reg(), lane_imm.into(), rn, m4), + ); + } &Inst::Call { link, ref info } => { let link = allocs.next_writable(link); diff --git a/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs b/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs index a398c798a958..5dd423801fad 100644 --- a/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs @@ -6808,6 +6808,8 @@ fn test_s390x_binemit() { defs: smallvec![], clobbers: PRegSet::empty(), opcode: Opcode::Call, + caller_callconv: CallConv::SystemV, + callee_callconv: CallConv::SystemV, }), }, "C0E500000000", @@ -6823,6 +6825,8 @@ fn test_s390x_binemit() { defs: smallvec![], clobbers: PRegSet::empty(), opcode: Opcode::CallIndirect, + caller_callconv: CallConv::SystemV, + callee_callconv: CallConv::SystemV, }), }, "0DE1", @@ -6953,71 +6957,6 @@ fn test_s390x_binemit() { "jno 10 ; vlr %v8, %v20", )); - insns.push(( - Inst::MovToFpr64 { - rd: writable_vr(8), - rn: gpr(4), - }, - "B3C10084", - "ldgr %f8, %r4", - )); - insns.push(( - Inst::MovToFpr64 { - rd: writable_vr(24), - rn: gpr(4), - }, - "E78400003822", - "vlvgg %v24, %r4, 0", - )); - insns.push(( - Inst::MovToFpr32 { - rd: writable_vr(8), - rn: gpr(4), - }, - "E78400002022", - "vlvgf %v8, %r4, 0", - )); - insns.push(( - Inst::MovToFpr32 { - rd: writable_vr(24), - rn: gpr(4), - }, - "E78400002822", - "vlvgf %v24, %r4, 0", - )); - insns.push(( - Inst::MovFromFpr64 { - rd: writable_gpr(8), - rn: vr(4), - }, - "B3CD0084", - "lgdr %r8, %f4", - )); - insns.push(( - Inst::MovFromFpr64 { - rd: writable_gpr(8), - rn: vr(20), - }, - "E78400003421", - "vlgvg %r8, %v20, 0", - )); - insns.push(( - Inst::MovFromFpr32 { - rd: writable_gpr(8), - rn: vr(4), - }, - "E78400002021", - "vlgvf %r8, %v4, 0", - )); - insns.push(( - Inst::MovFromFpr32 { - rd: writable_gpr(8), - rn: vr(20), - }, - "E78400002421", - "vlgvf %r8, %v20, 0", - )); - insns.push(( Inst::FpuRR { fpu_op: FPUOp1::Abs32, @@ -7036,6 +6975,15 @@ fn test_s390x_binemit() { "E78C002828CC", "wflpsb %v24, %f12", )); + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Abs32x4, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C002028CC", + "vflpsb %v24, %v12", + )); insns.push(( Inst::FpuRR { fpu_op: FPUOp1::Abs64, @@ -7054,6 +7002,15 @@ fn test_s390x_binemit() { "E78C002838CC", "wflpdb %v24, %f12", )); + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Abs64x2, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C002038CC", + "vflpdb %v24, %v12", + )); insns.push(( Inst::FpuRR { 
fpu_op: FPUOp1::Neg32, @@ -7072,6 +7029,15 @@ fn test_s390x_binemit() { "E78C000828CC", "wflcsb %v24, %f12", )); + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Neg32x4, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C000028CC", + "vflcsb %v24, %v12", + )); insns.push(( Inst::FpuRR { fpu_op: FPUOp1::Neg64, @@ -7090,6 +7056,15 @@ fn test_s390x_binemit() { "E78C000838CC", "wflcdb %v24, %f12", )); + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Neg64x2, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C000038CC", + "vflcdb %v24, %v12", + )); insns.push(( Inst::FpuRR { fpu_op: FPUOp1::NegAbs32, @@ -7108,6 +7083,15 @@ fn test_s390x_binemit() { "E78C001828CC", "wflnsb %v24, %f12", )); + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::NegAbs32x4, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001028CC", + "vflnsb %v24, %v12", + )); insns.push(( Inst::FpuRR { fpu_op: FPUOp1::NegAbs64, @@ -7126,6 +7110,15 @@ fn test_s390x_binemit() { "E78C001838CC", "wflndb %v24, %f12", )); + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::NegAbs64x2, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001038CC", + "vflndb %v24, %v12", + )); insns.push(( Inst::FpuRR { fpu_op: FPUOp1::Sqrt32, @@ -7144,6 +7137,15 @@ fn test_s390x_binemit() { "E78C000828CE", "wfsqsb %v24, %f12", )); + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Sqrt32x4, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C000028CE", + "vfsqsb %v24, %v12", + )); insns.push(( Inst::FpuRR { fpu_op: FPUOp1::Sqrt64, @@ -7162,6 +7164,15 @@ fn test_s390x_binemit() { "E78C000838CE", "wfsqdb %v24, %f12", )); + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Sqrt64x2, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C000038CE", + "vfsqdb %v24, %v12", + )); insns.push(( Inst::FpuRR { fpu_op: FPUOp1::Cvt32To64, @@ -7180,6 +7191,15 @@ fn test_s390x_binemit() { "E78C000828C4", "wldeb %v24, %f12", )); + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Cvt32x4To64x2, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C000028C4", + "vldeb %v24, %v12", + )); insns.push(( Inst::FpuRRR { @@ -7201,6 +7221,16 @@ fn test_s390x_binemit() { "E748C00828E3", "wfasb %v20, %f8, %f12", )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Add32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028E3", + "vfasb %v20, %v8, %v12", + )); insns.push(( Inst::FpuRRR { fpu_op: FPUOp2::Add64, @@ -7221,6 +7251,16 @@ fn test_s390x_binemit() { "E748C00838E3", "wfadb %v20, %f8, %f12", )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Add64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038E3", + "vfadb %v20, %v8, %v12", + )); insns.push(( Inst::FpuRRR { fpu_op: FPUOp2::Sub32, @@ -7241,6 +7281,16 @@ fn test_s390x_binemit() { "E748C00828E2", "wfssb %v20, %f8, %f12", )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Sub32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028E2", + "vfssb %v20, %v8, %v12", + )); insns.push(( Inst::FpuRRR { fpu_op: FPUOp2::Sub64, @@ -7261,6 +7311,16 @@ fn test_s390x_binemit() { "E748C00838E2", "wfsdb %v20, %f8, %f12", )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Sub64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038E2", + "vfsdb %v20, %v8, %v12", + )); insns.push(( Inst::FpuRRR { fpu_op: FPUOp2::Mul32, @@ -7281,6 +7341,16 @@ fn test_s390x_binemit() { "E748C00828E7", "wfmsb %v20, %f8, %f12", )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Mul32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028E7", + "vfmsb %v20, %v8, %v12", + )); insns.push(( 
Inst::FpuRRR { fpu_op: FPUOp2::Mul64, @@ -7301,6 +7371,16 @@ fn test_s390x_binemit() { "E748C00838E7", "wfmdb %v20, %f8, %f12", )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Mul64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038E7", + "vfmdb %v20, %v8, %v12", + )); insns.push(( Inst::FpuRRR { fpu_op: FPUOp2::Div32, @@ -7321,6 +7401,16 @@ fn test_s390x_binemit() { "E748C00828E5", "wfdsb %v20, %f8, %f12", )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Div32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028E5", + "vfdsb %v20, %v8, %v12", + )); insns.push(( Inst::FpuRRR { fpu_op: FPUOp2::Div64, @@ -7341,6 +7431,16 @@ fn test_s390x_binemit() { "E748C00838E5", "wfddb %v20, %f8, %f12", )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Div64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038E5", + "vfddb %v20, %v8, %v12", + )); insns.push(( Inst::FpuRRR { fpu_op: FPUOp2::Max32, @@ -7351,6 +7451,16 @@ fn test_s390x_binemit() { "E746801820EF", "wfmaxsb %f4, %f6, %f8, 1", )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Max32x4, + rd: writable_vr(4), + rn: vr(6), + rm: vr(8), + }, + "E746801020EF", + "vfmaxsb %v4, %v6, %v8, 1", + )); insns.push(( Inst::FpuRRR { fpu_op: FPUOp2::Max64, @@ -7361,6 +7471,16 @@ fn test_s390x_binemit() { "E746801832EF", "wfmaxdb %f4, %f6, %v24, 1", )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Max64x2, + rd: writable_vr(4), + rn: vr(6), + rm: vr(24), + }, + "E746801032EF", + "vfmaxdb %v4, %v6, %v24, 1", + )); insns.push(( Inst::FpuRRR { fpu_op: FPUOp2::Min32, @@ -7371,6 +7491,16 @@ fn test_s390x_binemit() { "E746801820EE", "wfminsb %f4, %f6, %f8, 1", )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Min32x4, + rd: writable_vr(4), + rn: vr(6), + rm: vr(8), + }, + "E746801020EE", + "vfminsb %v4, %v6, %v8, 1", + )); insns.push(( Inst::FpuRRR { fpu_op: FPUOp2::Min64, @@ -7381,6 +7511,96 @@ fn test_s390x_binemit() { "E746801830EE", "wfmindb %f4, %f6, %f8, 1", )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Min64x2, + rd: writable_vr(4), + rn: vr(6), + rm: vr(8), + }, + "E746801030EE", + "vfmindb %v4, %v6, %v8, 1", + )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::MaxPseudo32, + rd: writable_vr(4), + rn: vr(6), + rm: vr(8), + }, + "E746803820EF", + "wfmaxsb %f4, %f6, %f8, 3", + )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::MaxPseudo32x4, + rd: writable_vr(4), + rn: vr(6), + rm: vr(8), + }, + "E746803020EF", + "vfmaxsb %v4, %v6, %v8, 3", + )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::MaxPseudo64, + rd: writable_vr(4), + rn: vr(6), + rm: vr(24), + }, + "E746803832EF", + "wfmaxdb %f4, %f6, %v24, 3", + )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::MaxPseudo64x2, + rd: writable_vr(4), + rn: vr(6), + rm: vr(24), + }, + "E746803032EF", + "vfmaxdb %v4, %v6, %v24, 3", + )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::MinPseudo32, + rd: writable_vr(4), + rn: vr(6), + rm: vr(8), + }, + "E746803820EE", + "wfminsb %f4, %f6, %f8, 3", + )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::MinPseudo32x4, + rd: writable_vr(4), + rn: vr(6), + rm: vr(8), + }, + "E746803020EE", + "vfminsb %v4, %v6, %v8, 3", + )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::MinPseudo64, + rd: writable_vr(4), + rn: vr(6), + rm: vr(8), + }, + "E746803830EE", + "wfmindb %f4, %f6, %f8, 3", + )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::MinPseudo64x2, + rd: writable_vr(4), + rn: vr(6), + rm: vr(8), + }, + "E746803030EE", + "vfmindb %v4, %v6, %v8, 3", + )); 
insns.push(( Inst::FpuRRRR { @@ -7404,6 +7624,17 @@ fn test_s390x_binemit() { "E78CD208418F", "wfmasb %f8, %f12, %f13, %v20", )); + insns.push(( + Inst::FpuRRRR { + fpu_op: FPUOp3::MAdd32x4, + rd: writable_vr(8), + rn: vr(12), + rm: vr(13), + ra: vr(20), + }, + "E78CD200418F", + "vfmasb %v8, %v12, %v13, %v20", + )); insns.push(( Inst::FpuRRRR { fpu_op: FPUOp3::MAdd64, @@ -7428,14 +7659,25 @@ fn test_s390x_binemit() { )); insns.push(( Inst::FpuRRRR { - fpu_op: FPUOp3::MSub32, + fpu_op: FPUOp3::MAdd64x2, rd: writable_vr(8), rn: vr(12), rm: vr(13), - ra: vr(8), + ra: vr(20), }, - "B30F80CD", - "msebr %f8, %f12, %f13", + "E78CD300418F", + "vfmadb %v8, %v12, %v13, %v20", + )); + insns.push(( + Inst::FpuRRRR { + fpu_op: FPUOp3::MSub32, + rd: writable_vr(8), + rn: vr(12), + rm: vr(13), + ra: vr(8), + }, + "B30F80CD", + "msebr %f8, %f12, %f13", )); insns.push(( Inst::FpuRRRR { @@ -7448,6 +7690,17 @@ fn test_s390x_binemit() { "E78CD208418E", "wfmssb %f8, %f12, %f13, %v20", )); + insns.push(( + Inst::FpuRRRR { + fpu_op: FPUOp3::MSub32x4, + rd: writable_vr(8), + rn: vr(12), + rm: vr(13), + ra: vr(20), + }, + "E78CD200418E", + "vfmssb %v8, %v12, %v13, %v20", + )); insns.push(( Inst::FpuRRRR { fpu_op: FPUOp3::MSub64, @@ -7470,6 +7723,17 @@ fn test_s390x_binemit() { "E78CD308418E", "wfmsdb %f8, %f12, %f13, %v20", )); + insns.push(( + Inst::FpuRRRR { + fpu_op: FPUOp3::MSub64x2, + rd: writable_vr(8), + rn: vr(12), + rm: vr(13), + ra: vr(20), + }, + "E78CD300418E", + "vfmsdb %v8, %v12, %v13, %v20", + )); insns.push(( Inst::FpuCmp32 { @@ -7505,202 +7769,2982 @@ fn test_s390x_binemit() { )); insns.push(( - Inst::FpuLoad32 { - rd: writable_vr(1), - mem: MemArg::BXD12 { - base: gpr(2), - index: zero_reg(), - disp: UImm12::zero(), - flags: MemFlags::trusted(), - }, + Inst::LoadFpuConst32 { + rd: writable_vr(8), + const_data: 1.0_f32.to_bits(), }, - "78102000", - "le %f1, 0(%r2)", + "A71500043F80000078801000", + "bras %r1, 8 ; data.f32 1 ; le %f8, 0(%r1)", )); insns.push(( - Inst::FpuLoad32 { - rd: writable_vr(1), - mem: MemArg::BXD12 { - base: gpr(2), - index: zero_reg(), - disp: UImm12::maybe_from_u64(4095).unwrap(), - flags: MemFlags::trusted(), - }, + Inst::LoadFpuConst32 { + rd: writable_vr(24), + const_data: 1.0_f32.to_bits(), }, - "78102FFF", - "le %f1, 4095(%r2)", + "A71500043F800000E78010000803", + "bras %r1, 8 ; data.f32 1 ; vlef %v24, 0(%r1), 0", )); insns.push(( - Inst::FpuLoad32 { - rd: writable_vr(1), - mem: MemArg::BXD20 { - base: gpr(2), - index: zero_reg(), - disp: SImm20::maybe_from_i64(-524288).unwrap(), - flags: MemFlags::trusted(), - }, + Inst::LoadFpuConst64 { + rd: writable_vr(8), + const_data: 1.0_f64.to_bits(), }, - "ED1020008064", - "ley %f1, -524288(%r2)", + "A71500063FF000000000000068801000", + "bras %r1, 12 ; data.f64 1 ; ld %f8, 0(%r1)", )); insns.push(( - Inst::FpuLoad32 { - rd: writable_vr(1), - mem: MemArg::BXD20 { - base: gpr(2), - index: zero_reg(), - disp: SImm20::maybe_from_i64(524287).unwrap(), - flags: MemFlags::trusted(), - }, + Inst::LoadFpuConst64 { + rd: writable_vr(24), + const_data: 1.0_f64.to_bits(), }, - "ED102FFF7F64", - "ley %f1, 524287(%r2)", + "A71500063FF0000000000000E78010000802", + "bras %r1, 12 ; data.f64 1 ; vleg %v24, 0(%r1), 0", )); + insns.push(( - Inst::FpuLoad32 { - rd: writable_vr(17), - mem: MemArg::BXD12 { - base: gpr(2), - index: zero_reg(), - disp: UImm12::zero(), - flags: MemFlags::trusted(), - }, + Inst::FpuRound { + op: FpuRoundOp::Cvt64To32, + mode: FpuRoundMode::Current, + rd: writable_vr(8), + rn: vr(12), }, - "E71020000803", - "vlef %v17, 
0(%r2), 0", + "B344008C", + "ledbra %f8, %f12, 0", )); insns.push(( - Inst::FpuLoad32 { - rd: writable_vr(17), - mem: MemArg::BXD12 { - base: gpr(2), - index: zero_reg(), - disp: UImm12::maybe_from_u64(4095).unwrap(), - flags: MemFlags::trusted(), - }, + Inst::FpuRound { + op: FpuRoundOp::Cvt64To32, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), }, - "E7102FFF0803", - "vlef %v17, 4095(%r2), 0", + "E78C001838C5", + "wledb %v24, %f12, 0, 1", )); insns.push(( - Inst::FpuLoad32 { - rd: writable_vr(1), - mem: MemArg::BXD12 { - base: gpr(3), - index: gpr(2), - disp: UImm12::zero(), - flags: MemFlags::trusted(), - }, + Inst::FpuRound { + op: FpuRoundOp::Cvt64x2To32x4, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), }, - "78123000", - "le %f1, 0(%r2,%r3)", + "E78C001038C5", + "vledb %v24, %v12, 0, 1", )); insns.push(( - Inst::FpuLoad32 { - rd: writable_vr(1), - mem: MemArg::BXD12 { - base: gpr(3), - index: gpr(2), - disp: UImm12::maybe_from_u64(4095).unwrap(), - flags: MemFlags::trusted(), - }, + Inst::FpuRound { + op: FpuRoundOp::Round32, + mode: FpuRoundMode::ToNegInfinity, + rd: writable_vr(8), + rn: vr(12), }, - "78123FFF", - "le %f1, 4095(%r2,%r3)", + "B357708C", + "fiebr %f8, %f12, 7", )); insns.push(( - Inst::FpuLoad32 { - rd: writable_vr(1), - mem: MemArg::BXD20 { - base: gpr(3), - index: gpr(2), - disp: SImm20::maybe_from_i64(-524288).unwrap(), - flags: MemFlags::trusted(), - }, + Inst::FpuRound { + op: FpuRoundOp::Round64, + mode: FpuRoundMode::ToNegInfinity, + rd: writable_vr(8), + rn: vr(12), }, - "ED1230008064", - "ley %f1, -524288(%r2,%r3)", + "B35F708C", + "fidbr %f8, %f12, 7", )); insns.push(( - Inst::FpuLoad32 { - rd: writable_vr(1), - mem: MemArg::BXD20 { - base: gpr(3), - index: gpr(2), - disp: SImm20::maybe_from_i64(524287).unwrap(), - flags: MemFlags::trusted(), - }, + Inst::FpuRound { + op: FpuRoundOp::Round32, + mode: FpuRoundMode::ToPosInfinity, + rd: writable_vr(8), + rn: vr(12), }, - "ED123FFF7F64", - "ley %f1, 524287(%r2,%r3)", + "B357608C", + "fiebr %f8, %f12, 6", )); insns.push(( - Inst::FpuLoad32 { - rd: writable_vr(17), - mem: MemArg::BXD12 { - base: gpr(3), - index: gpr(2), - disp: UImm12::zero(), - flags: MemFlags::trusted(), - }, + Inst::FpuRound { + op: FpuRoundOp::Round64, + mode: FpuRoundMode::ToPosInfinity, + rd: writable_vr(8), + rn: vr(12), }, - "E71230000803", - "vlef %v17, 0(%r2,%r3), 0", + "B35F608C", + "fidbr %f8, %f12, 6", )); insns.push(( - Inst::FpuLoad32 { - rd: writable_vr(17), - mem: MemArg::BXD12 { - base: gpr(3), - index: gpr(2), - disp: UImm12::maybe_from_u64(4095).unwrap(), - flags: MemFlags::trusted(), - }, + Inst::FpuRound { + op: FpuRoundOp::Round32, + mode: FpuRoundMode::ToZero, + rd: writable_vr(8), + rn: vr(12), }, - "E7123FFF0803", - "vlef %v17, 4095(%r2,%r3), 0", + "B357508C", + "fiebr %f8, %f12, 5", )); insns.push(( - Inst::FpuLoad64 { - rd: writable_vr(1), - mem: MemArg::BXD12 { - base: gpr(2), - index: zero_reg(), - disp: UImm12::zero(), - flags: MemFlags::trusted(), - }, + Inst::FpuRound { + op: FpuRoundOp::Round64, + mode: FpuRoundMode::ToZero, + rd: writable_vr(8), + rn: vr(12), }, - "68102000", - "ld %f1, 0(%r2)", + "B35F508C", + "fidbr %f8, %f12, 5", )); insns.push(( - Inst::FpuLoad64 { - rd: writable_vr(1), - mem: MemArg::BXD12 { - base: gpr(2), - index: zero_reg(), - disp: UImm12::maybe_from_u64(4095).unwrap(), - flags: MemFlags::trusted(), - }, + Inst::FpuRound { + op: FpuRoundOp::Round32, + mode: FpuRoundMode::ToNearestTiesToEven, + rd: writable_vr(8), + rn: vr(12), }, - 
"68102FFF", - "ld %f1, 4095(%r2)", + "B357408C", + "fiebr %f8, %f12, 4", )); insns.push(( - Inst::FpuLoad64 { - rd: writable_vr(1), - mem: MemArg::BXD20 { - base: gpr(2), - index: zero_reg(), - disp: SImm20::maybe_from_i64(-524288).unwrap(), - flags: MemFlags::trusted(), - }, + Inst::FpuRound { + op: FpuRoundOp::Round64, + mode: FpuRoundMode::ToNearestTiesToEven, + rd: writable_vr(8), + rn: vr(12), }, - "ED1020008065", - "ldy %f1, -524288(%r2)", - )); + "B35F408C", + "fidbr %f8, %f12, 4", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::Round32, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001828C7", + "wfisb %v24, %f12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::Round32x4, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001028C7", + "vfisb %v24, %v12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::Round64, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001838C7", + "wfidb %v24, %f12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::Round64x2, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001038C7", + "vfidb %v24, %v12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::ToSInt32, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001828C2", + "wcfeb %v24, %f12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::ToSInt32x4, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001028C2", + "vcfeb %v24, %v12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::ToSInt64, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001838C2", + "wcgdb %v24, %f12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::ToSInt64x2, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001038C2", + "vcgdb %v24, %v12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::ToUInt32, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001828C0", + "wclfeb %v24, %f12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::ToUInt32x4, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001028C0", + "vclfeb %v24, %v12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::ToUInt64, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001838C0", + "wclgdb %v24, %f12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::ToUInt64x2, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001038C0", + "vclgdb %v24, %v12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::FromSInt32, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001828C3", + "wcefb %v24, %f12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::FromSInt32x4, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001028C3", + "vcefb %v24, %v12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::FromSInt64, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001838C3", + "wcdgb %v24, %f12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::FromSInt64x2, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001038C3", + "vcdgb %v24, %v12, 0, 1", + )); + 
insns.push(( + Inst::FpuRound { + op: FpuRoundOp::FromUInt32, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001828C1", + "wcelfb %v24, %f12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::FromUInt32x4, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001028C1", + "vcelfb %v24, %v12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::FromUInt64, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001838C1", + "wcdlgb %v24, %f12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::FromUInt64x2, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001038C1", + "vcdlgb %v24, %v12, 0, 1", + )); + + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Add8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008F3", + "vab %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Add16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018F3", + "vah %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Add32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028F3", + "vaf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Add64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038F3", + "vag %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Sub8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008F7", + "vsb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Sub16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018F7", + "vsh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Sub32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028F7", + "vsf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Sub64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038F7", + "vsg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Mul8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008A2", + "vmlb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Mul16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018A2", + "vmlhw %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Mul32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028A2", + "vmlf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMulHi8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008A1", + "vmlhb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMulHi16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018A1", + "vmlhh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMulHi32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028A1", + "vmlhf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMulHi8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008A3", + "vmhb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMulHi16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018A3", + "vmhh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMulHi32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + 
"E748C00028A3", + "vmhf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMulEven8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008A4", + "vmleb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMulEven16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018A4", + "vmleh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMulEven32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028A4", + "vmlef %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMulEven8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008A6", + "vmeb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMulEven16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018A6", + "vmeh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMulEven32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028A6", + "vmef %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMulOdd8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008A5", + "vmlob %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMulOdd16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018A5", + "vmloh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMulOdd32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028A5", + "vmlof %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMulOdd8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008A7", + "vmob %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMulOdd16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018A7", + "vmoh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMulOdd32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028A7", + "vmof %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMax8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008FD", + "vmxlb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMax16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018FD", + "vmxlh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMax32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028FD", + "vmxlf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMax64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038FD", + "vmxlg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMax8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008FF", + "vmxb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMax16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018FF", + "vmxh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMax32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028FF", + "vmxf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMax64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038FF", + "vmxg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMin8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008FC", 
+ "vmnlb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMin16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018FC", + "vmnlh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMin32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028FC", + "vmnlf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMin64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038FC", + "vmnlg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMin8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008FE", + "vmnb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMin16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018FE", + "vmnh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMin32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028FE", + "vmnf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMin64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038FE", + "vmng %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UAvg8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008F0", + "vavglb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UAvg16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018F0", + "vavglh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UAvg32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028F0", + "vavglf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UAvg64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038F0", + "vavglg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SAvg8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008F2", + "vavgb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SAvg16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018F2", + "vavgh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SAvg32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028F2", + "vavgf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SAvg64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038F2", + "vavgg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::And128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0000868", + "vn %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Orr128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C000086A", + "vo %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Xor128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C000086D", + "vx %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::NotAnd128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C000086E", + "vnn %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::NotOrr128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C000086B", + "vno %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::NotXor128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C000086C", + "vnx %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { 
+ op: VecBinaryOp::AndNot128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0000869", + "vnc %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::OrrNot128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C000086F", + "voc %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::BitPermute128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0000885", + "vbperm %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::LShLByByte128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0000875", + "vslb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::LShRByByte128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C000087D", + "vsrlb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::AShRByByte128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C000087F", + "vsrab %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::LShLByBit128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0000874", + "vsl %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::LShRByBit128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C000087C", + "vsrl %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::AShRByBit128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C000087E", + "vsra %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Pack16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0001894", + "vpkh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Pack32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0002894", + "vpkf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Pack64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0003894", + "vpkg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::PackUSat16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0001895", + "vpklsh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::PackUSat32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0002895", + "vpklsf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::PackUSat64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0003895", + "vpklsg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::PackSSat16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0001897", + "vpksh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::PackSSat32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0002897", + "vpksf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::PackSSat64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0003897", + "vpksg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::MergeLow8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0000860", + "vmrlb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::MergeLow16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0001860", + "vmrlh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::MergeLow32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0002860", + "vmrlf %v20, %v8, %v12", + )); + insns.push(( + 
Inst::VecRRR { + op: VecBinaryOp::MergeLow64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0003860", + "vmrlg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::MergeHigh8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0000861", + "vmrhb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::MergeHigh16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0001861", + "vmrhh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::MergeHigh32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0002861", + "vmrhf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::MergeHigh64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0003861", + "vmrhg %v20, %v8, %v12", + )); + + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Abs8x16, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000008DF", + "vlpb %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Abs16x8, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000018DF", + "vlph %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Abs32x4, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000028DF", + "vlpf %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Abs64x2, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000038DF", + "vlpg %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Neg8x16, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000008DE", + "vlcb %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Neg16x8, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000018DE", + "vlch %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Neg32x4, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000028DE", + "vlcf %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Neg64x2, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000038DE", + "vlcg %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Popcnt8x16, + rd: writable_vr(20), + rn: vr(8), + }, + "E74800000850", + "vpopctb %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Popcnt16x8, + rd: writable_vr(20), + rn: vr(8), + }, + "E74800001850", + "vpopcth %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Popcnt32x4, + rd: writable_vr(20), + rn: vr(8), + }, + "E74800002850", + "vpopctf %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Popcnt64x2, + rd: writable_vr(20), + rn: vr(8), + }, + "E74800003850", + "vpopctg %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackULow8x16, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000008D4", + "vupllb %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackULow16x8, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000018D4", + "vupllh %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackULow32x4, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000028D4", + "vupllf %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackUHigh8x16, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000008D5", + "vuplhb %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackUHigh16x8, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000018D5", + "vuplhh %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackUHigh32x4, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000028D5", + "vuplhf %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: 
VecUnaryOp::UnpackSLow8x16, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000008D6", + "vuplb %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackSLow16x8, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000018D6", + "vuplh %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackSLow32x4, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000028D6", + "vuplf %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackSHigh8x16, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000008D7", + "vuphb %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackSHigh16x8, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000018D7", + "vuphh %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackSHigh32x4, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000028D7", + "vuphf %v20, %v8", + )); + + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::RotL8x16, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E74560030833", + "verllb %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::RotL16x8, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E74560031833", + "verllh %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::RotL32x4, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E74560032833", + "verllf %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::RotL64x2, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E74560033833", + "verllg %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::LShL8x16, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E74560030830", + "veslb %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::LShL16x8, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E74560031830", + "veslh %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::LShL32x4, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E74560032830", + "veslf %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::LShL64x2, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E74560033830", + "veslg %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::LShR8x16, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E74560030838", + "vesrlb %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::LShR16x8, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E74560031838", + "vesrlh %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::LShR32x4, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E74560032838", + "vesrlf %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::LShR64x2, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E74560033838", + "vesrlg %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::AShR8x16, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E7456003083A", + "vesrab %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: 
VecShiftOp::AShR16x8, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E7456003183A", + "vesrah %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::AShR32x4, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E7456003283A", + "vesraf %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::AShR64x2, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E7456003383A", + "vesrag %v20, %v5, 3(%r6)", + )); + + insns.push(( + Inst::VecSelect { + rd: writable_vr(4), + rn: vr(6), + rm: vr(8), + ra: vr(10), + }, + "E7468000A08D", + "vsel %v4, %v6, %v8, %v10", + )); + insns.push(( + Inst::VecSelect { + rd: writable_vr(20), + rn: vr(6), + rm: vr(8), + ra: vr(10), + }, + "E7468000A88D", + "vsel %v20, %v6, %v8, %v10", + )); + insns.push(( + Inst::VecSelect { + rd: writable_vr(4), + rn: vr(22), + rm: vr(8), + ra: vr(10), + }, + "E7468000A48D", + "vsel %v4, %v22, %v8, %v10", + )); + insns.push(( + Inst::VecSelect { + rd: writable_vr(4), + rn: vr(6), + rm: vr(24), + ra: vr(10), + }, + "E7468000A28D", + "vsel %v4, %v6, %v24, %v10", + )); + insns.push(( + Inst::VecSelect { + rd: writable_vr(4), + rn: vr(6), + rm: vr(8), + ra: vr(26), + }, + "E7468000A18D", + "vsel %v4, %v6, %v8, %v26", + )); + insns.push(( + Inst::VecSelect { + rd: writable_vr(20), + rn: vr(22), + rm: vr(24), + ra: vr(26), + }, + "E7468000AF8D", + "vsel %v20, %v22, %v24, %v26", + )); + insns.push(( + Inst::VecPermute { + rd: writable_vr(4), + rn: vr(6), + rm: vr(8), + ra: vr(10), + }, + "E7468000A08C", + "vperm %v4, %v6, %v8, %v10", + )); + insns.push(( + Inst::VecPermute { + rd: writable_vr(20), + rn: vr(6), + rm: vr(8), + ra: vr(10), + }, + "E7468000A88C", + "vperm %v20, %v6, %v8, %v10", + )); + insns.push(( + Inst::VecPermute { + rd: writable_vr(4), + rn: vr(22), + rm: vr(8), + ra: vr(10), + }, + "E7468000A48C", + "vperm %v4, %v22, %v8, %v10", + )); + insns.push(( + Inst::VecPermute { + rd: writable_vr(4), + rn: vr(6), + rm: vr(24), + ra: vr(10), + }, + "E7468000A28C", + "vperm %v4, %v6, %v24, %v10", + )); + insns.push(( + Inst::VecPermute { + rd: writable_vr(4), + rn: vr(6), + rm: vr(8), + ra: vr(26), + }, + "E7468000A18C", + "vperm %v4, %v6, %v8, %v26", + )); + insns.push(( + Inst::VecPermute { + rd: writable_vr(20), + rn: vr(22), + rm: vr(24), + ra: vr(26), + }, + "E7468000AF8C", + "vperm %v20, %v22, %v24, %v26", + )); + insns.push(( + Inst::VecPermuteDWImm { + rd: writable_vr(20), + rn: vr(6), + rm: vr(8), + idx1: 0, + idx2: 0, + }, + "E74680000884", + "vpdi %v20, %v6, %v8, 0", + )); + insns.push(( + Inst::VecPermuteDWImm { + rd: writable_vr(20), + rn: vr(6), + rm: vr(8), + idx1: 0, + idx2: 1, + }, + "E74680001884", + "vpdi %v20, %v6, %v8, 1", + )); + insns.push(( + Inst::VecPermuteDWImm { + rd: writable_vr(20), + rn: vr(6), + rm: vr(8), + idx1: 1, + idx2: 0, + }, + "E74680004884", + "vpdi %v20, %v6, %v8, 4", + )); + insns.push(( + Inst::VecPermuteDWImm { + rd: writable_vr(20), + rn: vr(6), + rm: vr(8), + idx1: 1, + idx2: 1, + }, + "E74680005884", + "vpdi %v20, %v6, %v8, 5", + )); + + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::CmpEq8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008F8", + "vceqb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::CmpEq16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018F8", + "vceqh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::CmpEq32x4, + 
rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028F8", + "vceqf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::CmpEq64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038F8", + "vceqg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::SCmpHi8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008FB", + "vchb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::SCmpHi16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018FB", + "vchh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::SCmpHi32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028FB", + "vchf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::SCmpHi64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038FB", + "vchg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::UCmpHi8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008F9", + "vchlb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::UCmpHi16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018F9", + "vchlh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::UCmpHi32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028F9", + "vchlf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::UCmpHi64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038F9", + "vchlg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::CmpEq8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01008F8", + "vceqbs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::CmpEq16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01018F8", + "vceqhs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::CmpEq32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01028F8", + "vceqfs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::CmpEq64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01038F8", + "vceqgs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::SCmpHi8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01008FB", + "vchbs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::SCmpHi16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01018FB", + "vchhs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::SCmpHi32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01028FB", + "vchfs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::SCmpHi64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01038FB", + "vchgs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::UCmpHi8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01008F9", + "vchlbs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::UCmpHi16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01018F9", + "vchlhs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::UCmpHi32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01028F9", + "vchlfs %v20, %v8, %v12", + )); + 
insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::UCmpHi64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01038F9", + "vchlgs %v20, %v8, %v12", + )); + + insns.push(( + Inst::VecFloatCmp { + op: VecFloatCmpOp::CmpEq32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028E8", + "vfcesb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecFloatCmp { + op: VecFloatCmpOp::CmpEq64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038E8", + "vfcedb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecFloatCmp { + op: VecFloatCmpOp::CmpHi32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028EB", + "vfchsb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecFloatCmp { + op: VecFloatCmpOp::CmpHi64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038EB", + "vfchdb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecFloatCmp { + op: VecFloatCmpOp::CmpHiEq32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028EA", + "vfchesb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecFloatCmp { + op: VecFloatCmpOp::CmpHiEq64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038EA", + "vfchedb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecFloatCmpS { + op: VecFloatCmpOp::CmpEq32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01028E8", + "vfcesbs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecFloatCmpS { + op: VecFloatCmpOp::CmpEq64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01038E8", + "vfcedbs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecFloatCmpS { + op: VecFloatCmpOp::CmpHi32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01028EB", + "vfchsbs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecFloatCmpS { + op: VecFloatCmpOp::CmpHi64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01038EB", + "vfchdbs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecFloatCmpS { + op: VecFloatCmpOp::CmpHiEq32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01028EA", + "vfchesbs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecFloatCmpS { + op: VecFloatCmpOp::CmpHiEq64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01038EA", + "vfchedbs %v20, %v8, %v12", + )); + + insns.push(( + Inst::VecLoad { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E71020000806", + "vl %v17, 0(%r2)", + )); + insns.push(( + Inst::VecLoad { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E7102FFF0806", + "vl %v17, 4095(%r2)", + )); + insns.push(( + Inst::VecLoad { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E71230000806", + "vl %v17, 0(%r2,%r3)", + )); + insns.push(( + Inst::VecLoadRev { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E61020004806", + "vlbrq %v17, 0(%r2)", + )); + insns.push(( + Inst::VecLoadRev { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E6102FFF4806", + "vlbrq %v17, 4095(%r2)", + )); + insns.push(( + Inst::VecLoadRev { + rd: 
writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E61230004806", + "vlbrq %v17, 0(%r2,%r3)", + )); + insns.push(( + Inst::VecStore { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E7102000080E", + "vst %v17, 0(%r2)", + )); + insns.push(( + Inst::VecStore { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E7102FFF080E", + "vst %v17, 4095(%r2)", + )); + insns.push(( + Inst::VecStore { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E7123000080E", + "vst %v17, 0(%r2,%r3)", + )); + insns.push(( + Inst::VecStoreRev { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E6102000480E", + "vstbrq %v17, 0(%r2)", + )); + insns.push(( + Inst::VecStoreRev { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E6102FFF480E", + "vstbrq %v17, 4095(%r2)", + )); + insns.push(( + Inst::VecStoreRev { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E6123000480E", + "vstbrq %v17, 0(%r2,%r3)", + )); + insns.push(( + Inst::VecLoadReplicate { + size: 8, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(128).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E71020800805", + "vlrepb %v17, 128(%r2)", + )); + insns.push(( + Inst::VecLoadReplicate { + size: 16, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(128).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E71020801805", + "vlreph %v17, 128(%r2)", + )); + insns.push(( + Inst::VecLoadReplicate { + size: 32, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(128).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E71020802805", + "vlrepf %v17, 128(%r2)", + )); + insns.push(( + Inst::VecLoadReplicate { + size: 64, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(128).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E71020803805", + "vlrepg %v17, 128(%r2)", + )); + insns.push(( + Inst::VecLoadReplicateRev { + size: 16, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(128).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E61020801805", + "vlbrreph %v17, 128(%r2)", + )); + insns.push(( + Inst::VecLoadReplicateRev { + size: 32, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(128).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E61020802805", + "vlbrrepf %v17, 128(%r2)", + )); + insns.push(( + Inst::VecLoadReplicateRev { + size: 64, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(128).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E61020803805", + "vlbrrepg %v17, 128(%r2)", + )); + + insns.push(( + Inst::VecMov { + rd: writable_vr(8), 
+ rn: vr(20), + }, + "E78400000456", + "vlr %v8, %v20", + )); + insns.push(( + Inst::VecCMov { + rd: writable_vr(8), + rm: vr(20), + cond: Cond::from_mask(1), + }, + "A7E40005E78400000456", + "jno 10 ; vlr %v8, %v20", + )); + insns.push(( + Inst::MovToVec128 { + rd: writable_vr(20), + rn: gpr(5), + rm: gpr(6), + }, + "E74560000862", + "vlvgp %v20, %r5, %r6", + )); + insns.push(( + Inst::VecLoadConst { + rd: writable_vr(24), + const_data: 0x0102030405060708090a0b0c0d0e0fu128, + }, + "A715000A000102030405060708090A0B0C0D0E0FE78010000806", + "bras %r1, 20 ; data.u128 0x000102030405060708090a0b0c0d0e0f ; vl %v24, 0(%r1)", + )); + insns.push(( + Inst::VecLoadConstReplicate { + size: 64, + rd: writable_vr(24), + const_data: 0x01020304050607u64, + }, + "A71500060001020304050607E78010003805", + "bras %r1, 12 ; data.u64 0x0001020304050607 ; vlrepg %v24, 0(%r1)", + )); + insns.push(( + Inst::VecLoadConstReplicate { + size: 32, + rd: writable_vr(24), + const_data: 0x010203u64, + }, + "A715000400010203E78010002805", + "bras %r1, 8 ; data.u32 0x00010203 ; vlrepf %v24, 0(%r1)", + )); + + insns.push(( + Inst::VecImmByteMask { + rd: writable_vr(20), + mask: 0x1234, + }, + "E74012340844", + "vgbm %v20, 4660", + )); + insns.push(( + Inst::VecImmBitMask { + size: 8, + rd: writable_vr(20), + start_bit: 1, + end_bit: 7, + }, + "E74001070846", + "vgmb %v20, 1, 7", + )); + insns.push(( + Inst::VecImmBitMask { + size: 16, + rd: writable_vr(20), + start_bit: 1, + end_bit: 7, + }, + "E74001071846", + "vgmh %v20, 1, 7", + )); + insns.push(( + Inst::VecImmBitMask { + size: 32, + rd: writable_vr(20), + start_bit: 1, + end_bit: 7, + }, + "E74001072846", + "vgmf %v20, 1, 7", + )); + insns.push(( + Inst::VecImmBitMask { + size: 64, + rd: writable_vr(20), + start_bit: 1, + end_bit: 7, + }, + "E74001073846", + "vgmg %v20, 1, 7", + )); + insns.push(( + Inst::VecImmReplicate { + size: 8, + rd: writable_vr(20), + imm: 0x1234, + }, + "E74012340845", + "vrepib %v20, 4660", + )); + insns.push(( + Inst::VecImmReplicate { + size: 16, + rd: writable_vr(20), + imm: 0x1234, + }, + "E74012341845", + "vrepih %v20, 4660", + )); + insns.push(( + Inst::VecImmReplicate { + size: 32, + rd: writable_vr(20), + imm: 0x1234, + }, + "E74012342845", + "vrepif %v20, 4660", + )); + insns.push(( + Inst::VecImmReplicate { + size: 64, + rd: writable_vr(20), + imm: 0x1234, + }, + "E74012343845", + "vrepig %v20, 4660", + )); + + insns.push(( + Inst::VecLoadLane { + size: 8, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 15, + }, + "E7102000F800", + "vleb %v17, 0(%r2), 15", + )); + insns.push(( + Inst::VecLoadLane { + size: 8, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E7102FFF0800", + "vleb %v17, 4095(%r2), 0", + )); + insns.push(( + Inst::VecLoadLane { + size: 8, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 15, + }, + "E7123000F800", + "vleb %v17, 0(%r2,%r3), 15", + )); + insns.push(( + Inst::VecLoadLane { + size: 8, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E7123FFF0800", + "vleb %v17, 4095(%r2,%r3), 0", + )); + insns.push(( + Inst::VecLoadLane { + size: 16, + 
rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 7, + }, + "E71020007801", + "vleh %v17, 0(%r2), 7", + )); + insns.push(( + Inst::VecLoadLane { + size: 16, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E7102FFF0801", + "vleh %v17, 4095(%r2), 0", + )); + insns.push(( + Inst::VecLoadLane { + size: 16, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 7, + }, + "E71230007801", + "vleh %v17, 0(%r2,%r3), 7", + )); + insns.push(( + Inst::VecLoadLane { + size: 16, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E7123FFF0801", + "vleh %v17, 4095(%r2,%r3), 0", + )); + insns.push(( + Inst::VecLoadLane { + size: 32, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 3, + }, + "E71020003803", + "vlef %v17, 0(%r2), 3", + )); + insns.push(( + Inst::VecLoadLane { + size: 32, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E7102FFF0803", + "vlef %v17, 4095(%r2), 0", + )); + insns.push(( + Inst::VecLoadLane { + size: 32, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 3, + }, + "E71230003803", + "vlef %v17, 0(%r2,%r3), 3", + )); + insns.push(( + Inst::VecLoadLane { + size: 32, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E7123FFF0803", + "vlef %v17, 4095(%r2,%r3), 0", + )); + insns.push(( + Inst::VecLoadLane { + size: 64, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 1, + }, + "E71020001802", + "vleg %v17, 0(%r2), 1", + )); + insns.push(( + Inst::VecLoadLane { + size: 64, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E7102FFF0802", + "vleg %v17, 4095(%r2), 0", + )); + insns.push(( + Inst::VecLoadLane { + size: 64, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 1, + }, + "E71230001802", + "vleg %v17, 0(%r2,%r3), 1", + )); + insns.push(( + Inst::VecLoadLane { + size: 64, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E7123FFF0802", + "vleg %v17, 4095(%r2,%r3), 0", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 32, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "78102000", + "le %f1, 0(%r2)", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 32, + rd: 
writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "78102FFF", + "le %f1, 4095(%r2)", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 32, + rd: writable_vr(1), + mem: MemArg::BXD20 { + base: gpr(2), + index: zero_reg(), + disp: SImm20::maybe_from_i64(-524288).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "ED1020008064", + "ley %f1, -524288(%r2)", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 32, + rd: writable_vr(1), + mem: MemArg::BXD20 { + base: gpr(2), + index: zero_reg(), + disp: SImm20::maybe_from_i64(524287).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "ED102FFF7F64", + "ley %f1, 524287(%r2)", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 32, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E71020000803", + "vlef %v17, 0(%r2), 0", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 32, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E7102FFF0803", + "vlef %v17, 4095(%r2), 0", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 32, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "78123000", + "le %f1, 0(%r2,%r3)", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 32, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "78123FFF", + "le %f1, 4095(%r2,%r3)", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 32, + rd: writable_vr(1), + mem: MemArg::BXD20 { + base: gpr(3), + index: gpr(2), + disp: SImm20::maybe_from_i64(-524288).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "ED1230008064", + "ley %f1, -524288(%r2,%r3)", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 32, + rd: writable_vr(1), + mem: MemArg::BXD20 { + base: gpr(3), + index: gpr(2), + disp: SImm20::maybe_from_i64(524287).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "ED123FFF7F64", + "ley %f1, 524287(%r2,%r3)", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 32, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E71230000803", + "vlef %v17, 0(%r2,%r3), 0", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 32, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E7123FFF0803", + "vlef %v17, 4095(%r2,%r3), 0", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 64, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "68102000", + "ld %f1, 0(%r2)", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 64, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "68102FFF", + "ld %f1, 
4095(%r2)", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 64, + rd: writable_vr(1), + mem: MemArg::BXD20 { + base: gpr(2), + index: zero_reg(), + disp: SImm20::maybe_from_i64(-524288).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "ED1020008065", + "ldy %f1, -524288(%r2)", + )); insns.push(( - Inst::FpuLoad64 { + Inst::VecLoadLaneUndef { + size: 64, rd: writable_vr(1), mem: MemArg::BXD20 { base: gpr(2), @@ -7708,12 +10752,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(524287).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "ED102FFF7F65", "ldy %f1, 524287(%r2)", )); insns.push(( - Inst::FpuLoad64 { + Inst::VecLoadLaneUndef { + size: 64, rd: writable_vr(17), mem: MemArg::BXD12 { base: gpr(2), @@ -7721,12 +10767,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E71020000802", "vleg %v17, 0(%r2), 0", )); insns.push(( - Inst::FpuLoad64 { + Inst::VecLoadLaneUndef { + size: 64, rd: writable_vr(17), mem: MemArg::BXD12 { base: gpr(2), @@ -7734,12 +10782,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E7102FFF0802", "vleg %v17, 4095(%r2), 0", )); insns.push(( - Inst::FpuLoad64 { + Inst::VecLoadLaneUndef { + size: 64, rd: writable_vr(1), mem: MemArg::BXD12 { base: gpr(3), @@ -7747,77 +10797,209 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "68123000", "ld %f1, 0(%r2,%r3)", )); insns.push(( - Inst::FpuLoad64 { + Inst::VecLoadLaneUndef { + size: 64, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "68123FFF", + "ld %f1, 4095(%r2,%r3)", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 64, + rd: writable_vr(1), + mem: MemArg::BXD20 { + base: gpr(3), + index: gpr(2), + disp: SImm20::maybe_from_i64(-524288).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "ED1230008065", + "ldy %f1, -524288(%r2,%r3)", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 64, rd: writable_vr(1), + mem: MemArg::BXD20 { + base: gpr(3), + index: gpr(2), + disp: SImm20::maybe_from_i64(524287).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "ED123FFF7F65", + "ldy %f1, 524287(%r2,%r3)", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 64, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E71230000802", + "vleg %v17, 0(%r2,%r3), 0", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 64, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E7123FFF0802", + "vleg %v17, 4095(%r2,%r3), 0", + )); + insns.push(( + Inst::VecStoreLane { + size: 8, + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 15, + }, + "E7102000F808", + "vsteb %v17, 0(%r2), 15", + )); + insns.push(( + Inst::VecStoreLane { + size: 8, + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E7102FFF0808", + "vsteb %v17, 4095(%r2), 0", + )); + insns.push(( + Inst::VecStoreLane { + size: 8, + 
rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 15, + }, + "E7123000F808", + "vsteb %v17, 0(%r2,%r3), 15", + )); + insns.push(( + Inst::VecStoreLane { + size: 8, + rd: vr(17), mem: MemArg::BXD12 { base: gpr(3), index: gpr(2), disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, - "68123FFF", - "ld %f1, 4095(%r2,%r3)", + "E7123FFF0808", + "vsteb %v17, 4095(%r2,%r3), 0", )); insns.push(( - Inst::FpuLoad64 { - rd: writable_vr(1), - mem: MemArg::BXD20 { - base: gpr(3), - index: gpr(2), - disp: SImm20::maybe_from_i64(-524288).unwrap(), + Inst::VecStoreLane { + size: 16, + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 7, }, - "ED1230008065", - "ldy %f1, -524288(%r2,%r3)", + "E71020007809", + "vsteh %v17, 0(%r2), 7", )); insns.push(( - Inst::FpuLoad64 { - rd: writable_vr(1), - mem: MemArg::BXD20 { - base: gpr(3), - index: gpr(2), - disp: SImm20::maybe_from_i64(524287).unwrap(), + Inst::VecStoreLane { + size: 16, + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, - "ED123FFF7F65", - "ldy %f1, 524287(%r2,%r3)", + "E7102FFF0809", + "vsteh %v17, 4095(%r2), 0", )); insns.push(( - Inst::FpuLoad64 { - rd: writable_vr(17), + Inst::VecStoreLane { + size: 16, + rd: vr(17), mem: MemArg::BXD12 { base: gpr(3), index: gpr(2), disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 7, }, - "E71230000802", - "vleg %v17, 0(%r2,%r3), 0", + "E71230007809", + "vsteh %v17, 0(%r2,%r3), 7", )); insns.push(( - Inst::FpuLoad64 { - rd: writable_vr(17), + Inst::VecStoreLane { + size: 16, + rd: vr(17), mem: MemArg::BXD12 { base: gpr(3), index: gpr(2), disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, - "E7123FFF0802", - "vleg %v17, 4095(%r2,%r3), 0", + "E7123FFF0809", + "vsteh %v17, 4095(%r2,%r3), 0", )); insns.push(( - Inst::FpuStore32 { + Inst::VecStoreLane { + size: 32, rd: vr(1), mem: MemArg::BXD12 { base: gpr(2), @@ -7825,12 +11007,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "70102000", "ste %f1, 0(%r2)", )); insns.push(( - Inst::FpuStore32 { + Inst::VecStoreLane { + size: 32, rd: vr(1), mem: MemArg::BXD12 { base: gpr(2), @@ -7838,12 +11022,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "70102FFF", "ste %f1, 4095(%r2)", )); insns.push(( - Inst::FpuStore32 { + Inst::VecStoreLane { + size: 32, rd: vr(1), mem: MemArg::BXD20 { base: gpr(2), @@ -7851,12 +11037,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(-524288).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "ED1020008066", "stey %f1, -524288(%r2)", )); insns.push(( - Inst::FpuStore32 { + Inst::VecStoreLane { + size: 32, rd: vr(1), mem: MemArg::BXD20 { base: gpr(2), @@ -7864,12 +11052,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(524287).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "ED102FFF7F66", "stey %f1, 524287(%r2)", )); insns.push(( - Inst::FpuStore32 { + Inst::VecStoreLane { + size: 32, rd: vr(17), mem: MemArg::BXD12 { base: gpr(2), @@ -7877,12 +11067,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E7102000080B", "vstef %v17, 
0(%r2), 0", )); insns.push(( - Inst::FpuStore32 { + Inst::VecStoreLane { + size: 32, rd: vr(17), mem: MemArg::BXD12 { base: gpr(2), @@ -7890,12 +11082,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E7102FFF080B", "vstef %v17, 4095(%r2), 0", )); insns.push(( - Inst::FpuStore32 { + Inst::VecStoreLane { + size: 32, rd: vr(1), mem: MemArg::BXD12 { base: gpr(3), @@ -7903,12 +11097,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "70123000", "ste %f1, 0(%r2,%r3)", )); insns.push(( - Inst::FpuStore32 { + Inst::VecStoreLane { + size: 32, rd: vr(1), mem: MemArg::BXD12 { base: gpr(3), @@ -7916,12 +11112,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "70123FFF", "ste %f1, 4095(%r2,%r3)", )); insns.push(( - Inst::FpuStore32 { + Inst::VecStoreLane { + size: 32, rd: vr(1), mem: MemArg::BXD20 { base: gpr(3), @@ -7929,12 +11127,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(-524288).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "ED1230008066", "stey %f1, -524288(%r2,%r3)", )); insns.push(( - Inst::FpuStore32 { + Inst::VecStoreLane { + size: 32, rd: vr(1), mem: MemArg::BXD20 { base: gpr(3), @@ -7942,12 +11142,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(524287).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "ED123FFF7F66", "stey %f1, 524287(%r2,%r3)", )); insns.push(( - Inst::FpuStore32 { + Inst::VecStoreLane { + size: 32, rd: vr(17), mem: MemArg::BXD12 { base: gpr(3), @@ -7955,12 +11157,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E7123000080B", "vstef %v17, 0(%r2,%r3), 0", )); insns.push(( - Inst::FpuStore32 { + Inst::VecStoreLane { + size: 32, rd: vr(17), mem: MemArg::BXD12 { base: gpr(3), @@ -7968,12 +11172,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E7123FFF080B", "vstef %v17, 4095(%r2,%r3), 0", )); insns.push(( - Inst::FpuStore64 { + Inst::VecStoreLane { + size: 64, rd: vr(1), mem: MemArg::BXD12 { base: gpr(2), @@ -7981,12 +11187,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "60102000", "std %f1, 0(%r2)", )); insns.push(( - Inst::FpuStore64 { + Inst::VecStoreLane { + size: 64, rd: vr(1), mem: MemArg::BXD12 { base: gpr(2), @@ -7994,12 +11202,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "60102FFF", "std %f1, 4095(%r2)", )); insns.push(( - Inst::FpuStore64 { + Inst::VecStoreLane { + size: 64, rd: vr(1), mem: MemArg::BXD20 { base: gpr(2), @@ -8007,12 +11217,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(-524288).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "ED1020008067", "stdy %f1, -524288(%r2)", )); insns.push(( - Inst::FpuStore64 { + Inst::VecStoreLane { + size: 64, rd: vr(1), mem: MemArg::BXD20 { base: gpr(2), @@ -8020,12 +11232,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(524287).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "ED102FFF7F67", "stdy %f1, 524287(%r2)", )); insns.push(( - Inst::FpuStore64 { + Inst::VecStoreLane { + size: 64, rd: vr(17), mem: MemArg::BXD12 { base: gpr(2), @@ -8033,12 +11247,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, 
"E7102000080A", "vsteg %v17, 0(%r2), 0", )); insns.push(( - Inst::FpuStore64 { + Inst::VecStoreLane { + size: 64, rd: vr(17), mem: MemArg::BXD12 { base: gpr(2), @@ -8046,12 +11262,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E7102FFF080A", "vsteg %v17, 4095(%r2), 0", )); insns.push(( - Inst::FpuStore64 { + Inst::VecStoreLane { + size: 64, rd: vr(1), mem: MemArg::BXD12 { base: gpr(3), @@ -8059,12 +11277,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "60123000", "std %f1, 0(%r2,%r3)", )); insns.push(( - Inst::FpuStore64 { + Inst::VecStoreLane { + size: 64, rd: vr(1), mem: MemArg::BXD12 { base: gpr(3), @@ -8072,12 +11292,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "60123FFF", "std %f1, 4095(%r2,%r3)", )); insns.push(( - Inst::FpuStore64 { + Inst::VecStoreLane { + size: 64, rd: vr(1), mem: MemArg::BXD20 { base: gpr(3), @@ -8085,12 +11307,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(-524288).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "ED1230008067", "stdy %f1, -524288(%r2,%r3)", )); insns.push(( - Inst::FpuStore64 { + Inst::VecStoreLane { + size: 64, rd: vr(1), mem: MemArg::BXD20 { base: gpr(3), @@ -8098,12 +11322,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(524287).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "ED123FFF7F67", "stdy %f1, 524287(%r2,%r3)", )); insns.push(( - Inst::FpuStore64 { + Inst::VecStoreLane { + size: 64, rd: vr(17), mem: MemArg::BXD12 { base: gpr(3), @@ -8111,12 +11337,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E7123000080A", "vsteg %v17, 0(%r2,%r3), 0", )); insns.push(( - Inst::FpuStore64 { + Inst::VecStoreLane { + size: 64, rd: vr(17), mem: MemArg::BXD12 { base: gpr(3), @@ -8124,13 +11352,194 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E7123FFF080A", "vsteg %v17, 4095(%r2,%r3), 0", )); - insns.push(( - Inst::FpuLoadRev32 { + Inst::VecLoadLaneRev { + size: 16, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E61020000001", + "vlebrh %v1, 0(%r2), 0", + )); + insns.push(( + Inst::VecLoadLaneRev { + size: 16, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E6102FFF0001", + "vlebrh %v1, 4095(%r2), 0", + )); + insns.push(( + Inst::VecLoadLaneRev { + size: 16, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E61230000001", + "vlebrh %v1, 0(%r2,%r3), 0", + )); + insns.push(( + Inst::VecLoadLaneRev { + size: 16, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E6123FFF0001", + "vlebrh %v1, 4095(%r2,%r3), 0", + )); + insns.push(( + Inst::VecLoadLaneRev { + size: 32, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E61020000003", + "vlebrf %v1, 0(%r2), 0", + )); + 
insns.push(( + Inst::VecLoadLaneRev { + size: 32, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E6102FFF0003", + "vlebrf %v1, 4095(%r2), 0", + )); + insns.push(( + Inst::VecLoadLaneRev { + size: 32, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E61230000003", + "vlebrf %v1, 0(%r2,%r3), 0", + )); + insns.push(( + Inst::VecLoadLaneRev { + size: 32, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E6123FFF0003", + "vlebrf %v1, 4095(%r2,%r3), 0", + )); + insns.push(( + Inst::VecLoadLaneRev { + size: 64, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E61020000002", + "vlebrg %v1, 0(%r2), 0", + )); + insns.push(( + Inst::VecLoadLaneRev { + size: 64, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E6102FFF0002", + "vlebrg %v1, 4095(%r2), 0", + )); + insns.push(( + Inst::VecLoadLaneRev { + size: 64, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E61230000002", + "vlebrg %v1, 0(%r2,%r3), 0", + )); + insns.push(( + Inst::VecLoadLaneRev { + size: 64, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E6123FFF0002", + "vlebrg %v1, 4095(%r2,%r3), 0", + )); + insns.push(( + Inst::VecLoadLaneRevUndef { + size: 32, rd: writable_vr(1), mem: MemArg::BXD12 { base: gpr(2), @@ -8138,12 +11547,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E61020000003", - "vlebrf %f1, 0(%r2), 0", + "vlebrf %v1, 0(%r2), 0", )); insns.push(( - Inst::FpuLoadRev32 { + Inst::VecLoadLaneRevUndef { + size: 32, rd: writable_vr(1), mem: MemArg::BXD12 { base: gpr(2), @@ -8151,12 +11562,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E6102FFF0003", - "vlebrf %f1, 4095(%r2), 0", + "vlebrf %v1, 4095(%r2), 0", )); insns.push(( - Inst::FpuLoadRev32 { + Inst::VecLoadLaneRevUndef { + size: 32, rd: writable_vr(1), mem: MemArg::BXD20 { base: gpr(2), @@ -8164,12 +11577,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(-524288).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E31020008071E61010000003", - "lay %r1, -524288(%r2) ; vlebrf %f1, 0(%r1), 0", + "lay %r1, -524288(%r2) ; vlebrf %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuLoadRev32 { + Inst::VecLoadLaneRevUndef { + size: 32, rd: writable_vr(1), mem: MemArg::BXD20 { base: gpr(2), @@ -8177,12 +11592,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(524287).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E3102FFF7F71E61010000003", - "lay %r1, 524287(%r2) ; vlebrf %f1, 0(%r1), 0", + "lay %r1, 524287(%r2) ; vlebrf %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuLoadRev32 { + Inst::VecLoadLaneRevUndef { + size: 32, rd: writable_vr(1), mem: 
MemArg::BXD12 { base: gpr(3), @@ -8190,12 +11607,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E61230000003", - "vlebrf %f1, 0(%r2,%r3), 0", + "vlebrf %v1, 0(%r2,%r3), 0", )); insns.push(( - Inst::FpuLoadRev32 { + Inst::VecLoadLaneRevUndef { + size: 32, rd: writable_vr(1), mem: MemArg::BXD12 { base: gpr(3), @@ -8203,12 +11622,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E6123FFF0003", - "vlebrf %f1, 4095(%r2,%r3), 0", + "vlebrf %v1, 4095(%r2,%r3), 0", )); insns.push(( - Inst::FpuLoadRev32 { + Inst::VecLoadLaneRevUndef { + size: 32, rd: writable_vr(1), mem: MemArg::BXD20 { base: gpr(3), @@ -8216,12 +11637,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(-524288).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E31230008071E61010000003", - "lay %r1, -524288(%r2,%r3) ; vlebrf %f1, 0(%r1), 0", + "lay %r1, -524288(%r2,%r3) ; vlebrf %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuLoadRev32 { + Inst::VecLoadLaneRevUndef { + size: 32, rd: writable_vr(1), mem: MemArg::BXD20 { base: gpr(3), @@ -8229,12 +11652,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(524287).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E3123FFF7F71E61010000003", - "lay %r1, 524287(%r2,%r3) ; vlebrf %f1, 0(%r1), 0", + "lay %r1, 524287(%r2,%r3) ; vlebrf %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuLoadRev64 { + Inst::VecLoadLaneRevUndef { + size: 64, rd: writable_vr(1), mem: MemArg::BXD12 { base: gpr(2), @@ -8242,12 +11667,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E61020000002", - "vlebrg %f1, 0(%r2), 0", + "vlebrg %v1, 0(%r2), 0", )); insns.push(( - Inst::FpuLoadRev64 { + Inst::VecLoadLaneRevUndef { + size: 64, rd: writable_vr(1), mem: MemArg::BXD12 { base: gpr(2), @@ -8255,12 +11682,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E6102FFF0002", - "vlebrg %f1, 4095(%r2), 0", + "vlebrg %v1, 4095(%r2), 0", )); insns.push(( - Inst::FpuLoadRev64 { + Inst::VecLoadLaneRevUndef { + size: 64, rd: writable_vr(1), mem: MemArg::BXD20 { base: gpr(2), @@ -8268,12 +11697,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(-524288).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E31020008071E61010000002", - "lay %r1, -524288(%r2) ; vlebrg %f1, 0(%r1), 0", + "lay %r1, -524288(%r2) ; vlebrg %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuLoadRev64 { + Inst::VecLoadLaneRevUndef { + size: 64, rd: writable_vr(1), mem: MemArg::BXD20 { base: gpr(2), @@ -8281,12 +11712,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(524287).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E3102FFF7F71E61010000002", - "lay %r1, 524287(%r2) ; vlebrg %f1, 0(%r1), 0", + "lay %r1, 524287(%r2) ; vlebrg %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuLoadRev64 { + Inst::VecLoadLaneRevUndef { + size: 64, rd: writable_vr(1), mem: MemArg::BXD12 { base: gpr(3), @@ -8294,12 +11727,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E61230000002", - "vlebrg %f1, 0(%r2,%r3), 0", + "vlebrg %v1, 0(%r2,%r3), 0", )); insns.push(( - Inst::FpuLoadRev64 { + Inst::VecLoadLaneRevUndef { + size: 64, rd: writable_vr(1), mem: MemArg::BXD12 { base: gpr(3), @@ -8307,12 +11742,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, 
+ lane_imm: 0, }, "E6123FFF0002", - "vlebrg %f1, 4095(%r2,%r3), 0", + "vlebrg %v1, 4095(%r2,%r3), 0", )); insns.push(( - Inst::FpuLoadRev64 { + Inst::VecLoadLaneRevUndef { + size: 64, rd: writable_vr(1), mem: MemArg::BXD20 { base: gpr(3), @@ -8320,12 +11757,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(-524288).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E31230008071E61010000002", - "lay %r1, -524288(%r2,%r3) ; vlebrg %f1, 0(%r1), 0", + "lay %r1, -524288(%r2,%r3) ; vlebrg %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuLoadRev64 { + Inst::VecLoadLaneRevUndef { + size: 64, rd: writable_vr(1), mem: MemArg::BXD20 { base: gpr(3), @@ -8333,12 +11772,74 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(524287).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E3123FFF7F71E61010000002", - "lay %r1, 524287(%r2,%r3) ; vlebrg %f1, 0(%r1), 0", + "lay %r1, 524287(%r2,%r3) ; vlebrg %v1, 0(%r1), 0", + )); + insns.push(( + Inst::VecStoreLaneRev { + size: 16, + rd: vr(1), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 7, + }, + "E61020007009", + "vstebrh %v1, 0(%r2), 7", + )); + insns.push(( + Inst::VecStoreLaneRev { + size: 16, + rd: vr(1), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E6102FFF0009", + "vstebrh %v1, 4095(%r2), 0", + )); + insns.push(( + Inst::VecStoreLaneRev { + size: 16, + rd: vr(1), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 7, + }, + "E61230007009", + "vstebrh %v1, 0(%r2,%r3), 7", + )); + insns.push(( + Inst::VecStoreLaneRev { + size: 16, + rd: vr(1), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E6123FFF0009", + "vstebrh %v1, 4095(%r2,%r3), 0", )); insns.push(( - Inst::FpuStoreRev32 { + Inst::VecStoreLaneRev { + size: 32, rd: vr(1), mem: MemArg::BXD12 { base: gpr(2), @@ -8346,12 +11847,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E6102000000B", - "vstebrf %f1, 0(%r2), 0", + "vstebrf %v1, 0(%r2), 0", )); insns.push(( - Inst::FpuStoreRev32 { + Inst::VecStoreLaneRev { + size: 32, rd: vr(1), mem: MemArg::BXD12 { base: gpr(2), @@ -8359,12 +11862,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E6102FFF000B", - "vstebrf %f1, 4095(%r2), 0", + "vstebrf %v1, 4095(%r2), 0", )); insns.push(( - Inst::FpuStoreRev32 { + Inst::VecStoreLaneRev { + size: 32, rd: vr(1), mem: MemArg::BXD20 { base: gpr(2), @@ -8372,12 +11877,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(-524288).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E31020008071E6101000000B", - "lay %r1, -524288(%r2) ; vstebrf %f1, 0(%r1), 0", + "lay %r1, -524288(%r2) ; vstebrf %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuStoreRev32 { + Inst::VecStoreLaneRev { + size: 32, rd: vr(1), mem: MemArg::BXD20 { base: gpr(2), @@ -8385,12 +11892,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(524287).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E3102FFF7F71E6101000000B", - "lay %r1, 524287(%r2) ; vstebrf %f1, 0(%r1), 0", + "lay %r1, 524287(%r2) ; vstebrf %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuStoreRev32 { + 
Inst::VecStoreLaneRev { + size: 32, rd: vr(1), mem: MemArg::BXD12 { base: gpr(3), @@ -8398,12 +11907,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E6123000000B", - "vstebrf %f1, 0(%r2,%r3), 0", + "vstebrf %v1, 0(%r2,%r3), 0", )); insns.push(( - Inst::FpuStoreRev32 { + Inst::VecStoreLaneRev { + size: 32, rd: vr(1), mem: MemArg::BXD12 { base: gpr(3), @@ -8411,12 +11922,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E6123FFF000B", - "vstebrf %f1, 4095(%r2,%r3), 0", + "vstebrf %v1, 4095(%r2,%r3), 0", )); insns.push(( - Inst::FpuStoreRev32 { + Inst::VecStoreLaneRev { + size: 32, rd: vr(1), mem: MemArg::BXD20 { base: gpr(3), @@ -8424,12 +11937,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(-524288).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E31230008071E6101000000B", - "lay %r1, -524288(%r2,%r3) ; vstebrf %f1, 0(%r1), 0", + "lay %r1, -524288(%r2,%r3) ; vstebrf %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuStoreRev32 { + Inst::VecStoreLaneRev { + size: 32, rd: vr(1), mem: MemArg::BXD20 { base: gpr(3), @@ -8437,12 +11952,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(524287).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E3123FFF7F71E6101000000B", - "lay %r1, 524287(%r2,%r3) ; vstebrf %f1, 0(%r1), 0", + "lay %r1, 524287(%r2,%r3) ; vstebrf %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuStoreRev64 { + Inst::VecStoreLaneRev { + size: 64, rd: vr(1), mem: MemArg::BXD12 { base: gpr(2), @@ -8450,12 +11967,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E6102000000A", - "vstebrg %f1, 0(%r2), 0", + "vstebrg %v1, 0(%r2), 0", )); insns.push(( - Inst::FpuStoreRev64 { + Inst::VecStoreLaneRev { + size: 64, rd: vr(1), mem: MemArg::BXD12 { base: gpr(2), @@ -8463,12 +11982,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E6102FFF000A", - "vstebrg %f1, 4095(%r2), 0", + "vstebrg %v1, 4095(%r2), 0", )); insns.push(( - Inst::FpuStoreRev64 { + Inst::VecStoreLaneRev { + size: 64, rd: vr(1), mem: MemArg::BXD20 { base: gpr(2), @@ -8476,12 +11997,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(-524288).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E31020008071E6101000000A", - "lay %r1, -524288(%r2) ; vstebrg %f1, 0(%r1), 0", + "lay %r1, -524288(%r2) ; vstebrg %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuStoreRev64 { + Inst::VecStoreLaneRev { + size: 64, rd: vr(1), mem: MemArg::BXD20 { base: gpr(2), @@ -8489,12 +12012,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(524287).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E3102FFF7F71E6101000000A", - "lay %r1, 524287(%r2) ; vstebrg %f1, 0(%r1), 0", + "lay %r1, 524287(%r2) ; vstebrg %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuStoreRev64 { + Inst::VecStoreLaneRev { + size: 64, rd: vr(1), mem: MemArg::BXD12 { base: gpr(3), @@ -8502,12 +12027,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E6123000000A", - "vstebrg %f1, 0(%r2,%r3), 0", + "vstebrg %v1, 0(%r2,%r3), 0", )); insns.push(( - Inst::FpuStoreRev64 { + Inst::VecStoreLaneRev { + size: 64, rd: vr(1), mem: MemArg::BXD12 { base: gpr(3), @@ -8515,12 +12042,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E6123FFF000A", - 
"vstebrg %f1, 4095(%r2,%r3), 0", + "vstebrg %v1, 4095(%r2,%r3), 0", )); insns.push(( - Inst::FpuStoreRev64 { + Inst::VecStoreLaneRev { + size: 64, rd: vr(1), mem: MemArg::BXD20 { base: gpr(3), @@ -8528,12 +12057,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(-524288).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E31230008071E6101000000A", - "lay %r1, -524288(%r2,%r3) ; vstebrg %f1, 0(%r1), 0", + "lay %r1, -524288(%r2,%r3) ; vstebrg %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuStoreRev64 { + Inst::VecStoreLaneRev { + size: 64, rd: vr(1), mem: MemArg::BXD20 { base: gpr(3), @@ -8541,304 +12072,476 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(524287).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E3123FFF7F71E6101000000A", - "lay %r1, 524287(%r2,%r3) ; vstebrg %f1, 0(%r1), 0", + "lay %r1, 524287(%r2,%r3) ; vstebrg %v1, 0(%r1), 0", )); insns.push(( - Inst::LoadFpuConst32 { + Inst::VecInsertLane { + size: 8, + rd: writable_vr(8), + rn: gpr(4), + lane_imm: 0, + lane_reg: zero_reg(), + }, + "E78400000022", + "vlvgb %v8, %r4, 0", + )); + insns.push(( + Inst::VecInsertLane { + size: 8, + rd: writable_vr(8), + rn: gpr(4), + lane_imm: 255, + lane_reg: zero_reg(), + }, + "E78400FF0022", + "vlvgb %v8, %r4, 255", + )); + insns.push(( + Inst::VecInsertLane { + size: 8, + rd: writable_vr(24), + rn: gpr(4), + lane_imm: 0, + lane_reg: gpr(3), + }, + "E78430000822", + "vlvgb %v24, %r4, 0(%r3)", + )); + insns.push(( + Inst::VecInsertLane { + size: 16, + rd: writable_vr(8), + rn: gpr(4), + lane_imm: 0, + lane_reg: zero_reg(), + }, + "E78400001022", + "vlvgh %v8, %r4, 0", + )); + insns.push(( + Inst::VecInsertLane { + size: 16, + rd: writable_vr(8), + rn: gpr(4), + lane_imm: 255, + lane_reg: zero_reg(), + }, + "E78400FF1022", + "vlvgh %v8, %r4, 255", + )); + insns.push(( + Inst::VecInsertLane { + size: 16, + rd: writable_vr(24), + rn: gpr(4), + lane_imm: 0, + lane_reg: gpr(3), + }, + "E78430001822", + "vlvgh %v24, %r4, 0(%r3)", + )); + insns.push(( + Inst::VecInsertLane { + size: 32, + rd: writable_vr(8), + rn: gpr(4), + lane_imm: 0, + lane_reg: zero_reg(), + }, + "E78400002022", + "vlvgf %v8, %r4, 0", + )); + insns.push(( + Inst::VecInsertLane { + size: 32, + rd: writable_vr(8), + rn: gpr(4), + lane_imm: 255, + lane_reg: zero_reg(), + }, + "E78400FF2022", + "vlvgf %v8, %r4, 255", + )); + insns.push(( + Inst::VecInsertLane { + size: 32, + rd: writable_vr(24), + rn: gpr(4), + lane_imm: 0, + lane_reg: gpr(3), + }, + "E78430002822", + "vlvgf %v24, %r4, 0(%r3)", + )); + insns.push(( + Inst::VecInsertLane { + size: 64, + rd: writable_vr(8), + rn: gpr(4), + lane_imm: 0, + lane_reg: zero_reg(), + }, + "E78400003022", + "vlvgg %v8, %r4, 0", + )); + insns.push(( + Inst::VecInsertLane { + size: 64, rd: writable_vr(8), - const_data: 1.0_f32.to_bits(), + rn: gpr(4), + lane_imm: 255, + lane_reg: zero_reg(), }, - "A71500043F80000078801000", - "bras %r1, 8 ; data.f32 1 ; le %f8, 0(%r1)", + "E78400FF3022", + "vlvgg %v8, %r4, 255", )); insns.push(( - Inst::LoadFpuConst32 { + Inst::VecInsertLane { + size: 64, rd: writable_vr(24), - const_data: 1.0_f32.to_bits(), + rn: gpr(4), + lane_imm: 0, + lane_reg: gpr(3), }, - "A71500043F800000E78010000803", - "bras %r1, 8 ; data.f32 1 ; vlef %v24, 0(%r1), 0", + "E78430003822", + "vlvgg %v24, %r4, 0(%r3)", )); insns.push(( - Inst::LoadFpuConst64 { + Inst::VecInsertLaneUndef { + size: 8, rd: writable_vr(8), - const_data: 1.0_f64.to_bits(), + rn: gpr(4), + lane_imm: 0, + lane_reg: zero_reg(), }, - "A71500063FF000000000000068801000", - "bras 
%r1, 12 ; data.f64 1 ; ld %f8, 0(%r1)", + "E78400000022", + "vlvgb %v8, %r4, 0", )); insns.push(( - Inst::LoadFpuConst64 { + Inst::VecInsertLaneUndef { + size: 8, + rd: writable_vr(8), + rn: gpr(4), + lane_imm: 255, + lane_reg: zero_reg(), + }, + "E78400FF0022", + "vlvgb %v8, %r4, 255", + )); + insns.push(( + Inst::VecInsertLaneUndef { + size: 8, rd: writable_vr(24), - const_data: 1.0_f64.to_bits(), + rn: gpr(4), + lane_imm: 0, + lane_reg: gpr(3), }, - "A71500063FF0000000000000E78010000802", - "bras %r1, 12 ; data.f64 1 ; vleg %v24, 0(%r1), 0", + "E78430000822", + "vlvgb %v24, %r4, 0(%r3)", )); - insns.push(( - Inst::FpuRound { - op: FpuRoundOp::Cvt64To32, - mode: FpuRoundMode::Current, + Inst::VecInsertLaneUndef { + size: 16, rd: writable_vr(8), - rn: vr(12), + rn: gpr(4), + lane_imm: 0, + lane_reg: zero_reg(), }, - "B344008C", - "ledbra %f8, %f12, 0", + "E78400001022", + "vlvgh %v8, %r4, 0", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::Cvt64To32, - mode: FpuRoundMode::ToNearest, + Inst::VecInsertLaneUndef { + size: 16, + rd: writable_vr(8), + rn: gpr(4), + lane_imm: 255, + lane_reg: zero_reg(), + }, + "E78400FF1022", + "vlvgh %v8, %r4, 255", + )); + insns.push(( + Inst::VecInsertLaneUndef { + size: 16, rd: writable_vr(24), - rn: vr(12), + rn: gpr(4), + lane_imm: 0, + lane_reg: gpr(3), }, - "E78C001838C5", - "wledb %v24, %f12, 0, 1", + "E78430001822", + "vlvgh %v24, %r4, 0(%r3)", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::Round32, - mode: FpuRoundMode::ToNegInfinity, + Inst::VecInsertLaneUndef { + size: 32, rd: writable_vr(8), - rn: vr(12), + rn: gpr(4), + lane_imm: 0, + lane_reg: zero_reg(), }, - "B357708C", - "fiebr %f8, %f12, 7", + "E78400002022", + "vlvgf %v8, %r4, 0", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::Round64, - mode: FpuRoundMode::ToNegInfinity, + Inst::VecInsertLaneUndef { + size: 32, rd: writable_vr(8), - rn: vr(12), + rn: gpr(4), + lane_imm: 255, + lane_reg: zero_reg(), }, - "B35F708C", - "fidbr %f8, %f12, 7", + "E78400FF2022", + "vlvgf %v8, %r4, 255", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::Round32, - mode: FpuRoundMode::ToPosInfinity, - rd: writable_vr(8), - rn: vr(12), + Inst::VecInsertLaneUndef { + size: 32, + rd: writable_vr(24), + rn: gpr(4), + lane_imm: 0, + lane_reg: gpr(3), }, - "B357608C", - "fiebr %f8, %f12, 6", + "E78430002822", + "vlvgf %v24, %r4, 0(%r3)", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::Round64, - mode: FpuRoundMode::ToPosInfinity, + Inst::VecInsertLaneUndef { + size: 64, rd: writable_vr(8), - rn: vr(12), + rn: gpr(4), + lane_imm: 0, + lane_reg: zero_reg(), }, - "B35F608C", - "fidbr %f8, %f12, 6", + "B3C10084", + "ldgr %f8, %r4", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::Round32, - mode: FpuRoundMode::ToZero, + Inst::VecInsertLaneUndef { + size: 64, rd: writable_vr(8), - rn: vr(12), + rn: gpr(4), + lane_imm: 255, + lane_reg: zero_reg(), }, - "B357508C", - "fiebr %f8, %f12, 5", + "E78400FF3022", + "vlvgg %v8, %r4, 255", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::Round64, - mode: FpuRoundMode::ToZero, + Inst::VecInsertLaneUndef { + size: 64, rd: writable_vr(8), - rn: vr(12), + rn: gpr(4), + lane_imm: 0, + lane_reg: gpr(3), }, - "B35F508C", - "fidbr %f8, %f12, 5", + "E78430003022", + "vlvgg %v8, %r4, 0(%r3)", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::Round32, - mode: FpuRoundMode::ToNearestTiesToEven, - rd: writable_vr(8), - rn: vr(12), + Inst::VecExtractLane { + size: 8, + rd: writable_gpr(8), + rn: vr(4), + lane_imm: 255, + lane_reg: zero_reg(), }, - 
"B357408C", - "fiebr %f8, %f12, 4", + "E78400FF0021", + "vlgvb %r8, %v4, 255", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::Round64, - mode: FpuRoundMode::ToNearestTiesToEven, - rd: writable_vr(8), - rn: vr(12), + Inst::VecExtractLane { + size: 8, + rd: writable_gpr(8), + rn: vr(20), + lane_imm: 0, + lane_reg: gpr(3), }, - "B35F408C", - "fidbr %f8, %f12, 4", + "E78430000421", + "vlgvb %r8, %v20, 0(%r3)", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::Round32, - mode: FpuRoundMode::ToNearest, - rd: writable_vr(24), - rn: vr(12), + Inst::VecExtractLane { + size: 16, + rd: writable_gpr(8), + rn: vr(4), + lane_imm: 0, + lane_reg: zero_reg(), }, - "E78C001828C7", - "wfisb %v24, %f12, 0, 1", + "E78400001021", + "vlgvh %r8, %v4, 0", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::Round64, - mode: FpuRoundMode::ToNearest, - rd: writable_vr(24), - rn: vr(12), + Inst::VecExtractLane { + size: 16, + rd: writable_gpr(8), + rn: vr(4), + lane_imm: 255, + lane_reg: zero_reg(), }, - "E78C001838C7", - "wfidb %v24, %f12, 0, 1", + "E78400FF1021", + "vlgvh %r8, %v4, 255", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::ToSInt32, - mode: FpuRoundMode::ToNearest, - rd: writable_vr(24), - rn: vr(12), + Inst::VecExtractLane { + size: 16, + rd: writable_gpr(8), + rn: vr(20), + lane_imm: 0, + lane_reg: gpr(3), }, - "E78C001828C2", - "wcfeb %v24, %f12, 0, 1", + "E78430001421", + "vlgvh %r8, %v20, 0(%r3)", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::ToSInt64, - mode: FpuRoundMode::ToNearest, - rd: writable_vr(24), - rn: vr(12), + Inst::VecExtractLane { + size: 32, + rd: writable_gpr(8), + rn: vr(4), + lane_imm: 0, + lane_reg: zero_reg(), }, - "E78C001838C2", - "wcgdb %v24, %f12, 0, 1", + "E78400002021", + "vlgvf %r8, %v4, 0", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::ToUInt32, - mode: FpuRoundMode::ToNearest, - rd: writable_vr(24), - rn: vr(12), + Inst::VecExtractLane { + size: 32, + rd: writable_gpr(8), + rn: vr(4), + lane_imm: 255, + lane_reg: zero_reg(), }, - "E78C001828C0", - "wclfeb %v24, %f12, 0, 1", + "E78400FF2021", + "vlgvf %r8, %v4, 255", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::ToUInt64, - mode: FpuRoundMode::ToNearest, - rd: writable_vr(24), - rn: vr(12), + Inst::VecExtractLane { + size: 32, + rd: writable_gpr(8), + rn: vr(20), + lane_imm: 0, + lane_reg: gpr(3), }, - "E78C001838C0", - "wclgdb %v24, %f12, 0, 1", + "E78430002421", + "vlgvf %r8, %v20, 0(%r3)", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::FromSInt32, - mode: FpuRoundMode::ToNearest, - rd: writable_vr(24), - rn: vr(12), + Inst::VecExtractLane { + size: 64, + rd: writable_gpr(8), + rn: vr(4), + lane_imm: 0, + lane_reg: zero_reg(), }, - "E78C001828C3", - "wcefb %v24, %f12, 0, 1", + "B3CD0084", + "lgdr %r8, %f4", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::FromSInt64, - mode: FpuRoundMode::ToNearest, - rd: writable_vr(24), - rn: vr(12), + Inst::VecExtractLane { + size: 64, + rd: writable_gpr(8), + rn: vr(4), + lane_imm: 255, + lane_reg: zero_reg(), }, - "E78C001838C3", - "wcdgb %v24, %f12, 0, 1", + "E78400FF3021", + "vlgvg %r8, %v4, 255", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::FromUInt32, - mode: FpuRoundMode::ToNearest, - rd: writable_vr(24), - rn: vr(12), + Inst::VecExtractLane { + size: 64, + rd: writable_gpr(8), + rn: vr(4), + lane_imm: 0, + lane_reg: gpr(3), }, - "E78C001828C1", - "wcelfb %v24, %f12, 0, 1", + "E78430003021", + "vlgvg %r8, %v4, 0(%r3)", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::FromUInt64, - mode: 
FpuRoundMode::ToNearest, - rd: writable_vr(24), - rn: vr(12), + Inst::VecInsertLaneImm { + size: 8, + rd: writable_vr(20), + imm: 0x1234, + lane_imm: 15, }, - "E78C001838C1", - "wcdlgb %v24, %f12, 0, 1", + "E7401234F840", + "vleib %v20, 4660, 15", )); - insns.push(( - Inst::VecSelect { - rd: writable_vr(4), - rn: vr(6), - rm: vr(8), - ra: vr(10), + Inst::VecInsertLaneImm { + size: 16, + rd: writable_vr(20), + imm: 0x1234, + lane_imm: 7, }, - "E7468000A08D", - "vsel %v4, %v6, %v8, %v10", + "E74012347841", + "vleih %v20, 4660, 7", )); insns.push(( - Inst::VecSelect { + Inst::VecInsertLaneImm { + size: 32, rd: writable_vr(20), - rn: vr(6), - rm: vr(8), - ra: vr(10), + imm: 0x1234, + lane_imm: 3, }, - "E7468000A88D", - "vsel %v20, %v6, %v8, %v10", + "E74012343843", + "vleif %v20, 4660, 3", )); insns.push(( - Inst::VecSelect { - rd: writable_vr(4), - rn: vr(22), - rm: vr(8), - ra: vr(10), + Inst::VecInsertLaneImm { + size: 64, + rd: writable_vr(20), + imm: 0x1234, + lane_imm: 1, }, - "E7468000A48D", - "vsel %v4, %v22, %v8, %v10", + "E74012341842", + "vleig %v20, 4660, 1", )); insns.push(( - Inst::VecSelect { - rd: writable_vr(4), - rn: vr(6), - rm: vr(24), - ra: vr(10), + Inst::VecReplicateLane { + size: 8, + rd: writable_vr(20), + rn: vr(8), + lane_imm: 15, }, - "E7468000A28D", - "vsel %v4, %v6, %v24, %v10", + "E748000F084D", + "vrepb %v20, %v8, 15", )); insns.push(( - Inst::VecSelect { - rd: writable_vr(4), - rn: vr(6), - rm: vr(8), - ra: vr(26), + Inst::VecReplicateLane { + size: 16, + rd: writable_vr(20), + rn: vr(8), + lane_imm: 7, }, - "E7468000A18D", - "vsel %v4, %v6, %v8, %v26", + "E7480007184D", + "vreph %v20, %v8, 7", )); insns.push(( - Inst::VecSelect { + Inst::VecReplicateLane { + size: 32, rd: writable_vr(20), - rn: vr(22), - rm: vr(24), - ra: vr(26), + rn: vr(8), + lane_imm: 3, }, - "E7468000AF8D", - "vsel %v20, %v22, %v24, %v26", + "E7480003284D", + "vrepf %v20, %v8, 3", + )); + insns.push(( + Inst::VecReplicateLane { + size: 64, + rd: writable_vr(20), + rn: vr(8), + lane_imm: 1, + }, + "E7480001384D", + "vrepg %v20, %v8, 1", )); let flags = settings::Flags::new(settings::builder()); diff --git a/cranelift/codegen/src/isa/s390x/inst/mod.rs b/cranelift/codegen/src/isa/s390x/inst/mod.rs index 676e0d4794f4..ad5af092bc6b 100644 --- a/cranelift/codegen/src/isa/s390x/inst/mod.rs +++ b/cranelift/codegen/src/isa/s390x/inst/mod.rs @@ -2,6 +2,7 @@ use crate::binemit::{Addend, CodeOffset, Reloc}; use crate::ir::{types, ExternalName, Opcode, Type}; +use crate::isa::CallConv; use crate::machinst::*; use crate::{settings, CodegenError, CodegenResult}; use alloc::boxed::Box; @@ -28,7 +29,7 @@ mod emit_tests; pub use crate::isa::s390x::lower::isle::generated_code::{ ALUOp, CmpOp, FPUOp1, FPUOp2, FPUOp3, FpuRoundMode, FpuRoundOp, MInst as Inst, RxSBGOp, - ShiftOp, UnaryOp, + ShiftOp, UnaryOp, VecBinaryOp, VecFloatCmpOp, VecIntCmpOp, VecShiftOp, VecUnaryOp, }; /// Additional information for (direct) Call instructions, left out of line to lower the size of @@ -40,6 +41,8 @@ pub struct CallInfo { pub defs: SmallVec<[Writable; 8]>, pub clobbers: PRegSet, pub opcode: Opcode, + pub caller_callconv: CallConv, + pub callee_callconv: CallConv, } /// Additional information for CallInd instructions, left out of line to lower the size of the Inst @@ -51,6 +54,8 @@ pub struct CallIndInfo { pub defs: SmallVec<[Writable; 8]>, pub clobbers: PRegSet, pub opcode: Opcode, + pub caller_callconv: CallConv, + pub callee_callconv: CallConv, } #[test] @@ -156,22 +161,42 @@ impl Inst { | Inst::FpuMove64 { .. 
} | Inst::FpuCMov32 { .. } | Inst::FpuCMov64 { .. } - | Inst::MovToFpr32 { .. } - | Inst::MovToFpr64 { .. } - | Inst::MovFromFpr32 { .. } - | Inst::MovFromFpr64 { .. } | Inst::FpuRR { .. } | Inst::FpuRRR { .. } | Inst::FpuRRRR { .. } | Inst::FpuCmp32 { .. } | Inst::FpuCmp64 { .. } - | Inst::FpuLoad32 { .. } - | Inst::FpuStore32 { .. } - | Inst::FpuLoad64 { .. } - | Inst::FpuStore64 { .. } | Inst::LoadFpuConst32 { .. } | Inst::LoadFpuConst64 { .. } + | Inst::VecRRR { .. } + | Inst::VecRR { .. } + | Inst::VecShiftRR { .. } | Inst::VecSelect { .. } + | Inst::VecPermute { .. } + | Inst::VecPermuteDWImm { .. } + | Inst::VecIntCmp { .. } + | Inst::VecIntCmpS { .. } + | Inst::VecFloatCmp { .. } + | Inst::VecFloatCmpS { .. } + | Inst::VecLoad { .. } + | Inst::VecStore { .. } + | Inst::VecLoadReplicate { .. } + | Inst::VecMov { .. } + | Inst::VecCMov { .. } + | Inst::MovToVec128 { .. } + | Inst::VecLoadConst { .. } + | Inst::VecLoadConstReplicate { .. } + | Inst::VecImmByteMask { .. } + | Inst::VecImmBitMask { .. } + | Inst::VecImmReplicate { .. } + | Inst::VecLoadLane { .. } + | Inst::VecLoadLaneUndef { .. } + | Inst::VecStoreLane { .. } + | Inst::VecInsertLane { .. } + | Inst::VecInsertLaneUndef { .. } + | Inst::VecExtractLane { .. } + | Inst::VecInsertLaneImm { .. } + | Inst::VecReplicateLane { .. } | Inst::Call { .. } | Inst::CallInd { .. } | Inst::Ret { .. } @@ -207,19 +232,33 @@ impl Inst { Inst::FpuRound { op, .. } => match op { FpuRoundOp::ToSInt32 | FpuRoundOp::FromSInt32 => InstructionSet::MIE2, FpuRoundOp::ToUInt32 | FpuRoundOp::FromUInt32 => InstructionSet::MIE2, + FpuRoundOp::ToSInt32x4 | FpuRoundOp::FromSInt32x4 => InstructionSet::MIE2, + FpuRoundOp::ToUInt32x4 | FpuRoundOp::FromUInt32x4 => InstructionSet::MIE2, _ => InstructionSet::Base, }, // These are all part of VXRS_EXT2 - Inst::FpuLoadRev32 { .. } - | Inst::FpuStoreRev32 { .. } - | Inst::FpuLoadRev64 { .. } - | Inst::FpuStoreRev64 { .. } => InstructionSet::VXRS_EXT2, + Inst::VecLoadRev { .. } + | Inst::VecStoreRev { .. } + | Inst::VecLoadReplicateRev { .. } + | Inst::VecLoadLaneRev { .. } + | Inst::VecLoadLaneRevUndef { .. } + | Inst::VecStoreLaneRev { .. } => InstructionSet::VXRS_EXT2, Inst::DummyUse { .. } => InstructionSet::Base, } } + /// Create a 128-bit move instruction. + pub fn mov128(to_reg: Writable, from_reg: Reg) -> Inst { + assert!(to_reg.to_reg().class() == RegClass::Float); + assert!(from_reg.class() == RegClass::Float); + Inst::VecMov { + rd: to_reg, + rn: from_reg, + } + } + /// Create a 64-bit move instruction. pub fn mov64(to_reg: Writable, from_reg: Reg) -> Inst { assert!(to_reg.to_reg().class() == from_reg.class()); @@ -323,6 +362,17 @@ impl Inst { } } + /// Create an instruction that loads a 128-bit floating-point constant. + pub fn load_vec_constant(rd: Writable, value: u128) -> Inst { + // FIXME: This doesn't special-case constants that can be loaded + // without a constant pool, like the ISLE lowering does. Ideally, + // we should not have to duplicate the logic here. + Inst::VecLoadConst { + rd, + const_data: value, + } + } + /// Generic constructor for a load (zero-extending where appropriate). 
pub fn gen_load(into_reg: Writable, mem: MemArg, ty: Type) -> Inst { match ty { @@ -330,8 +380,19 @@ impl Inst { types::B16 | types::I16 => Inst::Load64ZExt16 { rd: into_reg, mem }, types::B32 | types::I32 => Inst::Load64ZExt32 { rd: into_reg, mem }, types::B64 | types::I64 | types::R64 => Inst::Load64 { rd: into_reg, mem }, - types::F32 => Inst::FpuLoad32 { rd: into_reg, mem }, - types::F64 => Inst::FpuLoad64 { rd: into_reg, mem }, + types::F32 => Inst::VecLoadLaneUndef { + size: 32, + rd: into_reg, + mem, + lane_imm: 0, + }, + types::F64 => Inst::VecLoadLaneUndef { + size: 64, + rd: into_reg, + mem, + lane_imm: 0, + }, + _ if ty.is_vector() && ty.bits() == 128 => Inst::VecLoad { rd: into_reg, mem }, _ => unimplemented!("gen_load({})", ty), } } @@ -343,8 +404,19 @@ impl Inst { types::B16 | types::I16 => Inst::Store16 { rd: from_reg, mem }, types::B32 | types::I32 => Inst::Store32 { rd: from_reg, mem }, types::B64 | types::I64 | types::R64 => Inst::Store64 { rd: from_reg, mem }, - types::F32 => Inst::FpuStore32 { rd: from_reg, mem }, - types::F64 => Inst::FpuStore64 { rd: from_reg, mem }, + types::F32 => Inst::VecStoreLane { + size: 32, + rd: from_reg, + mem, + lane_imm: 0, + }, + types::F64 => Inst::VecStoreLane { + size: 64, + rd: from_reg, + mem, + lane_imm: 0, + }, + _ if ty.is_vector() && ty.bits() == 128 => Inst::VecStore { rd: from_reg, mem }, _ => unimplemented!("gen_store({})", ty), } } @@ -365,6 +437,9 @@ fn memarg_operands VReg>(memarg: &MemArg, collector: &mut Operand } &MemArg::InitialSPOffset { .. } | &MemArg::NominalSPOffset { .. } => {} } + // mem_finalize might require %r1 to hold (part of) the address. + // Conservatively assume this will always be necessary here. + collector.reg_early_def(writable_gpr(1)); } fn s390x_get_operands VReg>(inst: &Inst, collector: &mut OperandCollector<'_, F>) { @@ -579,13 +654,6 @@ fn s390x_get_operands VReg>(inst: &Inst, collector: &mut OperandC collector.reg_mod(rd); collector.reg_use(rm); } - &Inst::MovToFpr32 { rd, rn } - | &Inst::MovToFpr64 { rd, rn } - | &Inst::MovFromFpr32 { rd, rn } - | &Inst::MovFromFpr64 { rd, rn } => { - collector.reg_def(rd); - collector.reg_use(rn); - } &Inst::FpuRR { rd, rn, .. } => { collector.reg_def(rd); collector.reg_use(rn); @@ -605,50 +673,158 @@ fn s390x_get_operands VReg>(inst: &Inst, collector: &mut OperandC collector.reg_use(rn); collector.reg_use(rm); } - &Inst::FpuLoad32 { rd, ref mem, .. } => { + &Inst::LoadFpuConst32 { rd, .. } | &Inst::LoadFpuConst64 { rd, .. } => { + collector.reg_def(rd); + collector.reg_def(writable_gpr(1)); + } + &Inst::FpuRound { rd, rn, .. } => { + collector.reg_def(rd); + collector.reg_use(rn); + } + &Inst::VecRRR { rd, rn, rm, .. } => { + collector.reg_def(rd); + collector.reg_use(rn); + collector.reg_use(rm); + } + &Inst::VecRR { rd, rn, .. } => { + collector.reg_def(rd); + collector.reg_use(rn); + } + &Inst::VecShiftRR { + rd, rn, shift_reg, .. + } => { + collector.reg_def(rd); + collector.reg_use(rn); + collector.reg_use(shift_reg); + } + &Inst::VecSelect { rd, rn, rm, ra, .. } => { + collector.reg_def(rd); + collector.reg_use(rn); + collector.reg_use(rm); + collector.reg_use(ra); + } + &Inst::VecPermute { rd, rn, rm, ra, .. } => { + collector.reg_def(rd); + collector.reg_use(rn); + collector.reg_use(rm); + collector.reg_use(ra); + } + &Inst::VecPermuteDWImm { rd, rn, rm, .. } => { + collector.reg_def(rd); + collector.reg_use(rn); + collector.reg_use(rm); + } + &Inst::VecIntCmp { rd, rn, rm, .. } | &Inst::VecIntCmpS { rd, rn, rm, .. 
} => { + collector.reg_def(rd); + collector.reg_use(rn); + collector.reg_use(rm); + } + &Inst::VecFloatCmp { rd, rn, rm, .. } | &Inst::VecFloatCmpS { rd, rn, rm, .. } => { + collector.reg_def(rd); + collector.reg_use(rn); + collector.reg_use(rm); + } + &Inst::VecLoad { rd, ref mem, .. } => { collector.reg_def(rd); memarg_operands(mem, collector); } - &Inst::FpuLoad64 { rd, ref mem, .. } => { + &Inst::VecLoadRev { rd, ref mem, .. } => { collector.reg_def(rd); memarg_operands(mem, collector); } - &Inst::FpuStore32 { rd, ref mem, .. } => { + &Inst::VecStore { rd, ref mem, .. } => { collector.reg_use(rd); memarg_operands(mem, collector); } - &Inst::FpuStore64 { rd, ref mem, .. } => { + &Inst::VecStoreRev { rd, ref mem, .. } => { collector.reg_use(rd); memarg_operands(mem, collector); } - &Inst::FpuLoadRev32 { rd, ref mem, .. } => { + &Inst::VecLoadReplicate { rd, ref mem, .. } => { + collector.reg_def(rd); + memarg_operands(mem, collector); + } + &Inst::VecLoadReplicateRev { rd, ref mem, .. } => { collector.reg_def(rd); memarg_operands(mem, collector); } - &Inst::FpuLoadRev64 { rd, ref mem, .. } => { + &Inst::VecMov { rd, rn } => { + collector.reg_def(rd); + collector.reg_use(rn); + } + &Inst::VecCMov { rd, rm, .. } => { + collector.reg_mod(rd); + collector.reg_use(rm); + } + &Inst::MovToVec128 { rd, rn, rm } => { collector.reg_def(rd); + collector.reg_use(rn); + collector.reg_use(rm); + } + &Inst::VecLoadConst { rd, .. } | &Inst::VecLoadConstReplicate { rd, .. } => { + collector.reg_def(rd); + collector.reg_def(writable_gpr(1)); + } + &Inst::VecImmByteMask { rd, .. } => { + collector.reg_def(rd); + } + &Inst::VecImmBitMask { rd, .. } => { + collector.reg_def(rd); + } + &Inst::VecImmReplicate { rd, .. } => { + collector.reg_def(rd); + } + &Inst::VecLoadLane { rd, ref mem, .. } => { + collector.reg_mod(rd); memarg_operands(mem, collector); } - &Inst::FpuStoreRev32 { rd, ref mem, .. } => { + &Inst::VecLoadLaneUndef { rd, ref mem, .. } => { + collector.reg_def(rd); + memarg_operands(mem, collector); + } + &Inst::VecStoreLaneRev { rd, ref mem, .. } => { collector.reg_use(rd); memarg_operands(mem, collector); } - &Inst::FpuStoreRev64 { rd, ref mem, .. } => { + &Inst::VecLoadLaneRevUndef { rd, ref mem, .. } => { + collector.reg_def(rd); + memarg_operands(mem, collector); + } + &Inst::VecStoreLane { rd, ref mem, .. } => { collector.reg_use(rd); memarg_operands(mem, collector); } - &Inst::LoadFpuConst32 { rd, .. } | &Inst::LoadFpuConst64 { rd, .. } => { + &Inst::VecLoadLaneRev { rd, ref mem, .. } => { + collector.reg_mod(rd); + memarg_operands(mem, collector); + } + &Inst::VecInsertLane { + rd, rn, lane_reg, .. + } => { + collector.reg_mod(rd); + collector.reg_use(rn); + collector.reg_use(lane_reg); + } + &Inst::VecInsertLaneUndef { + rd, rn, lane_reg, .. + } => { collector.reg_def(rd); + collector.reg_use(rn); + collector.reg_use(lane_reg); } - &Inst::FpuRound { rd, rn, .. } => { + &Inst::VecExtractLane { + rd, rn, lane_reg, .. + } => { collector.reg_def(rd); collector.reg_use(rn); + collector.reg_use(lane_reg); } - &Inst::VecSelect { rd, rn, rm, ra, .. } => { + &Inst::VecInsertLaneImm { rd, .. } => { + collector.reg_def(rd); + } + &Inst::VecReplicateLane { rd, rn, .. } => { collector.reg_def(rd); collector.reg_use(rn); - collector.reg_use(rm); - collector.reg_use(ra); } &Inst::Extend { rd, rn, .. } => { collector.reg_def(rd); @@ -682,9 +858,11 @@ fn s390x_get_operands VReg>(inst: &Inst, collector: &mut OperandC &Inst::TrapIf { .. } => {} &Inst::JTSequence { ridx, .. 
} => { collector.reg_use(ridx); + collector.reg_early_def(writable_gpr(1)); } &Inst::LoadExtNameFar { rd, .. } => { collector.reg_def(rd); + collector.reg_def(writable_gpr(1)); } &Inst::LoadAddr { rd, ref mem } => { collector.reg_def(rd); @@ -720,6 +898,7 @@ impl MachInst for Inst { &Inst::Mov64 { rd, rm } => Some((rd, rm)), &Inst::FpuMove32 { rd, rn } => Some((rd, rn)), &Inst::FpuMove64 { rd, rn } => Some((rd, rn)), + &Inst::VecMov { rd, rn } => Some((rd, rn)), _ => None, } } @@ -732,6 +911,21 @@ impl MachInst for Inst { } } + fn is_included_in_clobbers(&self) -> bool { + // We exclude call instructions from the clobber-set when they are calls + // from caller to callee with the same ABI. Such calls cannot possibly + // force any new registers to be saved in the prologue, because anything + // that the callee clobbers, the caller is also allowed to clobber. This + // both saves work and enables us to more precisely follow the + // half-caller-save, half-callee-save SysV ABI for some vector + // registers. + match self { + &Inst::Call { ref info, .. } => info.caller_callconv != info.callee_callconv, + &Inst::CallInd { ref info, .. } => info.caller_callconv != info.callee_callconv, + _ => true, + } + } + fn is_term(&self) -> MachTerminator { match self { &Inst::Ret { .. } | &Inst::EpiloguePlaceholder => MachTerminator::Ret, @@ -761,11 +955,13 @@ impl MachInst for Inst { } fn gen_move(to_reg: Writable, from_reg: Reg, ty: Type) -> Inst { - assert!(ty.bits() <= 64); + assert!(ty.bits() <= 128); if ty.bits() <= 32 { Inst::mov32(to_reg, from_reg) - } else { + } else if ty.bits() <= 64 { Inst::mov64(to_reg, from_reg) + } else { + Inst::mov128(to_reg, from_reg) } } @@ -778,11 +974,18 @@ impl MachInst for Inst { let to_reg = to_regs .only_reg() .expect("multi-reg values not supported yet"); - let value = value as u64; match ty { + _ if ty.is_vector() && ty.bits() == 128 => { + let mut ret = SmallVec::new(); + ret.push(Inst::load_vec_constant(to_reg, value)); + ret + } types::F64 => { let mut ret = SmallVec::new(); - ret.push(Inst::load_fp_constant64(to_reg, f64::from_bits(value))); + ret.push(Inst::load_fp_constant64( + to_reg, + f64::from_bits(value as u64), + )); ret } types::F32 => { @@ -793,7 +996,7 @@ impl MachInst for Inst { )); ret } - types::I64 | types::B64 | types::R64 => Inst::load_constant64(to_reg, value), + types::I64 | types::B64 | types::R64 => Inst::load_constant64(to_reg, value as u64), types::B1 | types::I8 | types::B8 @@ -832,6 +1035,7 @@ impl MachInst for Inst { types::F64 => Ok((&[RegClass::Float], &[types::F64])), types::I128 => Ok((&[RegClass::Int, RegClass::Int], &[types::I64, types::I64])), types::B128 => Ok((&[RegClass::Int, RegClass::Int], &[types::B64, types::B64])), + _ if ty.is_vector() && ty.bits() == 128 => Ok((&[RegClass::Float], &[types::I8X16])), // FIXME: We don't really have IFLAGS, but need to allow it here // for now to support the SelectifSpectreGuard instruction. 
types::IFLAGS => Ok((&[RegClass::Int], &[types::I64])), @@ -845,7 +1049,7 @@ impl MachInst for Inst { fn canonical_type_for_rc(rc: RegClass) -> Type { match rc { RegClass::Int => types::I64, - RegClass::Float => types::F64, + RegClass::Float => types::I8X16, } } @@ -1497,43 +1701,6 @@ impl Inst { let mem = mem.pretty_print_default(); format!("{}{} {}, {}", mem_str, op.unwrap(), rd, mem) } - &Inst::FpuLoad32 { rd, ref mem } - | &Inst::FpuLoad64 { rd, ref mem } - | &Inst::FpuLoadRev32 { rd, ref mem } - | &Inst::FpuLoadRev64 { rd, ref mem } => { - let (opcode_rx, opcode_rxy, opcode_vrx) = match self { - &Inst::FpuLoad32 { .. } => (Some("le"), Some("ley"), "vlef"), - &Inst::FpuLoad64 { .. } => (Some("ld"), Some("ldy"), "vleg"), - &Inst::FpuLoadRev32 { .. } => (None, None, "vlebrf"), - &Inst::FpuLoadRev64 { .. } => (None, None, "vlebrg"), - _ => unreachable!(), - }; - - let (rd, rd_fpr) = pretty_print_fpr(rd.to_reg(), allocs); - let mem = mem.with_allocs(allocs); - if rd_fpr.is_some() && opcode_rx.is_some() { - let (mem_str, mem) = - mem_finalize_for_show(&mem, state, true, true, false, true); - let op = match &mem { - &MemArg::BXD12 { .. } => opcode_rx, - &MemArg::BXD20 { .. } => opcode_rxy, - _ => unreachable!(), - }; - let mem = mem.pretty_print_default(); - format!("{}{} {}, {}", mem_str, op.unwrap(), rd_fpr.unwrap(), mem) - } else { - let (mem_str, mem) = - mem_finalize_for_show(&mem, state, true, false, false, true); - let mem = mem.pretty_print_default(); - format!( - "{}{} {}, {}, 0", - mem_str, - opcode_vrx, - rd_fpr.unwrap_or(rd), - mem - ) - } - } &Inst::Store8 { rd, ref mem } | &Inst::Store16 { rd, ref mem } | &Inst::Store32 { rd, ref mem } @@ -1599,43 +1766,6 @@ impl Inst { format!("{}{} {}, {}", mem_str, op, mem, imm) } - &Inst::FpuStore32 { rd, ref mem } - | &Inst::FpuStore64 { rd, ref mem } - | &Inst::FpuStoreRev32 { rd, ref mem } - | &Inst::FpuStoreRev64 { rd, ref mem } => { - let (opcode_rx, opcode_rxy, opcode_vrx) = match self { - &Inst::FpuStore32 { .. } => (Some("ste"), Some("stey"), "vstef"), - &Inst::FpuStore64 { .. } => (Some("std"), Some("stdy"), "vsteg"), - &Inst::FpuStoreRev32 { .. } => (None, None, "vstebrf"), - &Inst::FpuStoreRev64 { .. } => (None, None, "vstebrg"), - _ => unreachable!(), - }; - - let (rd, rd_fpr) = pretty_print_fpr(rd, allocs); - let mem = mem.with_allocs(allocs); - if rd_fpr.is_some() && opcode_rx.is_some() { - let (mem_str, mem) = - mem_finalize_for_show(&mem, state, true, true, false, true); - let op = match &mem { - &MemArg::BXD12 { .. } => opcode_rx, - &MemArg::BXD20 { .. 
} => opcode_rxy, - _ => unreachable!(), - }; - let mem = mem.pretty_print_default(); - format!("{}{} {}, {}", mem_str, op.unwrap(), rd_fpr.unwrap(), mem) - } else { - let (mem_str, mem) = - mem_finalize_for_show(&mem, state, true, false, false, true); - let mem = mem.pretty_print_default(); - format!( - "{}{} {}, {}, 0", - mem_str, - opcode_vrx, - rd_fpr.unwrap_or(rd), - mem - ) - } - } &Inst::LoadMultiple64 { rt, rt2, ref mem } => { let mem = mem.with_allocs(allocs); let (mem_str, mem) = mem_finalize_for_show(&mem, state, false, true, false, false); @@ -1780,69 +1910,77 @@ impl Inst { format!("j{} 10 ; vlr {}, {}", cond, rd, rm) } } - &Inst::MovToFpr32 { rd, rn } => { - let rd = pretty_print_reg(rd.to_reg(), allocs); - let rn = pretty_print_reg(rn, allocs); - format!("vlvgf {}, {}, 0", rd, rn) - } - &Inst::MovToFpr64 { rd, rn } => { - let (rd, rd_fpr) = pretty_print_fpr(rd.to_reg(), allocs); - let rn = pretty_print_reg(rn, allocs); - if rd_fpr.is_some() { - format!("ldgr {}, {}", rd_fpr.unwrap(), rn) - } else { - format!("vlvgg {}, {}, 0", rd, rn) - } - } - &Inst::MovFromFpr32 { rd, rn } => { - let rd = pretty_print_reg(rd.to_reg(), allocs); - let rn = pretty_print_reg(rn, allocs); - format!("vlgvf {}, {}, 0", rd, rn) - } - &Inst::MovFromFpr64 { rd, rn } => { - let rd = pretty_print_reg(rd.to_reg(), allocs); - let (rn, rn_fpr) = pretty_print_fpr(rn, allocs); - if rn_fpr.is_some() { - format!("lgdr {}, {}", rd, rn_fpr.unwrap()) - } else { - format!("vlgvg {}, {}, 0", rd, rn) - } - } &Inst::FpuRR { fpu_op, rd, rn } => { let (op, op_fpr) = match fpu_op { - FPUOp1::Abs32 => ("wflpsb", "lpebr"), - FPUOp1::Abs64 => ("wflpdb", "lpdbr"), - FPUOp1::Neg32 => ("wflcsb", "lcebr"), - FPUOp1::Neg64 => ("wflcdb", "lcdbr"), - FPUOp1::NegAbs32 => ("wflnsb", "lnebr"), - FPUOp1::NegAbs64 => ("wflndb", "lndbr"), - FPUOp1::Sqrt32 => ("wfsqsb", "sqebr"), - FPUOp1::Sqrt64 => ("wfsqdb", "sqdbr"), - FPUOp1::Cvt32To64 => ("wldeb", "ldebr"), + FPUOp1::Abs32 => ("wflpsb", Some("lpebr")), + FPUOp1::Abs64 => ("wflpdb", Some("lpdbr")), + FPUOp1::Abs32x4 => ("vflpsb", None), + FPUOp1::Abs64x2 => ("vflpdb", None), + FPUOp1::Neg32 => ("wflcsb", Some("lcebr")), + FPUOp1::Neg64 => ("wflcdb", Some("lcdbr")), + FPUOp1::Neg32x4 => ("vflcsb", None), + FPUOp1::Neg64x2 => ("vflcdb", None), + FPUOp1::NegAbs32 => ("wflnsb", Some("lnebr")), + FPUOp1::NegAbs64 => ("wflndb", Some("lndbr")), + FPUOp1::NegAbs32x4 => ("vflnsb", None), + FPUOp1::NegAbs64x2 => ("vflndb", None), + FPUOp1::Sqrt32 => ("wfsqsb", Some("sqebr")), + FPUOp1::Sqrt64 => ("wfsqdb", Some("sqdbr")), + FPUOp1::Sqrt32x4 => ("vfsqsb", None), + FPUOp1::Sqrt64x2 => ("vfsqdb", None), + FPUOp1::Cvt32To64 => ("wldeb", Some("ldebr")), + FPUOp1::Cvt32x4To64x2 => ("vldeb", None), }; let (rd, rd_fpr) = pretty_print_fpr(rd.to_reg(), allocs); let (rn, rn_fpr) = pretty_print_fpr(rn, allocs); - if rd_fpr.is_some() && rn_fpr.is_some() { - format!("{} {}, {}", op_fpr, rd_fpr.unwrap(), rn_fpr.unwrap()) - } else { + if op_fpr.is_some() && rd_fpr.is_some() && rn_fpr.is_some() { + format!( + "{} {}, {}", + op_fpr.unwrap(), + rd_fpr.unwrap(), + rn_fpr.unwrap() + ) + } else if op.starts_with('w') { format!("{} {}, {}", op, rd_fpr.unwrap_or(rd), rn_fpr.unwrap_or(rn)) + } else { + format!("{} {}, {}", op, rd, rn) } } &Inst::FpuRRR { fpu_op, rd, rn, rm } => { let (op, opt_m6, op_fpr) = match fpu_op { FPUOp2::Add32 => ("wfasb", "", Some("aebr")), FPUOp2::Add64 => ("wfadb", "", Some("adbr")), + FPUOp2::Add32x4 => ("vfasb", "", None), + FPUOp2::Add64x2 => ("vfadb", "", None), FPUOp2::Sub32 => 
("wfssb", "", Some("sebr")), FPUOp2::Sub64 => ("wfsdb", "", Some("sdbr")), + FPUOp2::Sub32x4 => ("vfssb", "", None), + FPUOp2::Sub64x2 => ("vfsdb", "", None), FPUOp2::Mul32 => ("wfmsb", "", Some("meebr")), FPUOp2::Mul64 => ("wfmdb", "", Some("mdbr")), + FPUOp2::Mul32x4 => ("vfmsb", "", None), + FPUOp2::Mul64x2 => ("vfmdb", "", None), FPUOp2::Div32 => ("wfdsb", "", Some("debr")), FPUOp2::Div64 => ("wfddb", "", Some("ddbr")), + FPUOp2::Div32x4 => ("vfdsb", "", None), + FPUOp2::Div64x2 => ("vfddb", "", None), FPUOp2::Max32 => ("wfmaxsb", ", 1", None), FPUOp2::Max64 => ("wfmaxdb", ", 1", None), + FPUOp2::Max32x4 => ("vfmaxsb", ", 1", None), + FPUOp2::Max64x2 => ("vfmaxdb", ", 1", None), FPUOp2::Min32 => ("wfminsb", ", 1", None), FPUOp2::Min64 => ("wfmindb", ", 1", None), + FPUOp2::Min32x4 => ("vfminsb", ", 1", None), + FPUOp2::Min64x2 => ("vfmindb", ", 1", None), + FPUOp2::MaxPseudo32 => ("wfmaxsb", ", 3", None), + FPUOp2::MaxPseudo64 => ("wfmaxdb", ", 3", None), + FPUOp2::MaxPseudo32x4 => ("vfmaxsb", ", 3", None), + FPUOp2::MaxPseudo64x2 => ("vfmaxdb", ", 3", None), + FPUOp2::MinPseudo32 => ("wfminsb", ", 3", None), + FPUOp2::MinPseudo64 => ("wfmindb", ", 3", None), + FPUOp2::MinPseudo32x4 => ("vfminsb", ", 3", None), + FPUOp2::MinPseudo64x2 => ("vfmindb", ", 3", None), }; let (rd, rd_fpr) = pretty_print_fpr(rd.to_reg(), allocs); @@ -1855,7 +1993,7 @@ impl Inst { rd_fpr.unwrap(), rm_fpr.unwrap() ) - } else { + } else if op.starts_with('w') { format!( "{} {}, {}, {}{}", op, @@ -1864,6 +2002,8 @@ impl Inst { rm_fpr.unwrap_or(rm), opt_m6 ) + } else { + format!("{} {}, {}, {}{}", op, rd, rn, rm, opt_m6) } } &Inst::FpuRRRR { @@ -1874,25 +2014,34 @@ impl Inst { ra, } => { let (op, op_fpr) = match fpu_op { - FPUOp3::MAdd32 => ("wfmasb", "maebr"), - FPUOp3::MAdd64 => ("wfmadb", "madbr"), - FPUOp3::MSub32 => ("wfmssb", "msebr"), - FPUOp3::MSub64 => ("wfmsdb", "msdbr"), + FPUOp3::MAdd32 => ("wfmasb", Some("maebr")), + FPUOp3::MAdd64 => ("wfmadb", Some("madbr")), + FPUOp3::MAdd32x4 => ("vfmasb", None), + FPUOp3::MAdd64x2 => ("vfmadb", None), + FPUOp3::MSub32 => ("wfmssb", Some("msebr")), + FPUOp3::MSub64 => ("wfmsdb", Some("msdbr")), + FPUOp3::MSub32x4 => ("vfmssb", None), + FPUOp3::MSub64x2 => ("vfmsdb", None), }; let (rd, rd_fpr) = pretty_print_fpr(rd.to_reg(), allocs); let (rn, rn_fpr) = pretty_print_fpr(rn, allocs); let (rm, rm_fpr) = pretty_print_fpr(rm, allocs); let (ra, ra_fpr) = pretty_print_fpr(ra, allocs); - if rd == ra && rd_fpr.is_some() && rn_fpr.is_some() && rm_fpr.is_some() { + if op_fpr.is_some() + && rd == ra + && rd_fpr.is_some() + && rn_fpr.is_some() + && rm_fpr.is_some() + { format!( "{} {}, {}, {}", - op_fpr, + op_fpr.unwrap(), rd_fpr.unwrap(), rn_fpr.unwrap(), rm_fpr.unwrap() ) - } else { + } else if op.starts_with('w') { format!( "{} {}, {}, {}, {}", op, @@ -1901,6 +2050,8 @@ impl Inst { rm_fpr.unwrap_or(rm), ra_fpr.unwrap_or(ra) ) + } else { + format!("{} {}, {}, {}, {}", op, rd, rn, rm, ra) } } &Inst::FpuCmp32 { rn, rm } => { @@ -1975,16 +2126,27 @@ impl Inst { }; let (opcode, opcode_fpr) = match op { FpuRoundOp::Cvt64To32 => ("wledb", Some("ledbra")), + FpuRoundOp::Cvt64x2To32x4 => ("vledb", None), FpuRoundOp::Round32 => ("wfisb", Some("fiebr")), FpuRoundOp::Round64 => ("wfidb", Some("fidbr")), + FpuRoundOp::Round32x4 => ("vfisb", None), + FpuRoundOp::Round64x2 => ("vfidb", None), FpuRoundOp::ToSInt32 => ("wcfeb", None), FpuRoundOp::ToSInt64 => ("wcgdb", None), FpuRoundOp::ToUInt32 => ("wclfeb", None), FpuRoundOp::ToUInt64 => ("wclgdb", None), + FpuRoundOp::ToSInt32x4 => 
("vcfeb", None), + FpuRoundOp::ToSInt64x2 => ("vcgdb", None), + FpuRoundOp::ToUInt32x4 => ("vclfeb", None), + FpuRoundOp::ToUInt64x2 => ("vclgdb", None), FpuRoundOp::FromSInt32 => ("wcefb", None), FpuRoundOp::FromSInt64 => ("wcdgb", None), FpuRoundOp::FromUInt32 => ("wcelfb", None), FpuRoundOp::FromUInt64 => ("wcdlgb", None), + FpuRoundOp::FromSInt32x4 => ("vcefb", None), + FpuRoundOp::FromSInt64x2 => ("vcdgb", None), + FpuRoundOp::FromUInt32x4 => ("vcelfb", None), + FpuRoundOp::FromUInt64x2 => ("vcdlgb", None), }; let (rd, rd_fpr) = pretty_print_fpr(rd.to_reg(), allocs); @@ -1997,7 +2159,7 @@ impl Inst { rn_fpr.unwrap(), mode ) - } else { + } else if opcode.starts_with('w') { format!( "{} {}, {}, 0, {}", opcode, @@ -2005,8 +2167,168 @@ impl Inst { rn_fpr.unwrap_or(rn), mode ) + } else { + format!("{} {}, {}, 0, {}", opcode, rd, rn, mode) } } + &Inst::VecRRR { op, rd, rn, rm } => { + let op = match op { + VecBinaryOp::Add8x16 => "vab", + VecBinaryOp::Add16x8 => "vah", + VecBinaryOp::Add32x4 => "vaf", + VecBinaryOp::Add64x2 => "vag", + VecBinaryOp::Sub8x16 => "vsb", + VecBinaryOp::Sub16x8 => "vsh", + VecBinaryOp::Sub32x4 => "vsf", + VecBinaryOp::Sub64x2 => "vsg", + VecBinaryOp::Mul8x16 => "vmlb", + VecBinaryOp::Mul16x8 => "vmlhw", + VecBinaryOp::Mul32x4 => "vmlf", + VecBinaryOp::UMulHi8x16 => "vmlhb", + VecBinaryOp::UMulHi16x8 => "vmlhh", + VecBinaryOp::UMulHi32x4 => "vmlhf", + VecBinaryOp::SMulHi8x16 => "vmhb", + VecBinaryOp::SMulHi16x8 => "vmhh", + VecBinaryOp::SMulHi32x4 => "vmhf", + VecBinaryOp::UMulEven8x16 => "vmleb", + VecBinaryOp::UMulEven16x8 => "vmleh", + VecBinaryOp::UMulEven32x4 => "vmlef", + VecBinaryOp::SMulEven8x16 => "vmeb", + VecBinaryOp::SMulEven16x8 => "vmeh", + VecBinaryOp::SMulEven32x4 => "vmef", + VecBinaryOp::UMulOdd8x16 => "vmlob", + VecBinaryOp::UMulOdd16x8 => "vmloh", + VecBinaryOp::UMulOdd32x4 => "vmlof", + VecBinaryOp::SMulOdd8x16 => "vmob", + VecBinaryOp::SMulOdd16x8 => "vmoh", + VecBinaryOp::SMulOdd32x4 => "vmof", + VecBinaryOp::UMax8x16 => "vmxlb", + VecBinaryOp::UMax16x8 => "vmxlh", + VecBinaryOp::UMax32x4 => "vmxlf", + VecBinaryOp::UMax64x2 => "vmxlg", + VecBinaryOp::SMax8x16 => "vmxb", + VecBinaryOp::SMax16x8 => "vmxh", + VecBinaryOp::SMax32x4 => "vmxf", + VecBinaryOp::SMax64x2 => "vmxg", + VecBinaryOp::UMin8x16 => "vmnlb", + VecBinaryOp::UMin16x8 => "vmnlh", + VecBinaryOp::UMin32x4 => "vmnlf", + VecBinaryOp::UMin64x2 => "vmnlg", + VecBinaryOp::SMin8x16 => "vmnb", + VecBinaryOp::SMin16x8 => "vmnh", + VecBinaryOp::SMin32x4 => "vmnf", + VecBinaryOp::SMin64x2 => "vmng", + VecBinaryOp::UAvg8x16 => "vavglb", + VecBinaryOp::UAvg16x8 => "vavglh", + VecBinaryOp::UAvg32x4 => "vavglf", + VecBinaryOp::UAvg64x2 => "vavglg", + VecBinaryOp::SAvg8x16 => "vavgb", + VecBinaryOp::SAvg16x8 => "vavgh", + VecBinaryOp::SAvg32x4 => "vavgf", + VecBinaryOp::SAvg64x2 => "vavgg", + VecBinaryOp::And128 => "vn", + VecBinaryOp::Orr128 => "vo", + VecBinaryOp::Xor128 => "vx", + VecBinaryOp::NotAnd128 => "vnn", + VecBinaryOp::NotOrr128 => "vno", + VecBinaryOp::NotXor128 => "vnx", + VecBinaryOp::AndNot128 => "vnc", + VecBinaryOp::OrrNot128 => "voc", + VecBinaryOp::BitPermute128 => "vbperm", + VecBinaryOp::LShLByByte128 => "vslb", + VecBinaryOp::LShRByByte128 => "vsrlb", + VecBinaryOp::AShRByByte128 => "vsrab", + VecBinaryOp::LShLByBit128 => "vsl", + VecBinaryOp::LShRByBit128 => "vsrl", + VecBinaryOp::AShRByBit128 => "vsra", + VecBinaryOp::Pack16x8 => "vpkh", + VecBinaryOp::Pack32x4 => "vpkf", + VecBinaryOp::Pack64x2 => "vpkg", + VecBinaryOp::PackUSat16x8 => "vpklsh", + 
VecBinaryOp::PackUSat32x4 => "vpklsf", + VecBinaryOp::PackUSat64x2 => "vpklsg", + VecBinaryOp::PackSSat16x8 => "vpksh", + VecBinaryOp::PackSSat32x4 => "vpksf", + VecBinaryOp::PackSSat64x2 => "vpksg", + VecBinaryOp::MergeLow8x16 => "vmrlb", + VecBinaryOp::MergeLow16x8 => "vmrlh", + VecBinaryOp::MergeLow32x4 => "vmrlf", + VecBinaryOp::MergeLow64x2 => "vmrlg", + VecBinaryOp::MergeHigh8x16 => "vmrhb", + VecBinaryOp::MergeHigh16x8 => "vmrhh", + VecBinaryOp::MergeHigh32x4 => "vmrhf", + VecBinaryOp::MergeHigh64x2 => "vmrhg", + }; + let rd = pretty_print_reg(rd.to_reg(), allocs); + let rn = pretty_print_reg(rn, allocs); + let rm = pretty_print_reg(rm, allocs); + format!("{} {}, {}, {}", op, rd, rn, rm) + } + &Inst::VecRR { op, rd, rn } => { + let op = match op { + VecUnaryOp::Abs8x16 => "vlpb", + VecUnaryOp::Abs16x8 => "vlph", + VecUnaryOp::Abs32x4 => "vlpf", + VecUnaryOp::Abs64x2 => "vlpg", + VecUnaryOp::Neg8x16 => "vlcb", + VecUnaryOp::Neg16x8 => "vlch", + VecUnaryOp::Neg32x4 => "vlcf", + VecUnaryOp::Neg64x2 => "vlcg", + VecUnaryOp::Popcnt8x16 => "vpopctb", + VecUnaryOp::Popcnt16x8 => "vpopcth", + VecUnaryOp::Popcnt32x4 => "vpopctf", + VecUnaryOp::Popcnt64x2 => "vpopctg", + VecUnaryOp::UnpackULow8x16 => "vupllb", + VecUnaryOp::UnpackULow16x8 => "vupllh", + VecUnaryOp::UnpackULow32x4 => "vupllf", + VecUnaryOp::UnpackUHigh8x16 => "vuplhb", + VecUnaryOp::UnpackUHigh16x8 => "vuplhh", + VecUnaryOp::UnpackUHigh32x4 => "vuplhf", + VecUnaryOp::UnpackSLow8x16 => "vuplb", + VecUnaryOp::UnpackSLow16x8 => "vuplh", + VecUnaryOp::UnpackSLow32x4 => "vuplf", + VecUnaryOp::UnpackSHigh8x16 => "vuphb", + VecUnaryOp::UnpackSHigh16x8 => "vuphh", + VecUnaryOp::UnpackSHigh32x4 => "vuphf", + }; + let rd = pretty_print_reg(rd.to_reg(), allocs); + let rn = pretty_print_reg(rn, allocs); + format!("{} {}, {}", op, rd, rn) + } + &Inst::VecShiftRR { + shift_op, + rd, + rn, + shift_imm, + shift_reg, + } => { + let op = match shift_op { + VecShiftOp::RotL8x16 => "verllb", + VecShiftOp::RotL16x8 => "verllh", + VecShiftOp::RotL32x4 => "verllf", + VecShiftOp::RotL64x2 => "verllg", + VecShiftOp::LShL8x16 => "veslb", + VecShiftOp::LShL16x8 => "veslh", + VecShiftOp::LShL32x4 => "veslf", + VecShiftOp::LShL64x2 => "veslg", + VecShiftOp::LShR8x16 => "vesrlb", + VecShiftOp::LShR16x8 => "vesrlh", + VecShiftOp::LShR32x4 => "vesrlf", + VecShiftOp::LShR64x2 => "vesrlg", + VecShiftOp::AShR8x16 => "vesrab", + VecShiftOp::AShR16x8 => "vesrah", + VecShiftOp::AShR32x4 => "vesraf", + VecShiftOp::AShR64x2 => "vesrag", + }; + let rd = pretty_print_reg(rd.to_reg(), allocs); + let rn = pretty_print_reg(rn, allocs); + let shift_reg = if shift_reg != zero_reg() { + format!("({})", pretty_print_reg(shift_reg, allocs)) + } else { + "".to_string() + }; + format!("{} {}, {}, {}{}", op, rd, rn, shift_imm, shift_reg) + } &Inst::VecSelect { rd, rn, rm, ra } => { let rd = pretty_print_reg(rd.to_reg(), allocs); let rn = pretty_print_reg(rn, allocs); @@ -2014,6 +2336,409 @@ impl Inst { let ra = pretty_print_reg(ra, allocs); format!("vsel {}, {}, {}, {}", rd, rn, rm, ra) } + &Inst::VecPermute { rd, rn, rm, ra } => { + let rd = pretty_print_reg(rd.to_reg(), allocs); + let rn = pretty_print_reg(rn, allocs); + let rm = pretty_print_reg(rm, allocs); + let ra = pretty_print_reg(ra, allocs); + format!("vperm {}, {}, {}, {}", rd, rn, rm, ra) + } + &Inst::VecPermuteDWImm { + rd, + rn, + rm, + idx1, + idx2, + } => { + let rd = pretty_print_reg(rd.to_reg(), allocs); + let rn = pretty_print_reg(rn, allocs); + let rm = pretty_print_reg(rm, allocs); + let m4 = (idx1 & 1) * 4 
+ (idx2 & 1); + format!("vpdi {}, {}, {}, {}", rd, rn, rm, m4) + } + &Inst::VecIntCmp { op, rd, rn, rm } | &Inst::VecIntCmpS { op, rd, rn, rm } => { + let op = match op { + VecIntCmpOp::CmpEq8x16 => "vceqb", + VecIntCmpOp::CmpEq16x8 => "vceqh", + VecIntCmpOp::CmpEq32x4 => "vceqf", + VecIntCmpOp::CmpEq64x2 => "vceqg", + VecIntCmpOp::SCmpHi8x16 => "vchb", + VecIntCmpOp::SCmpHi16x8 => "vchh", + VecIntCmpOp::SCmpHi32x4 => "vchf", + VecIntCmpOp::SCmpHi64x2 => "vchg", + VecIntCmpOp::UCmpHi8x16 => "vchlb", + VecIntCmpOp::UCmpHi16x8 => "vchlh", + VecIntCmpOp::UCmpHi32x4 => "vchlf", + VecIntCmpOp::UCmpHi64x2 => "vchlg", + }; + let s = match self { + &Inst::VecIntCmp { .. } => "", + &Inst::VecIntCmpS { .. } => "s", + _ => unreachable!(), + }; + let rd = pretty_print_reg(rd.to_reg(), allocs); + let rn = pretty_print_reg(rn, allocs); + let rm = pretty_print_reg(rm, allocs); + format!("{}{} {}, {}, {}", op, s, rd, rn, rm) + } + &Inst::VecFloatCmp { op, rd, rn, rm } | &Inst::VecFloatCmpS { op, rd, rn, rm } => { + let op = match op { + VecFloatCmpOp::CmpEq32x4 => "vfcesb", + VecFloatCmpOp::CmpEq64x2 => "vfcedb", + VecFloatCmpOp::CmpHi32x4 => "vfchsb", + VecFloatCmpOp::CmpHi64x2 => "vfchdb", + VecFloatCmpOp::CmpHiEq32x4 => "vfchesb", + VecFloatCmpOp::CmpHiEq64x2 => "vfchedb", + }; + let s = match self { + &Inst::VecFloatCmp { .. } => "", + &Inst::VecFloatCmpS { .. } => "s", + _ => unreachable!(), + }; + let rd = pretty_print_reg(rd.to_reg(), allocs); + let rn = pretty_print_reg(rn, allocs); + let rm = pretty_print_reg(rm, allocs); + format!("{}{} {}, {}, {}", op, s, rd, rn, rm) + } + &Inst::VecLoad { rd, ref mem } | &Inst::VecLoadRev { rd, ref mem } => { + let opcode = match self { + &Inst::VecLoad { .. } => "vl", + &Inst::VecLoadRev { .. } => "vlbrq", + _ => unreachable!(), + }; + + let rd = pretty_print_reg(rd.to_reg(), allocs); + let mem = mem.with_allocs(allocs); + let (mem_str, mem) = mem_finalize_for_show(&mem, state, true, false, false, true); + let mem = mem.pretty_print_default(); + format!("{}{} {}, {}", mem_str, opcode, rd, mem) + } + &Inst::VecStore { rd, ref mem } | &Inst::VecStoreRev { rd, ref mem } => { + let opcode = match self { + &Inst::VecStore { .. } => "vst", + &Inst::VecStoreRev { .. } => "vstbrq", + _ => unreachable!(), + }; + + let rd = pretty_print_reg(rd, allocs); + let mem = mem.with_allocs(allocs); + let (mem_str, mem) = mem_finalize_for_show(&mem, state, true, false, false, true); + let mem = mem.pretty_print_default(); + format!("{}{} {}, {}", mem_str, opcode, rd, mem) + } + &Inst::VecLoadReplicate { size, rd, ref mem } + | &Inst::VecLoadReplicateRev { size, rd, ref mem } => { + let opcode = match (self, size) { + (&Inst::VecLoadReplicate { .. }, 8) => "vlrepb", + (&Inst::VecLoadReplicate { .. }, 16) => "vlreph", + (&Inst::VecLoadReplicate { .. }, 32) => "vlrepf", + (&Inst::VecLoadReplicate { .. }, 64) => "vlrepg", + (&Inst::VecLoadReplicateRev { .. }, 16) => "vlbrreph", + (&Inst::VecLoadReplicateRev { .. }, 32) => "vlbrrepf", + (&Inst::VecLoadReplicateRev { .. 
}, 64) => "vlbrrepg", + _ => unreachable!(), + }; + + let rd = pretty_print_reg(rd.to_reg(), allocs); + let mem = mem.with_allocs(allocs); + let (mem_str, mem) = mem_finalize_for_show(&mem, state, true, false, false, true); + let mem = mem.pretty_print_default(); + format!("{}{} {}, {}", mem_str, opcode, rd, mem) + } + &Inst::VecMov { rd, rn } => { + let rd = pretty_print_reg(rd.to_reg(), allocs); + let rn = pretty_print_reg(rn, allocs); + format!("vlr {}, {}", rd, rn) + } + &Inst::VecCMov { rd, cond, rm } => { + let rd = pretty_print_reg(rd.to_reg(), allocs); + let rm = pretty_print_reg(rm, allocs); + let cond = cond.invert().pretty_print_default(); + format!("j{} 10 ; vlr {}, {}", cond, rd, rm) + } + &Inst::MovToVec128 { rd, rn, rm } => { + let rd = pretty_print_reg(rd.to_reg(), allocs); + let rn = pretty_print_reg(rn, allocs); + let rm = pretty_print_reg(rm, allocs); + format!("vlvgp {}, {}, {}", rd, rn, rm) + } + &Inst::VecLoadConst { rd, const_data } => { + let rd = pretty_print_reg(rd.to_reg(), allocs); + let tmp = pretty_print_reg(writable_spilltmp_reg().to_reg(), &mut empty_allocs); + format!( + "bras {}, 20 ; data.u128 0x{:032x} ; vl {}, 0({})", + tmp, const_data, rd, tmp + ) + } + &Inst::VecLoadConstReplicate { + size, + rd, + const_data, + } => { + let rd = pretty_print_reg(rd.to_reg(), allocs); + let tmp = pretty_print_reg(writable_spilltmp_reg().to_reg(), &mut empty_allocs); + let (opcode, data) = match size { + 32 => ("vlrepf", format!("0x{:08x}", const_data as u32)), + 64 => ("vlrepg", format!("0x{:016x}", const_data)), + _ => unreachable!(), + }; + format!( + "bras {}, {} ; data.u{} {} ; {} {}, 0({})", + tmp, + 4 + size / 8, + size, + data, + opcode, + rd, + tmp + ) + } + &Inst::VecImmByteMask { rd, mask } => { + let rd = pretty_print_reg(rd.to_reg(), allocs); + format!("vgbm {}, {}", rd, mask) + } + &Inst::VecImmBitMask { + size, + rd, + start_bit, + end_bit, + } => { + let rd = pretty_print_reg(rd.to_reg(), allocs); + let op = match size { + 8 => "vgmb", + 16 => "vgmh", + 32 => "vgmf", + 64 => "vgmg", + _ => unreachable!(), + }; + format!("{} {}, {}, {}", op, rd, start_bit, end_bit) + } + &Inst::VecImmReplicate { size, rd, imm } => { + let rd = pretty_print_reg(rd.to_reg(), allocs); + let op = match size { + 8 => "vrepib", + 16 => "vrepih", + 32 => "vrepif", + 64 => "vrepig", + _ => unreachable!(), + }; + format!("{} {}, {}", op, rd, imm) + } + &Inst::VecLoadLane { + size, + rd, + ref mem, + lane_imm, + } + | &Inst::VecLoadLaneRev { + size, + rd, + ref mem, + lane_imm, + } + | &Inst::VecLoadLaneUndef { + size, + rd, + ref mem, + lane_imm, + } + | &Inst::VecLoadLaneRevUndef { + size, + rd, + ref mem, + lane_imm, + } => { + let (opcode_vrx, opcode_rx, opcode_rxy) = match (self, size) { + (&Inst::VecLoadLane { .. }, 8) => ("vleb", None, None), + (&Inst::VecLoadLane { .. }, 16) => ("vleh", None, None), + (&Inst::VecLoadLane { .. }, 32) => ("vlef", None, None), + (&Inst::VecLoadLane { .. }, 64) => ("vleg", None, None), + (&Inst::VecLoadLaneRev { .. }, 16) => ("vlebrh", None, None), + (&Inst::VecLoadLaneRev { .. }, 32) => ("vlebrf", None, None), + (&Inst::VecLoadLaneRev { .. }, 64) => ("vlebrg", None, None), + (&Inst::VecLoadLaneUndef { .. }, 8) => ("vleb", None, None), + (&Inst::VecLoadLaneUndef { .. }, 16) => ("vleh", None, None), + (&Inst::VecLoadLaneUndef { .. }, 32) => ("vlef", Some("le"), Some("ley")), + (&Inst::VecLoadLaneUndef { .. }, 64) => ("vleg", Some("ld"), Some("ldy")), + (&Inst::VecLoadLaneRevUndef { .. 
}, 16) => ("vlebrh", None, None), + (&Inst::VecLoadLaneRevUndef { .. }, 32) => ("vlebrf", None, None), + (&Inst::VecLoadLaneRevUndef { .. }, 64) => ("vlebrg", None, None), + _ => unreachable!(), + }; + + let (rd, rd_fpr) = pretty_print_fpr(rd.to_reg(), allocs); + let mem = mem.with_allocs(allocs); + if lane_imm == 0 && rd_fpr.is_some() && opcode_rx.is_some() { + let (mem_str, mem) = + mem_finalize_for_show(&mem, state, true, true, false, true); + let op = match &mem { + &MemArg::BXD12 { .. } => opcode_rx, + &MemArg::BXD20 { .. } => opcode_rxy, + _ => unreachable!(), + }; + let mem = mem.pretty_print_default(); + format!("{}{} {}, {}", mem_str, op.unwrap(), rd_fpr.unwrap(), mem) + } else { + let (mem_str, mem) = + mem_finalize_for_show(&mem, state, true, false, false, true); + let mem = mem.pretty_print_default(); + format!("{}{} {}, {}, {}", mem_str, opcode_vrx, rd, mem, lane_imm) + } + } + &Inst::VecStoreLane { + size, + rd, + ref mem, + lane_imm, + } + | &Inst::VecStoreLaneRev { + size, + rd, + ref mem, + lane_imm, + } => { + let (opcode_vrx, opcode_rx, opcode_rxy) = match (self, size) { + (&Inst::VecStoreLane { .. }, 8) => ("vsteb", None, None), + (&Inst::VecStoreLane { .. }, 16) => ("vsteh", None, None), + (&Inst::VecStoreLane { .. }, 32) => ("vstef", Some("ste"), Some("stey")), + (&Inst::VecStoreLane { .. }, 64) => ("vsteg", Some("std"), Some("stdy")), + (&Inst::VecStoreLaneRev { .. }, 16) => ("vstebrh", None, None), + (&Inst::VecStoreLaneRev { .. }, 32) => ("vstebrf", None, None), + (&Inst::VecStoreLaneRev { .. }, 64) => ("vstebrg", None, None), + _ => unreachable!(), + }; + + let (rd, rd_fpr) = pretty_print_fpr(rd, allocs); + let mem = mem.with_allocs(allocs); + if lane_imm == 0 && rd_fpr.is_some() && opcode_rx.is_some() { + let (mem_str, mem) = + mem_finalize_for_show(&mem, state, true, true, false, true); + let op = match &mem { + &MemArg::BXD12 { .. } => opcode_rx, + &MemArg::BXD20 { .. 
} => opcode_rxy, + _ => unreachable!(), + }; + let mem = mem.pretty_print_default(); + format!("{}{} {}, {}", mem_str, op.unwrap(), rd_fpr.unwrap(), mem) + } else { + let (mem_str, mem) = + mem_finalize_for_show(&mem, state, true, false, false, true); + let mem = mem.pretty_print_default(); + format!("{}{} {}, {}, {}", mem_str, opcode_vrx, rd, mem, lane_imm,) + } + } + &Inst::VecInsertLane { + size, + rd, + rn, + lane_imm, + lane_reg, + } => { + let op = match size { + 8 => "vlvgb", + 16 => "vlvgh", + 32 => "vlvgf", + 64 => "vlvgg", + _ => unreachable!(), + }; + let rd = pretty_print_reg(rd.to_reg(), allocs); + let rn = pretty_print_reg(rn, allocs); + let lane_reg = if lane_reg != zero_reg() { + format!("({})", pretty_print_reg(lane_reg, allocs)) + } else { + "".to_string() + }; + format!("{} {}, {}, {}{}", op, rd, rn, lane_imm, lane_reg) + } + &Inst::VecInsertLaneUndef { + size, + rd, + rn, + lane_imm, + lane_reg, + } => { + let (opcode_vrs, opcode_rre) = match size { + 8 => ("vlvgb", None), + 16 => ("vlvgh", None), + 32 => ("vlvgf", None), + 64 => ("vlvgg", Some("ldgr")), + _ => unreachable!(), + }; + let (rd, rd_fpr) = pretty_print_fpr(rd.to_reg(), allocs); + let rn = pretty_print_reg(rn, allocs); + let lane_reg = if lane_reg != zero_reg() { + format!("({})", pretty_print_reg(lane_reg, allocs)) + } else { + "".to_string() + }; + if opcode_rre.is_some() && lane_imm == 0 && lane_reg.is_empty() && rd_fpr.is_some() + { + format!("{} {}, {}", opcode_rre.unwrap(), rd_fpr.unwrap(), rn) + } else { + format!("{} {}, {}, {}{}", opcode_vrs, rd, rn, lane_imm, lane_reg) + } + } + &Inst::VecExtractLane { + size, + rd, + rn, + lane_imm, + lane_reg, + } => { + let (opcode_vrs, opcode_rre) = match size { + 8 => ("vlgvb", None), + 16 => ("vlgvh", None), + 32 => ("vlgvf", None), + 64 => ("vlgvg", Some("lgdr")), + _ => unreachable!(), + }; + let rd = pretty_print_reg(rd.to_reg(), allocs); + let (rn, rn_fpr) = pretty_print_fpr(rn, allocs); + let lane_reg = if lane_reg != zero_reg() { + format!("({})", pretty_print_reg(lane_reg, allocs)) + } else { + "".to_string() + }; + if opcode_rre.is_some() && lane_imm == 0 && lane_reg.is_empty() && rn_fpr.is_some() + { + format!("{} {}, {}", opcode_rre.unwrap(), rd, rn_fpr.unwrap()) + } else { + format!("{} {}, {}, {}{}", opcode_vrs, rd, rn, lane_imm, lane_reg) + } + } + &Inst::VecInsertLaneImm { + size, + rd, + imm, + lane_imm, + } => { + let op = match size { + 8 => "vleib", + 16 => "vleih", + 32 => "vleif", + 64 => "vleig", + _ => unreachable!(), + }; + let rd = pretty_print_reg(rd.to_reg(), allocs); + format!("{} {}, {}, {}", op, rd, imm, lane_imm) + } + &Inst::VecReplicateLane { + size, + rd, + rn, + lane_imm, + } => { + let op = match size { + 8 => "vrepb", + 16 => "vreph", + 32 => "vrepf", + 64 => "vrepg", + _ => unreachable!(), + }; + let rd = pretty_print_reg(rd.to_reg(), allocs); + let rn = pretty_print_reg(rn, allocs); + format!("{} {}, {}, {}", op, rd, rn, lane_imm) + } &Inst::Extend { rd, rn, diff --git a/cranelift/codegen/src/isa/s390x/lower.isle b/cranelift/codegen/src/isa/s390x/lower.isle index 5dfc2ec3eca7..0685d7e653f3 100644 --- a/cranelift/codegen/src/isa/s390x/lower.isle +++ b/cranelift/codegen/src/isa/s390x/lower.isle @@ -36,6 +36,12 @@ (imm $F64 x)) +;;;; Rules for `vconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type ty (vconst (u128_from_constant x)))) + (vec_imm ty x)) + + ;;;; Rules for `null` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type ty (null))) @@ -98,6 +104,39 @@ 
 (rule (lower (has_type (fits_in_64 ty) (iadd (sinkable_sload32 x) y)))
   (add_mem_sext32 ty y (sink_sload32 x)))
 
+;; Add two vector registers.
+(rule (lower (has_type (ty_vec128 ty) (iadd x y)))
+  (vec_add ty x y))
+
+
+;;;; Rules for `uadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Add (saturate unsigned) two vector registers.
+(rule (lower (has_type (ty_vec128 ty) (uadd_sat x y)))
+  (let ((sum Reg (vec_add ty x y)))
+    (vec_or ty sum (vec_cmphl ty x sum))))
+
+
+;;;; Rules for `sadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Add (saturate signed) two vector registers. $I64X2 not supported.
+(rule (lower (has_type (ty_vec128 ty) (sadd_sat x y)))
+  (vec_pack_ssat (vec_widen_type ty)
+    (vec_add (vec_widen_type ty) (vec_unpacks_high ty x)
+                                 (vec_unpacks_high ty y))
+    (vec_add (vec_widen_type ty) (vec_unpacks_low ty x)
+                                 (vec_unpacks_low ty y))))
+
+
+;;;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Lane-wise integer pairwise addition for 8-/16-/32-bit vector registers.
+(rule (lower (has_type ty @ (multi_lane bits _) (iadd_pairwise x y)))
+  (let ((size Reg (vec_imm_splat $I8X16 (u32_as_u64 bits))))
+    (vec_pack (vec_widen_type ty)
+      (vec_add ty y (vec_lshr_by_byte y size))
+      (vec_add ty x (vec_lshr_by_byte x size)))))
+
 
 ;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
@@ -129,6 +168,28 @@
 (rule (lower (has_type (fits_in_64 ty) (isub x (sinkable_sload32 y))))
   (sub_mem_sext32 ty x (sink_sload32 y)))
 
+;; Sub two vector registers.
+(rule (lower (has_type (ty_vec128 ty) (isub x y)))
+  (vec_sub ty x y))
+
+
+;;;; Rules for `usub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Subtract (saturate unsigned) two vector registers.
+(rule (lower (has_type (ty_vec128 ty) (usub_sat x y)))
+  (vec_and ty (vec_sub ty x y) (vec_cmphl ty x y)))
+
+
+;;;; Rules for `ssub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Subtract (saturate signed) two vector registers. $I64X2 not supported.
+(rule (lower (has_type (ty_vec128 ty) (ssub_sat x y)))
+  (vec_pack_ssat (vec_widen_type ty)
+    (vec_sub (vec_widen_type ty) (vec_unpacks_high ty x)
+                                 (vec_unpacks_high ty y))
+    (vec_sub (vec_widen_type ty) (vec_unpacks_low ty x)
+                                 (vec_unpacks_low ty y))))
+
 
 ;;;; Rules for `iabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
@@ -141,6 +202,10 @@
 (rule (lower (has_type (fits_in_64 ty) (iabs (sext32_value x))))
   (abs_reg_sext32 ty x))
 
+;; Absolute value of a vector register.
+(rule (lower (has_type (ty_vec128 ty) (iabs x)))
+  (vec_abs ty x))
+
 
 ;;;; Rules for `iadd_ifcout` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
@@ -193,6 +258,45 @@
 (rule (lower (has_type (fits_in_64 ty) (ineg (sext32_value x))))
   (neg_reg_sext32 ty x))
 
+;; Negate a vector register.
+(rule (lower (has_type (ty_vec128 ty) (ineg x)))
+  (vec_neg ty x))
+
+
+;;;; Rules for `umax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Unsigned maximum of two vector registers.
+(rule (lower (has_type (ty_vec128 ty) (umax x y)))
+  (vec_umax ty x y))
+
+
+;;;; Rules for `umin` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Unsigned minimum of two vector registers.
+(rule (lower (has_type (ty_vec128 ty) (umin x y)))
+  (vec_umin ty x y))
+
+
+;;;; Rules for `imax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Signed maximum of two vector registers.
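
The signed min/max rules continue below. As a cross-check on the unsigned saturating add/subtract lowerings above, here is a per-lane scalar sketch (illustrative only, with hypothetical function names, not code from this patch); it assumes wrapping lane arithmetic and that `vec_cmphl` produces an all-ones lane mask where its first operand is unsigned-greater than its second:

    // `uadd_sat`: OR in an all-ones mask exactly where the addition wrapped.
    fn uadd_sat_lane(x: u32, y: u32) -> u32 {
        let sum = x.wrapping_add(y);                      // vec_add
        let wrapped = if x > sum { u32::MAX } else { 0 }; // vec_cmphl x, sum
        sum | wrapped                                     // vec_or
    }

    // `usub_sat`: keep the difference only where no borrow occurred.
    fn usub_sat_lane(x: u32, y: u32) -> u32 {
        let diff = x.wrapping_sub(y);                     // vec_sub
        let kept = if x > y { u32::MAX } else { 0 };      // vec_cmphl x, y
        diff & kept                                       // vec_and
    }

The signed variants instead widen each half with `vec_unpacks_*`, do plain adds or subtracts in the wider type, and rely on `vec_pack_ssat` to saturate on the way back down.
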
+(rule (lower (has_type (ty_vec128 ty) (imax x y))) + (vec_smax ty x y)) + + +;;;; Rules for `imin` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Signed minimum of two vector registers. +(rule (lower (has_type (ty_vec128 ty) (imin x y))) + (vec_smin ty x y)) + + +;;;; Rules for `avg_round` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Unsigned average of two vector registers. +(rule (lower (has_type (ty_vec128 ty) (avg_round x y))) + (vec_uavg ty x y)) + ;;;; Rules for `imul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -238,6 +342,24 @@ (rule (lower (has_type (fits_in_64 ty) (imul (sinkable_sload32 x) y))) (mul_mem_sext32 ty y (sink_sload32 x))) +;; Multiply two vector registers, using a helper. +(decl vec_mul_impl (Type Reg Reg) Reg) +(rule (lower (has_type (ty_vec128 ty) (imul x y))) + (vec_mul_impl ty x y)) + +;; Multiply two vector registers - byte, halfword, and word. +(rule (vec_mul_impl $I8X16 x y) (vec_mul $I8X16 x y)) +(rule (vec_mul_impl $I16X8 x y) (vec_mul $I16X8 x y)) +(rule (vec_mul_impl $I32X4 x y) (vec_mul $I32X4 x y)) + +;; Multiply two vector registers - doubleword. Has to be scalarized. +(rule (vec_mul_impl $I64X2 x y) + (mov_to_vec128 $I64X2 + (mul_reg $I64 (vec_extract_lane $I64X2 x 0 (zero_reg)) + (vec_extract_lane $I64X2 y 0 (zero_reg))) + (mul_reg $I64 (vec_extract_lane $I64X2 x 1 (zero_reg)) + (vec_extract_lane $I64X2 y 1 (zero_reg))))) + ;;;; Rules for `umulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -260,6 +382,22 @@ (let ((pair RegPair (umul_wide x y))) (copy_reg $I64 (regpair_hi pair)))) +;; Multiply high part unsigned, vector types with 8-, 16-, or 32-bit elements. +(rule (lower (has_type $I8X16 (umulhi x y))) (vec_umulhi $I8X16 x y)) +(rule (lower (has_type $I16X8 (umulhi x y))) (vec_umulhi $I16X8 x y)) +(rule (lower (has_type $I32X4 (umulhi x y))) (vec_umulhi $I32X4 x y)) + +;; Multiply high part unsigned, vector types with 64-bit elements. +;; Has to be scalarized. +(rule (lower (has_type $I64X2 (umulhi x y))) + (let ((pair_0 RegPair (umul_wide (vec_extract_lane $I64X2 x 0 (zero_reg)) + (vec_extract_lane $I64X2 y 0 (zero_reg)))) + (res_0 Reg (copy_reg $I64 (regpair_hi pair_0))) + (pair_1 RegPair (umul_wide (vec_extract_lane $I64X2 x 1 (zero_reg)) + (vec_extract_lane $I64X2 y 1 (zero_reg)))) + (res_1 Reg (copy_reg $I64 (regpair_hi pair_1)))) + (mov_to_vec128 $I64X2 res_0 res_1))) + ;;;; Rules for `smulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -282,6 +420,55 @@ (let ((pair RegPair (smul_wide x y))) (copy_reg $I64 (regpair_hi pair)))) +;; Multiply high part signed, vector types with 8-, 16-, or 32-bit elements. +(rule (lower (has_type $I8X16 (smulhi x y))) (vec_smulhi $I8X16 x y)) +(rule (lower (has_type $I16X8 (smulhi x y))) (vec_smulhi $I16X8 x y)) +(rule (lower (has_type $I32X4 (smulhi x y))) (vec_smulhi $I32X4 x y)) + +;; Multiply high part unsigned, vector types with 64-bit elements. +;; Has to be scalarized. 
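
The rule below handles `smulhi` for $I64X2. Like the `umulhi` case above, each lane is extracted to a GPR with `vec_extract_lane`, multiplied with `smul_wide`/`umul_wide`, and the two high halves are reassembled with `mov_to_vec128`. A minimal scalar sketch of what one lane computes (illustrative only, using Rust 128-bit arithmetic in place of the wide-multiply instructions; function names are hypothetical):

    // High 64 bits of a 64x64 product, as taken from the high register of the
    // umul_wide / smul_wide register pair.
    fn umulhi64(a: u64, b: u64) -> u64 {
        ((a as u128 * b as u128) >> 64) as u64
    }
    fn smulhi64(a: i64, b: i64) -> i64 {
        ((a as i128 * b as i128) >> 64) as i64
    }
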
+(rule (lower (has_type $I64X2 (smulhi x y))) + (let ((pair_0 RegPair (smul_wide (vec_extract_lane $I64X2 x 0 (zero_reg)) + (vec_extract_lane $I64X2 y 0 (zero_reg)))) + (res_0 Reg (copy_reg $I64 (regpair_hi pair_0))) + (pair_1 RegPair (smul_wide (vec_extract_lane $I64X2 x 1 (zero_reg)) + (vec_extract_lane $I64X2 y 1 (zero_reg)))) + (res_1 Reg (copy_reg $I64 (regpair_hi pair_1)))) + (mov_to_vec128 $I64X2 res_0 res_1))) + + +;;;; Rules for `widening_pairwise_dot_product_s` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Widening pairwise dot product of two vector registers. +(rule (lower (has_type dst_ty (widening_pairwise_dot_product_s + x @ (value_type src_ty) y))) + (vec_add dst_ty (vec_smul_even src_ty x y) + (vec_smul_odd src_ty x y))) + + +;;;; Rules for `sqmul_round_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Fixed-point multiplication of two vector registers. +(rule (lower (has_type (ty_vec128 ty) (sqmul_round_sat x y))) + (vec_pack_ssat (vec_widen_type ty) + (sqmul_impl (vec_widen_type ty) + (vec_unpacks_high ty x) + (vec_unpacks_high ty y)) + (sqmul_impl (vec_widen_type ty) + (vec_unpacks_low ty x) + (vec_unpacks_low ty y)))) + +;; Helper to perform the rounded multiply in the wider type. +(decl sqmul_impl (Type Reg Reg) Reg) +(rule (sqmul_impl $I32X4 x y) + (vec_ashr_imm $I32X4 (vec_add $I32X4 (vec_mul_impl $I32X4 x y) + (vec_imm_bit_mask $I32X4 17 17)) + 15)) +(rule (sqmul_impl $I64X2 x y) + (vec_ashr_imm $I64X2 (vec_add $I64X2 (vec_mul_impl $I64X2 x y) + (vec_imm_bit_mask $I64X2 33 33)) + 31)) + ;;;; Rules for `udiv` and `urem` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -487,6 +674,15 @@ (let ((masked_amt u8 (mask_amt_imm ty y))) (lshl_imm ty x masked_amt))) +;; Vector shift left, shift amount in register. +(rule (lower (has_type (ty_vec128 ty) (ishl x y))) + (vec_lshl_reg ty x y)) + +;; Vector shift left, immediate shift amount. +(rule (lower (has_type (ty_vec128 ty) (ishl x (i64_from_value y)))) + (let ((masked_amt u8 (mask_amt_imm ty y))) + (vec_lshl_imm ty x masked_amt))) + ;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -504,6 +700,15 @@ (masked_amt u8 (mask_amt_imm ty y))) (lshr_imm (ty_ext32 ty) ext_reg masked_amt))) +;; Vector shift right logical, shift amount in register. +(rule (lower (has_type (ty_vec128 ty) (ushr x y))) + (vec_lshr_reg ty x y)) + +;; Vector shift right logical, immediate shift amount. +(rule (lower (has_type (ty_vec128 ty) (ushr x (i64_from_value y)))) + (let ((masked_amt u8 (mask_amt_imm ty y))) + (vec_lshr_imm ty x masked_amt))) + ;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -521,6 +726,15 @@ (masked_amt u8 (mask_amt_imm ty y))) (ashr_imm (ty_ext32 ty) ext_reg masked_amt))) +;; Vector shift right arithmetic, shift amount in register. +(rule (lower (has_type (ty_vec128 ty) (sshr x y))) + (vec_ashr_reg ty x y)) + +;; Vector shift right arithmetic, immediate shift amount. +(rule (lower (has_type (ty_vec128 ty) (sshr x (i64_from_value y)))) + (let ((masked_amt u8 (mask_amt_imm ty y))) + (vec_ashr_imm ty x masked_amt))) + ;;;; Rules for `rotl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -556,6 +770,15 @@ (or_reg ty (lshl_imm ext_ty ext_reg masked_pos_amt) (lshr_imm ext_ty ext_reg masked_neg_amt)))) +;; Vector rotate left, shift amount in register. +(rule (lower (has_type (ty_vec128 ty) (rotl x y))) + (vec_rot_reg ty x y)) + +;; Vector rotate left, immediate shift amount. 
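
The rotate-left-by-immediate rule continues below. A note on the fixed-point multiply above: `vec_imm_bit_mask $I32X4 17 17` sets bit 17 counting from the most significant bit of each 32-bit lane, i.e. the rounding constant 2^14, and the arithmetic shift by 15 plus the saturating pack complete the Q15 multiply for $I16X8 inputs. A scalar sketch of one i16 lane (illustrative only; the function name is hypothetical):

    // Q15 rounding, saturating multiply: (x * y + 2^14) >> 15, clamped to i16.
    fn sqmul_round_sat_i16(x: i16, y: i16) -> i16 {
        let wide = (x as i32) * (y as i32);       // vec_unpacks_* + vec_mul_impl
        let rounded = (wide + (1 << 14)) >> 15;   // add bit-mask 2^14, vec_ashr_imm 15
        rounded.clamp(i16::MIN as i32, i16::MAX as i32) as i16 // vec_pack_ssat
    }

The $I32X4 input case is the Q31 analogue: rounding constant 2^30 (bit 33 of the widened 64-bit lane) and a shift by 31.
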
+(rule (lower (has_type (ty_vec128 ty) (rotl x (i64_from_value y)))) + (let ((masked_amt u8 (mask_amt_imm ty y))) + (vec_rot_imm ty x masked_amt))) + ;;;; Rules for `rotr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -594,6 +817,19 @@ (or_reg ty (lshl_imm ext_ty ext_reg masked_neg_amt) (lshr_imm ext_ty ext_reg masked_pos_amt)))) +;; Vector rotate right, shift amount in register. +;; Implemented as rotate left with negated rotate amount. +(rule (lower (has_type (ty_vec128 ty) (rotr x y))) + (let ((negated_amt Reg (neg_reg $I32 y))) + (vec_rot_reg ty x negated_amt))) + +;; Vector rotate right, immediate shift amount. +;; Implemented as rotate left with negated rotate amount. +(rule (lower (has_type (ty_vec128 ty) (rotr x (i64_from_negated_value y)))) + (let ((negated_amt u8 (mask_amt_imm ty y))) + (vec_rot_imm ty x negated_amt))) + + ;;;; Rules for `ireduce` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Always a no-op. @@ -623,6 +859,49 @@ (put_in_reg_sext64 x)) +;;;; Rules for `snarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (snarrow x @ (value_type (ty_vec128 ty)) y)) + (vec_pack_ssat ty y x)) + + +;;;; Rules for `uunarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (uunarrow x @ (value_type (ty_vec128 ty)) y)) + (vec_pack_usat ty y x)) + + +;;;; Rules for `unarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (unarrow x @ (value_type (ty_vec128 ty)) y)) + (let ((zero Reg (vec_imm ty 0))) + (vec_pack_usat ty (vec_smax ty y zero) (vec_smax ty x zero)))) + + +;;;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (swiden_low x @ (value_type (ty_vec128 ty)))) + (vec_unpacks_low ty x)) + + +;;;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (swiden_high x @ (value_type (ty_vec128 ty)))) + (vec_unpacks_high ty x)) + + +;;;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (uwiden_low x @ (value_type (ty_vec128 ty)))) + (vec_unpacku_low ty x)) + + +;;;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (uwiden_high x @ (value_type (ty_vec128 ty)))) + (vec_unpacku_high ty x)) + + ;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; z15 version using a single instruction (NOR). @@ -634,6 +913,10 @@ (rule (lower (has_type (and (mie2_disabled) (fits_in_64 ty)) (bnot x))) (not_reg ty x)) +;; Vector version using vector NOR. +(rule (lower (has_type (ty_vec128 ty) (bnot x))) + (vec_not ty x)) + ;;;; Rules for `band` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -657,6 +940,9 @@ (rule (lower (has_type (fits_in_64 ty) (band (sinkable_load_32_64 x) y))) (and_mem ty y (sink_load x))) +;; And two vector registers. +(rule (lower (has_type (ty_vec128 ty) (band x y))) + (vec_and ty x y)) ;;;; Rules for `bor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -680,6 +966,10 @@ (rule (lower (has_type (fits_in_64 ty) (bor (sinkable_load_32_64 x) y))) (or_mem ty y (sink_load x))) +;; Or two vector registers. +(rule (lower (has_type (ty_vec128 ty) (bor x y))) + (vec_or ty x y)) + ;;;; Rules for `bxor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -699,6 +989,10 @@ (rule (lower (has_type (fits_in_64 ty) (bxor (sinkable_load_32_64 x) y))) (xor_mem ty y (sink_load x))) +;; Xor two vector registers. 
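
The vector `bxor` rule continues below. For the rotate-right rules above, a per-lane model of the negated-amount trick (illustrative sketch with a hypothetical function name; it assumes the hardware rotate amount is taken modulo the lane width, which the masked immediate form makes explicit):

    // rotr(x, n) is lowered as rotl(x, -n), e.g. verllf with a negated amount.
    fn rotr_lane32(x: u32, amt: u32) -> u32 {
        x.rotate_left(amt.wrapping_neg() & 31)
    }
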
+(rule (lower (has_type (ty_vec128 ty) (bxor x y))) + (vec_xor ty x y)) + ;;;; Rules for `band_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -710,6 +1004,10 @@ (rule (lower (has_type (and (mie2_disabled) (fits_in_64 ty)) (band_not x y))) (and_reg ty x (not_reg ty y))) +;; And-not two vector registers. +(rule (lower (has_type (ty_vec128 ty) (band_not x y))) + (vec_and_not ty x y)) + ;;;; Rules for `bor_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -721,6 +1019,10 @@ (rule (lower (has_type (and (mie2_disabled) (fits_in_64 ty)) (bor_not x y))) (or_reg ty x (not_reg ty y))) +;; Or-not two vector registers. +(rule (lower (has_type (ty_vec128 ty) (bor_not x y))) + (vec_or_not ty x y)) + ;;;; Rules for `bxor_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -732,6 +1034,10 @@ (rule (lower (has_type (and (mie2_disabled) (fits_in_64 ty)) (bxor_not x y))) (not_reg ty (xor_reg ty x y))) +;; Xor-not two vector registers. +(rule (lower (has_type (ty_vec128 ty) (bxor_not x y))) + (vec_not_xor ty x y)) + ;;;; Rules for `bitselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -749,6 +1055,17 @@ (if_false Reg (and_reg ty z (not_reg ty rx)))) (or_reg ty if_false if_true))) +;; Bitselect vector registers. +(rule (lower (has_type (ty_vec128 ty) (bitselect x y z))) + (vec_select ty y z x)) + + +;;;; Rules for `vselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Vector select. +(rule (lower (has_type (ty_vec128 ty) (vselect x y z))) + (vec_select ty y z x)) + ;;;; Rules for `breduce` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -917,6 +1234,10 @@ (cnt1 Reg (add_reg $I64 cnt2 (lshl_imm $I64 cnt2 8)))) (lshr_imm $I64 cnt1 56))) +;; Population count for vector types. +(rule (lower (has_type (ty_vec128 ty) (popcnt x))) + (vec_popcnt ty x)) + ;;;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -960,6 +1281,20 @@ (fmax_reg ty x y)) +;;;; Rules for `fmin_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Minimum of two registers. +(rule (lower (has_type ty (fmin_pseudo x y))) + (fmin_pseudo_reg ty x y)) + + +;;;; Rules for `fmax_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Maximum of two registers. +(rule (lower (has_type ty (fmax_pseudo x y))) + (fmax_pseudo_reg ty x y)) + + ;;;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Copysign of two registers. @@ -967,6 +1302,10 @@ (vec_select $F32 x y (imm $F32 2147483647))) (rule (lower (has_type $F64 (fcopysign x y))) (vec_select $F64 x y (imm $F64 9223372036854775807))) +(rule (lower (has_type $F32X4 (fcopysign x y))) + (vec_select $F32X4 x y (vec_imm_bit_mask $F32X4 1 31))) +(rule (lower (has_type $F64X2 (fcopysign x y))) + (vec_select $F64X2 x y (vec_imm_bit_mask $F64X2 1 63))) ;;;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1028,35 +1367,73 @@ ;;;; Rules for `fpromote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Promote a register. -(rule (lower (has_type dst_ty (fpromote x @ (value_type src_ty)))) +(rule (lower (has_type (fits_in_64 dst_ty) (fpromote x @ (value_type src_ty)))) (fpromote_reg dst_ty src_ty x)) +;;;; Rules for `fvpromote_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Promote a register. 
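
The `fvpromote_low` rule continues below. The vector `fcopysign` rules above build a magnitude mask with `vec_imm_bit_mask` (bit 0 is the most significant bit, so bits 1..31 of an $F32X4 lane are everything except the sign) and then select the sign from the second operand. A scalar sketch (illustrative, hypothetical function name, assuming the usual VSEL-style `(a & mask) | (b & !mask)` select):

    // fcopysign per f32 lane: magnitude from x, sign from y.
    fn fcopysign_lane(x: f32, y: f32) -> f32 {
        const MAG: u32 = 0x7fff_ffff; // vec_imm_bit_mask $F32X4 1 31
        f32::from_bits((x.to_bits() & MAG) | (y.to_bits() & !MAG))
    }
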
+(rule (lower (has_type $F64X2 (fvpromote_low x @ (value_type $F32X4)))) + (fpromote_reg $F64X2 $F32X4 (vec_merge_low $I32X4 x x))) + + ;;;; Rules for `fdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Demote a register. -(rule (lower (has_type dst_ty (fdemote x @ (value_type src_ty)))) +(rule (lower (has_type (fits_in_64 dst_ty) (fdemote x @ (value_type src_ty)))) (fdemote_reg dst_ty src_ty (FpuRoundMode.Current) x)) +;;;; Rules for `fvdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Demote a register. +(rule (lower (has_type $F32X4 (fvdemote x @ (value_type $F64X2)))) + (let ((dst Reg (fdemote_reg $F32X4 $F64X2 (FpuRoundMode.Current) x))) + (vec_permute $F32X4 dst (vec_imm $F32X4 0) + (vec_imm $I8X16 (imm8x16 16 16 16 16 16 16 16 16 + 0 1 2 3 8 9 10 11))))) + + ;;;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Convert a 32-bit or smaller unsigned integer to $F32 (z15 instruction). (rule (lower (has_type $F32 (fcvt_from_uint x @ (value_type (and (vxrs_ext2_enabled) (fits_in_32 ty)))))) (fcvt_from_uint_reg $F32 (FpuRoundMode.ToNearestTiesToEven) - (mov_to_fpr32 (put_in_reg_zext32 x)))) + (put_in_reg_zext32 x))) ;; Convert a 64-bit or smaller unsigned integer to $F32, via an intermediate $F64. (rule (lower (has_type $F32 (fcvt_from_uint x @ (value_type (fits_in_64 ty))))) (fdemote_reg $F32 $F64 (FpuRoundMode.ToNearestTiesToEven) (fcvt_from_uint_reg $F64 (FpuRoundMode.ShorterPrecision) - (mov_to_fpr64 (put_in_reg_zext64 x))))) + (put_in_reg_zext64 x)))) ;; Convert a 64-bit or smaller unsigned integer to $F64. (rule (lower (has_type $F64 (fcvt_from_uint x @ (value_type (fits_in_64 ty))))) (fcvt_from_uint_reg $F64 (FpuRoundMode.ToNearestTiesToEven) - (mov_to_fpr64 (put_in_reg_zext64 x)))) + (put_in_reg_zext64 x))) + +;; Convert $I32X4 to $F32X4 (z15 instruction). +(rule (lower (has_type (and (vxrs_ext2_enabled) $F32X4) + (fcvt_from_uint x @ (value_type $I32X4)))) + (fcvt_from_uint_reg $F32X4 (FpuRoundMode.ToNearestTiesToEven) x)) + +;; Convert $I32X4 to $F32X4 (via two $F64X2 on z14). +(rule (lower (has_type (and (vxrs_ext2_disabled) $F32X4) + (fcvt_from_uint x @ (value_type $I32X4)))) + (vec_permute $F32X4 + (fdemote_reg $F32X4 $F64X2 (FpuRoundMode.ToNearestTiesToEven) + (fcvt_from_uint_reg $F64X2 (FpuRoundMode.ShorterPrecision) + (vec_unpacku_high $I32X4 x))) + (fdemote_reg $F32X4 $F64X2 (FpuRoundMode.ToNearestTiesToEven) + (fcvt_from_uint_reg $F64X2 (FpuRoundMode.ShorterPrecision) + (vec_unpacku_low $I32X4 x))) + (vec_imm $I8X16 (imm8x16 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27)))) + +;; Convert $I64X2 to $F64X2. +(rule (lower (has_type $F64X2 (fcvt_from_uint x @ (value_type $I64X2)))) + (fcvt_from_uint_reg $F64X2 (FpuRoundMode.ToNearestTiesToEven) x)) ;;;; Rules for `fcvt_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1065,25 +1442,55 @@ (rule (lower (has_type $F32 (fcvt_from_sint x @ (value_type (and (vxrs_ext2_enabled) (fits_in_32 ty)))))) (fcvt_from_sint_reg $F32 (FpuRoundMode.ToNearestTiesToEven) - (mov_to_fpr32 (put_in_reg_sext32 x)))) + (put_in_reg_sext32 x))) ;; Convert a 64-bit or smaller signed integer to $F32, via an intermediate $F64. (rule (lower (has_type $F32 (fcvt_from_sint x @ (value_type (fits_in_64 ty))))) (fdemote_reg $F32 $F64 (FpuRoundMode.ToNearestTiesToEven) (fcvt_from_sint_reg $F64 (FpuRoundMode.ShorterPrecision) - (mov_to_fpr64 (put_in_reg_sext64 x))))) + (put_in_reg_sext64 x)))) ;; Convert a 64-bit or smaller signed integer to $F64. 
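
The scalar sint-to-$F64 rule continues below. Several rules above (`fvdemote`, the z14 $I32X4-to-$F32X4 paths) shuffle partial results back together with `vec_permute`; a byte-level model of the permute they rely on (illustrative sketch; it assumes VPERM-style semantics where each selector byte indexes the 32-byte concatenation of the two sources, most significant byte first, using only its low five bits):

    fn vec_permute(a: [u8; 16], b: [u8; 16], sel: [u8; 16]) -> [u8; 16] {
        let mut out = [0u8; 16];
        for i in 0..16 {
            let idx = (sel[i] & 0x1f) as usize;
            out[i] = if idx < 16 { a[idx] } else { b[idx - 16] };
        }
        out
    }

Under that reading, the pattern `0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27` used above picks word elements 0 and 2 from each of the two source registers, packing the four converted 32-bit values into one result.
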
(rule (lower (has_type $F64 (fcvt_from_sint x @ (value_type (fits_in_64 ty))))) (fcvt_from_sint_reg $F64 (FpuRoundMode.ToNearestTiesToEven) - (mov_to_fpr64 (put_in_reg_sext64 x)))) + (put_in_reg_sext64 x))) + +;; Convert $I32X4 to $F32X4 (z15 instruction). +(rule (lower (has_type (and (vxrs_ext2_enabled) $F32X4) + (fcvt_from_sint x @ (value_type $I32X4)))) + (fcvt_from_sint_reg $F32X4 (FpuRoundMode.ToNearestTiesToEven) x)) + +;; Convert $I32X4 to $F32X4 (via two $F64X2 on z14). +(rule (lower (has_type (and (vxrs_ext2_disabled) $F32X4) + (fcvt_from_sint x @ (value_type $I32X4)))) + (vec_permute $F32X4 + (fdemote_reg $F32X4 $F64X2 (FpuRoundMode.ToNearestTiesToEven) + (fcvt_from_sint_reg $F64X2 (FpuRoundMode.ShorterPrecision) + (vec_unpacks_high $I32X4 x))) + (fdemote_reg $F32X4 $F64X2 (FpuRoundMode.ToNearestTiesToEven) + (fcvt_from_sint_reg $F64X2 (FpuRoundMode.ShorterPrecision) + (vec_unpacks_low $I32X4 x))) + (vec_imm $I8X16 (imm8x16 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27)))) + +;; Convert $I64X2 to $F64X2. +(rule (lower (has_type $F64X2 (fcvt_from_sint x @ (value_type $I64X2)))) + (fcvt_from_sint_reg $F64X2 (FpuRoundMode.ToNearestTiesToEven) x)) + + +;;;; Rules for `fcvt_low_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Convert the low half of a $I32X4 to a $F64X2. +(rule (lower (has_type $F64X2 (fcvt_low_from_sint x @ (value_type $I32X4)))) + (fcvt_from_sint_reg $F64X2 (FpuRoundMode.ToNearestTiesToEven) + (vec_unpacks_low $I32X4 x))) ;;;; Rules for `fcvt_to_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; Convert a floating-point value in a register to an unsigned integer value. +;; Convert a scalar floating-point value in a register to an unsigned integer. ;; Traps if the input cannot be represented in the output type. -(rule (lower (has_type dst_ty (fcvt_to_uint x @ (value_type src_ty)))) +(rule (lower (has_type (fits_in_64 dst_ty) + (fcvt_to_uint x @ (value_type src_ty)))) (let ((src Reg (put_in_reg x)) ;; First, check whether the input is a NaN, and trap if so. (_1 Reg (trap_if (fcmp_reg src_ty src src) @@ -1104,9 +1511,10 @@ ;;;; Rules for `fcvt_to_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; Convert a floating-point value in a register to a signed integer value. +;; Convert a scalar floating-point value in a register to a signed integer. ;; Traps if the input cannot be represented in the output type. -(rule (lower (has_type dst_ty (fcvt_to_sint x @ (value_type src_ty)))) +(rule (lower (has_type (fits_in_64 dst_ty) + (fcvt_to_sint x @ (value_type src_ty)))) (let ((src Reg (put_in_reg x)) ;; First, check whether the input is a NaN, and trap if so. (_1 Reg (trap_if (fcmp_reg src_ty src src) @@ -1128,8 +1536,9 @@ ;;;; Rules for `fcvt_to_uint_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; Convert a floating-point value in a register to an unsigned integer value. -(rule (lower (has_type dst_ty (fcvt_to_uint_sat x @ (value_type src_ty)))) +;; Convert a scalar floating-point value in a register to an unsigned integer. +(rule (lower (has_type (fits_in_64 dst_ty) + (fcvt_to_uint_sat x @ (value_type src_ty)))) (let ((src Reg (put_in_reg x)) ;; Perform the conversion using the larger type size. (flt_ty Type (fcvt_flt_ty dst_ty src_ty)) @@ -1139,11 +1548,30 @@ ;; Clamp the output to the destination type bounds. (uint_sat_reg dst_ty int_ty dst))) +;; Convert $F32X4 to $I32X4 (z15 instruction). 
+(rule (lower (has_type (and (vxrs_ext2_enabled) $I32X4) + (fcvt_to_uint_sat x @ (value_type $F32X4)))) + (fcvt_to_uint_reg $F32X4 (FpuRoundMode.ToZero) x)) + +;; Convert $F32X4 to $I32X4 (via two $F64X2 on z14). +(rule (lower (has_type (and (vxrs_ext2_disabled) $I32X4) + (fcvt_to_uint_sat x @ (value_type $F32X4)))) + (vec_pack_usat $I64X2 + (fcvt_to_uint_reg $F64X2 (FpuRoundMode.ToZero) + (fpromote_reg $F64X2 $F32X4 (vec_merge_high $I32X4 x x))) + (fcvt_to_uint_reg $F64X2 (FpuRoundMode.ToZero) + (fpromote_reg $F64X2 $F32X4 (vec_merge_low $I32X4 x x))))) + +;; Convert $F64X2 to $I64X2. +(rule (lower (has_type $I64X2 (fcvt_to_uint_sat x @ (value_type $F64X2)))) + (fcvt_to_uint_reg $F64X2 (FpuRoundMode.ToZero) x)) + ;;;; Rules for `fcvt_to_sint_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; Convert a floating-point value in a register to a signed integer value. -(rule (lower (has_type dst_ty (fcvt_to_sint_sat x @ (value_type src_ty)))) +;; Convert a scalar floating-point value in a register to a signed integer. +(rule (lower (has_type (fits_in_64 dst_ty) + (fcvt_to_sint_sat x @ (value_type src_ty)))) (let ((src Reg (put_in_reg x)) ;; Perform the conversion using the larger type size. (flt_ty Type (fcvt_flt_ty dst_ty src_ty)) @@ -1160,24 +1588,578 @@ ;; Clamp the output to the destination type bounds. (sint_sat_reg dst_ty int_ty sat))) +;; Convert $F32X4 to $I32X4 (z15 instruction). +(rule (lower (has_type (and (vxrs_ext2_enabled) $I32X4) + (fcvt_to_sint_sat src @ (value_type $F32X4)))) + ;; See above for why we need to handle NaNs specially. + (vec_select $I32X4 + (fcvt_to_sint_reg $F32X4 (FpuRoundMode.ToZero) src) + (vec_imm $I32X4 0) (vec_fcmpeq $F32X4 src src))) + +;; Convert $F32X4 to $I32X4 (via two $F64X2 on z14). +(rule (lower (has_type (and (vxrs_ext2_disabled) $I32X4) + (fcvt_to_sint_sat src @ (value_type $F32X4)))) + ;; See above for why we need to handle NaNs specially. + (vec_select $I32X4 + (vec_pack_ssat $I64X2 + (fcvt_to_sint_reg $F64X2 (FpuRoundMode.ToZero) + (fpromote_reg $F64X2 $F32X4 (vec_merge_high $I32X4 src src))) + (fcvt_to_sint_reg $F64X2 (FpuRoundMode.ToZero) + (fpromote_reg $F64X2 $F32X4 (vec_merge_low $I32X4 src src)))) + (vec_imm $I32X4 0) (vec_fcmpeq $F32X4 src src))) + +;; Convert $F64X2 to $I64X2. +(rule (lower (has_type $I64X2 (fcvt_to_sint_sat src @ (value_type $F64X2)))) + ;; See above for why we need to handle NaNs specially. + (vec_select $I64X2 + (fcvt_to_sint_reg $F64X2 (FpuRoundMode.ToZero) src) + (vec_imm $I64X2 0) (vec_fcmpeq $F64X2 src src))) + ;;;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Reinterpret a 64-bit integer value as floating-point. (rule (lower (has_type $F64 (bitcast x @ (value_type $I64)))) - (mov_to_fpr64 x)) + (vec_insert_lane_undef $F64X2 x 0 (zero_reg))) ;; Reinterpret a 64-bit floating-point value as integer. (rule (lower (has_type $I64 (bitcast x @ (value_type $F64)))) - (mov_from_fpr64 x)) + (vec_extract_lane $F64X2 x 0 (zero_reg))) -;; Reinterpret a 32-bit integer value as floating-point (via $I64). +;; Reinterpret a 32-bit integer value as floating-point. (rule (lower (has_type $F32 (bitcast x @ (value_type $I32)))) - (mov_to_fpr32 x)) + (vec_insert_lane_undef $F32X4 x 0 (zero_reg))) -;; Reinterpret a 32-bit floating-point value as integer (via $I64). +;; Reinterpret a 32-bit floating-point value as integer. 
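
The $F32-to-$I32 bitcast rule continues below. In the saturating float-to-signed-integer rules above, the self-compare (`vec_fcmpeq src src`) is all-ones exactly where a lane is not NaN, so the `vec_select` keeps the converted value there and forces NaN lanes to zero. A per-lane sketch of the end-to-end result (illustrative only, hypothetical function name; it models the combined convert-plus-select, not any single instruction):

    fn fcvt_to_sint_sat_lane(x: f64) -> i64 {
        if x != x {
            0                            // NaN lanes are zeroed by the select
        } else if x >= i64::MAX as f64 {
            i64::MAX                     // out-of-range values saturate
        } else if x <= i64::MIN as f64 {
            i64::MIN
        } else {
            x as i64                     // FpuRoundMode.ToZero: truncate
        }
    }
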
(rule (lower (has_type $I32 (bitcast x @ (value_type $F32)))) - (mov_from_fpr32 x)) + (vec_extract_lane $F32X4 x 0 (zero_reg))) + + +;;;; Rules for `raw_bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Raw bitcast is always a no-op. +(rule (lower (raw_bitcast x)) x) + + +;;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Insert vector lane from general-purpose register. +(rule (lower (insertlane x @ (value_type ty) + y @ (value_type (ty_int_bool_ref_scalar_64 _)) + (u8_from_uimm8 idx))) + (vec_insert_lane ty x y (be_lane_idx ty idx) (zero_reg))) + +;; Insert vector lane from floating-point register. +(rule (lower (insertlane x @ (value_type ty) + y @ (value_type (ty_scalar_float _)) + (u8_from_uimm8 idx))) + (vec_move_lane_and_insert ty x (be_lane_idx ty idx) y 0)) + +;; Insert vector lane from another vector lane. +(rule (lower (insertlane x @ (value_type ty) + (extractlane y (u8_from_uimm8 src_idx)) + (u8_from_uimm8 dst_idx))) + (vec_move_lane_and_insert ty x (be_lane_idx ty dst_idx) + y (be_lane_idx ty src_idx))) + +;; Insert vector lane from signed 16-bit immediate. +(rule (lower (insertlane x @ (value_type ty) (i16_from_value y) + (u8_from_uimm8 idx))) + (vec_insert_lane_imm ty x y (be_lane_idx ty idx))) + +;; Insert vector lane from big-endian memory. +(rule (lower (insertlane x @ (value_type ty) (sinkable_load y) + (u8_from_uimm8 idx))) + (vec_load_lane ty x (sink_load y) (be_lane_idx ty idx))) + +;; Insert vector lane from little-endian memory. +(rule (lower (insertlane x @ (value_type ty) (sinkable_load_little y) + (u8_from_uimm8 idx))) + (vec_load_lane_little ty x (sink_load y) (be_lane_idx ty idx))) + + +;; Helper to extract one lane from a vector and insert it into another. +(decl vec_move_lane_and_insert (Type Reg u8 Reg u8) Reg) + +;; For 64-bit elements we always use VPDI. +(rule (vec_move_lane_and_insert ty @ (multi_lane 64 _) dst 0 src src_idx) + (vec_permute_dw_imm ty src src_idx dst 1)) +(rule (vec_move_lane_and_insert ty @ (multi_lane 64 _) dst 1 src src_idx) + (vec_permute_dw_imm ty dst 0 src src_idx)) + +;; If source and destination index are the same, use vec_select. +(rule (vec_move_lane_and_insert ty dst idx src idx) + (vec_select ty src + dst (vec_imm_byte_mask ty (lane_byte_mask ty idx)))) + +;; Otherwise replicate source first and then use vec_select. +(rule (vec_move_lane_and_insert ty dst dst_idx src src_idx) + (vec_select ty (vec_replicate_lane ty src src_idx) + dst (vec_imm_byte_mask ty (lane_byte_mask ty dst_idx)))) + + +;; Helper to implement a generic little-endian variant of vec_load_lane. +(decl vec_load_lane_little (Type Reg MemArg u8) Reg) + +;; 8-byte little-endian loads can be performed via a normal load. +(rule (vec_load_lane_little ty @ (multi_lane 8 _) dst addr lane_imm) + (vec_load_lane ty dst addr lane_imm)) + +;; On z15, we have instructions to perform little-endian loads. +(rule (vec_load_lane_little (and (vxrs_ext2_enabled) + ty @ (multi_lane 16 _)) dst addr lane_imm) + (vec_load_lane_rev ty dst addr lane_imm)) +(rule (vec_load_lane_little (and (vxrs_ext2_enabled) + ty @ (multi_lane 32 _)) dst addr lane_imm) + (vec_load_lane_rev ty dst addr lane_imm)) +(rule (vec_load_lane_little (and (vxrs_ext2_enabled) + ty @ (multi_lane 64 _)) dst addr lane_imm) + (vec_load_lane_rev ty dst addr lane_imm)) + +;; On z14, use a little-endian load to GPR followed by vec_insert_lane. 
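
The z14 fallback rules continue below. The lane rules above lean on two helpers that this patch defines elsewhere: `be_lane_idx`, which maps a CLIF (little-endian) lane number to the machine's big-endian element number, and `lane_byte_mask`, which builds the VGBM byte mask covering one lane. The real helpers take a `Type`; the hypothetical mirrors below, written against explicit lane counts of 1-, 2-, 4-, or 8-byte lanes, only illustrate the assumed numbering (CLIF lane 0 least significant; hardware element 0 and mask bit 0 leftmost):

    // Hypothetical mirrors of the helpers, for illustration only.
    fn be_lane_idx(lane_count: u8, clif_idx: u8) -> u8 {
        lane_count - 1 - clif_idx
    }

    fn lane_byte_mask(lane_bytes: u8, lane_count: u8, be_idx: u8) -> u16 {
        let ones = (1u16 << lane_bytes) - 1;  // one mask bit per byte of the lane
        let le_idx = lane_count - 1 - be_idx; // back to a little-endian position
        ones << (le_idx * lane_bytes)         // VGBM bit 0 is the leftmost byte
    }
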
+(rule (vec_load_lane_little (and (vxrs_ext2_disabled) + ty @ (multi_lane 16 _)) dst addr lane_imm) + (vec_insert_lane ty dst (loadrev16 addr) lane_imm (zero_reg))) +(rule (vec_load_lane_little (and (vxrs_ext2_disabled) + ty @ (multi_lane 32 _)) dst addr lane_imm) + (vec_insert_lane ty dst (loadrev32 addr) lane_imm (zero_reg))) +(rule (vec_load_lane_little (and (vxrs_ext2_disabled) + ty @ (multi_lane 64 _)) dst addr lane_imm) + (vec_insert_lane ty dst (loadrev64 addr) lane_imm (zero_reg))) + +;; Helper to implement a generic little-endian variant of vec_load_lane_undef. +(decl vec_load_lane_little_undef (Type MemArg u8) Reg) + +;; 8-byte little-endian loads can be performed via a normal load. +(rule (vec_load_lane_little_undef ty @ (multi_lane 8 _) addr lane_imm) + (vec_load_lane_undef ty addr lane_imm)) + +;; On z15, we have instructions to perform little-endian loads. +(rule (vec_load_lane_little_undef (and (vxrs_ext2_enabled) + ty @ (multi_lane 16 _)) addr lane_imm) + (vec_load_lane_rev_undef ty addr lane_imm)) +(rule (vec_load_lane_little_undef (and (vxrs_ext2_enabled) + ty @ (multi_lane 32 _)) addr lane_imm) + (vec_load_lane_rev_undef ty addr lane_imm)) +(rule (vec_load_lane_little_undef (and (vxrs_ext2_enabled) + ty @ (multi_lane 64 _)) addr lane_imm) + (vec_load_lane_rev_undef ty addr lane_imm)) + +;; On z14, use a little-endian load to GPR followed by vec_insert_lane_undef. +(rule (vec_load_lane_little_undef (and (vxrs_ext2_disabled) + ty @ (multi_lane 16 _)) addr lane_imm) + (vec_insert_lane_undef ty (loadrev16 addr) lane_imm (zero_reg))) +(rule (vec_load_lane_little_undef (and (vxrs_ext2_disabled) + ty @ (multi_lane 32 _)) addr lane_imm) + (vec_insert_lane_undef ty (loadrev32 addr) lane_imm (zero_reg))) +(rule (vec_load_lane_little_undef (and (vxrs_ext2_disabled) + ty @ (multi_lane 64 _)) addr lane_imm) + (vec_insert_lane_undef ty (loadrev64 addr) lane_imm (zero_reg))) + + +;;;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Extract vector lane to general-purpose register. +(rule (lower (has_type (ty_int_bool_ref_scalar_64 _) + (extractlane x @ (value_type ty) (u8_from_uimm8 idx)))) + (vec_extract_lane ty x (be_lane_idx ty idx) (zero_reg))) + +;; Extract vector lane to floating-point register. +(rule (lower (has_type (ty_scalar_float _) + (extractlane x @ (value_type ty) (u8_from_uimm8 idx)))) + (vec_replicate_lane ty x (be_lane_idx ty idx))) + +;; Extract vector lane and store to big-endian memory. +(rule (lower (store flags @ (bigendian) + (extractlane x @ (value_type ty) (u8_from_uimm8 idx)) + addr offset)) + (side_effect (vec_store_lane ty x + (lower_address flags addr offset) (be_lane_idx ty idx)))) + +;; Extract vector lane and store to little-endian memory. +(rule (lower (store flags @ (littleendian) + (extractlane x @ (value_type ty) (u8_from_uimm8 idx)) + addr offset)) + (side_effect (vec_store_lane_little ty x + (lower_address flags addr offset) (be_lane_idx ty idx)))) + + +;; Helper to implement a generic little-endian variant of vec_store_lane. +(decl vec_store_lane_little (Type Reg MemArg u8) SideEffectNoResult) + +;; 8-byte little-endian stores can be performed via a normal store. +(rule (vec_store_lane_little ty @ (multi_lane 8 _) src addr lane_imm) + (vec_store_lane ty src addr lane_imm)) + +;; On z15, we have instructions to perform little-endian stores. 
+(rule (vec_store_lane_little (and (vxrs_ext2_enabled) + ty @ (multi_lane 16 _)) src addr lane_imm) + (vec_store_lane_rev ty src addr lane_imm)) +(rule (vec_store_lane_little (and (vxrs_ext2_enabled) + ty @ (multi_lane 32 _)) src addr lane_imm) + (vec_store_lane_rev ty src addr lane_imm)) +(rule (vec_store_lane_little (and (vxrs_ext2_enabled) + ty @ (multi_lane 64 _)) src addr lane_imm) + (vec_store_lane_rev ty src addr lane_imm)) + +;; On z14, use vec_extract_lane followed by a little-endian store from GPR. +(rule (vec_store_lane_little (and (vxrs_ext2_disabled) + ty @ (multi_lane 16 _)) src addr lane_imm) + (storerev16 (vec_extract_lane ty src lane_imm (zero_reg)) addr)) +(rule (vec_store_lane_little (and (vxrs_ext2_disabled) + ty @ (multi_lane 32 _)) src addr lane_imm) + (storerev32 (vec_extract_lane ty src lane_imm (zero_reg)) addr)) +(rule (vec_store_lane_little (and (vxrs_ext2_disabled) + ty @ (multi_lane 64 _)) src addr lane_imm) + (storerev64 (vec_extract_lane ty src lane_imm (zero_reg)) addr)) + + +;;;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Load replicated value from general-purpose register. +(rule (lower (has_type ty (splat + x @ (value_type (ty_int_bool_ref_scalar_64 _))))) + (vec_replicate_lane ty (vec_insert_lane_undef ty x 0 (zero_reg)) 0)) + +;; Load replicated value from floating-point register. +(rule (lower (has_type ty (splat + x @ (value_type (ty_scalar_float _))))) + (vec_replicate_lane ty x 0)) + +;; Load replicated value from vector lane. +(rule (lower (has_type ty (splat (extractlane x (u8_from_uimm8 idx))))) + (vec_replicate_lane ty x (be_lane_idx ty idx))) + +;; Load replicated 16-bit immediate value. +(rule (lower (has_type ty (splat (i16_from_value x)))) + (vec_imm_replicate ty x)) + +;; Load replicated value from big-endian memory. +(rule (lower (has_type ty (splat (sinkable_load x)))) + (vec_load_replicate ty (sink_load x))) + +;; Load replicated value from little-endian memory. +(rule (lower (has_type ty (splat (sinkable_load_little x)))) + (vec_load_replicate_little ty (sink_load x))) + + +;; Helper to implement a generic little-endian variant of vec_load_replicate +(decl vec_load_replicate_little (Type MemArg) Reg) + +;; 8-byte little-endian loads can be performed via a normal load. +(rule (vec_load_replicate_little ty @ (multi_lane 8 _) addr) + (vec_load_replicate ty addr)) + +;; On z15, we have instructions to perform little-endian loads. +(rule (vec_load_replicate_little (and (vxrs_ext2_enabled) + ty @ (multi_lane 16 _)) addr) + (vec_load_replicate_rev ty addr)) +(rule (vec_load_replicate_little (and (vxrs_ext2_enabled) + ty @ (multi_lane 32 _)) addr) + (vec_load_replicate_rev ty addr)) +(rule (vec_load_replicate_little (and (vxrs_ext2_enabled) + ty @ (multi_lane 64 _)) addr) + (vec_load_replicate_rev ty addr)) + +;; On z14, use a little-endian load (via GPR) and replicate. +(rule (vec_load_replicate_little (and (vxrs_ext2_disabled) + ty @ (multi_lane 16 _)) addr) + (vec_replicate_lane ty (vec_load_lane_little_undef ty addr 0) 0)) +(rule (vec_load_replicate_little (and (vxrs_ext2_disabled) + ty @ (multi_lane 32 _)) addr) + (vec_replicate_lane ty (vec_load_lane_little_undef ty addr 0) 0)) +(rule (vec_load_replicate_little (and (vxrs_ext2_disabled) + ty @ (multi_lane 64 _)) addr) + (vec_replicate_lane ty (vec_load_lane_little_undef ty addr 0) 0)) + + +;;;; Rules for `scalar_to_vector` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Load scalar value from general-purpose register. 
+(rule (lower (has_type ty (scalar_to_vector + x @ (value_type (ty_int_bool_ref_scalar_64 _))))) + (vec_insert_lane ty (vec_imm ty 0) x (be_lane_idx ty 0) (zero_reg))) + +;; Load scalar value from floating-point register. +(rule (lower (has_type ty (scalar_to_vector + x @ (value_type (ty_scalar_float _))))) + (vec_move_lane_and_zero ty (be_lane_idx ty 0) x 0)) + +;; Load scalar value from vector lane. +(rule (lower (has_type ty (scalar_to_vector + (extractlane x (u8_from_uimm8 idx))))) + (vec_move_lane_and_zero ty (be_lane_idx ty 0) x (be_lane_idx ty idx))) + +;; Load scalar 16-bit immediate value. +(rule (lower (has_type ty (scalar_to_vector (i16_from_value x)))) + (vec_insert_lane_imm ty (vec_imm ty 0) x (be_lane_idx ty 0))) + +;; Load scalar value from big-endian memory. +(rule (lower (has_type ty (scalar_to_vector (sinkable_load x)))) + (vec_load_lane ty (vec_imm ty 0) (sink_load x) (be_lane_idx ty 0))) + +;; Load scalar value lane from little-endian memory. +(rule (lower (has_type ty (scalar_to_vector (sinkable_load_little x)))) + (vec_load_lane_little ty (vec_imm ty 0) (sink_load x) (be_lane_idx ty 0))) + + +;; Helper to extract one lane from a vector and insert it into a zero vector. +(decl vec_move_lane_and_zero (Type u8 Reg u8) Reg) + +;; For 64-bit elements we always use VPDI. +(rule (vec_move_lane_and_zero ty @ (multi_lane 64 _) 0 src src_idx) + (vec_permute_dw_imm ty src src_idx (vec_imm ty 0) 0)) +(rule (vec_move_lane_and_zero ty @ (multi_lane 64 _) 1 src src_idx) + (vec_permute_dw_imm ty (vec_imm ty 0) 0 src src_idx)) + +;; If source and destination index are the same, simply mask to this lane. +(rule (vec_move_lane_and_zero ty idx src idx) + (vec_and ty src + (vec_imm_byte_mask ty (lane_byte_mask ty idx)))) + +;; Otherwise replicate source first and then mask to the lane. +(rule (vec_move_lane_and_zero ty dst_idx src src_idx) + (vec_and ty (vec_replicate_lane ty src src_idx) + (vec_imm_byte_mask ty (lane_byte_mask ty dst_idx)))) + + +;;;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; General case: use vec_permute and then mask off zero lanes. +(rule (lower (shuffle x y (shuffle_mask permute_mask and_mask))) + (vec_and $I8X16 (vec_imm_byte_mask $I8X16 and_mask) + (vec_permute $I8X16 x y (vec_imm $I8X16 permute_mask)))) + +;; If the pattern has no zero lanes, just a vec_permute suffices. +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (vec_permute $I8X16 x y (vec_imm $I8X16 permute_mask))) + +;; Special patterns that can be implemented via MERGE HIGH. 
+(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) permute_mask) + (vec_merge_high $I64X2 x y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 0 1 2 3 16 17 18 19 4 5 6 7 20 21 22 23) permute_mask) + (vec_merge_high $I32X4 x y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 0 1 16 17 2 3 18 19 4 5 20 21 6 7 22 23) permute_mask) + (vec_merge_high $I16X8 x y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23) permute_mask) + (vec_merge_high $I8X16 x y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 16 17 18 19 20 21 22 23 0 1 2 3 4 5 6 7) permute_mask) + (vec_merge_high $I64X2 y x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 16 17 18 19 0 1 2 3 20 21 22 23 4 5 6 7) permute_mask) + (vec_merge_high $I32X4 y x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 16 17 0 1 18 19 2 3 20 21 4 5 22 23 6 7) permute_mask) + (vec_merge_high $I16X8 y x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 16 0 17 1 18 2 19 3 20 4 21 5 22 6 23 7) permute_mask) + (vec_merge_high $I8X16 y x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7) permute_mask) + (vec_merge_high $I64X2 x x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 0 1 2 3 0 1 2 3 4 5 6 7 4 5 6 7) permute_mask) + (vec_merge_high $I32X4 x x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 0 1 0 1 2 3 2 3 4 5 4 5 6 7 6 7) permute_mask) + (vec_merge_high $I16X8 x x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7) permute_mask) + (vec_merge_high $I8X16 x x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 16 17 18 19 20 21 22 23 16 17 18 19 20 21 22 23) permute_mask) + (vec_merge_high $I64X2 y y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 16 17 18 19 16 17 18 19 20 21 22 23 20 21 22 23) permute_mask) + (vec_merge_high $I32X4 y y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 16 17 16 17 18 19 18 19 20 21 20 21 22 23 22 23) permute_mask) + (vec_merge_high $I16X8 y y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 16 16 17 17 18 18 19 19 20 20 21 21 22 22 23 23) permute_mask) + (vec_merge_high $I8X16 y y)) + +;; Special patterns that can be implemented via MERGE LOW. 
+(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) permute_mask) + (vec_merge_low $I64X2 x y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 8 9 10 11 24 25 26 27 12 13 14 15 28 29 30 31) permute_mask) + (vec_merge_low $I32X4 x y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 8 9 24 25 10 11 26 27 12 13 28 29 14 15 30 31) permute_mask) + (vec_merge_low $I16X8 x y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 8 24 9 25 10 26 11 27 12 28 13 29 14 30 15 31) permute_mask) + (vec_merge_low $I8X16 x y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 24 25 26 27 28 29 30 31 8 9 10 11 12 13 14 15) permute_mask) + (vec_merge_low $I64X2 y x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 24 25 26 27 8 9 10 11 28 29 30 31 12 13 14 15) permute_mask) + (vec_merge_low $I32X4 y x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 24 25 8 9 26 27 10 11 28 29 12 13 30 31 14 15) permute_mask) + (vec_merge_low $I16X8 y x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 24 8 25 9 26 10 27 11 28 12 29 13 30 14 31 15) permute_mask) + (vec_merge_low $I8X16 y x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15) permute_mask) + (vec_merge_low $I64X2 x x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 8 9 10 11 8 9 10 11 12 13 14 15 12 13 14 15) permute_mask) + (vec_merge_low $I32X4 x x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 8 9 8 9 10 11 10 11 12 13 12 13 14 15 14 15) permute_mask) + (vec_merge_low $I16X8 x x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15) permute_mask) + (vec_merge_low $I8X16 x x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 24 25 26 27 28 29 30 31 24 25 26 27 28 29 30 31) permute_mask) + (vec_merge_low $I64X2 y y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 24 25 26 27 24 25 26 27 28 29 30 31 28 29 30 31) permute_mask) + (vec_merge_low $I32X4 y y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 24 25 24 25 26 27 26 27 28 29 28 29 30 31 30 31) permute_mask) + (vec_merge_low $I16X8 y y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 24 24 25 25 26 26 27 27 28 28 29 29 30 30 31 31) permute_mask) + (vec_merge_low $I8X16 y y)) + +;; Special patterns that can be implemented via PACK. 
+(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31) permute_mask) + (vec_pack $I64X2 x y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 2 3 6 7 10 11 14 15 18 19 22 23 26 27 30 31) permute_mask) + (vec_pack $I32X4 x y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31) permute_mask) + (vec_pack $I16X8 x y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 20 21 22 23 28 29 30 31 4 5 6 7 12 13 14 15) permute_mask) + (vec_pack $I64X2 y x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 18 19 22 23 26 27 30 31 2 3 6 7 10 11 14 15) permute_mask) + (vec_pack $I32X4 y x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 17 19 21 23 25 27 29 31 1 3 5 7 9 11 13 15) permute_mask) + (vec_pack $I16X8 y x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 4 5 6 7 12 13 14 15 4 5 6 7 12 13 14 15) permute_mask) + (vec_pack $I64X2 x x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 2 3 6 7 10 11 14 15 2 3 6 7 10 11 14 15) permute_mask) + (vec_pack $I32X4 x x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 1 3 5 7 9 11 13 15 1 3 5 7 9 11 13 15) permute_mask) + (vec_pack $I16X8 x x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 20 21 22 23 28 29 30 31 20 21 22 23 28 29 30 31) permute_mask) + (vec_pack $I64X2 y y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 18 19 22 23 26 27 30 31 18 19 22 23 26 27 30 31) permute_mask) + (vec_pack $I32X4 y y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 17 19 21 23 25 27 29 31 17 19 21 23 25 27 29 31) permute_mask) + (vec_pack $I16X8 y y)) + +;; Special patterns that can be implemented via UNPACK HIGH. +(rule (lower (shuffle x y (shuffle_mask permute_mask 3855))) + (if-let (imm8x16 _ _ _ _ 0 1 2 3 _ _ _ _ 4 5 6 7) permute_mask) + (vec_unpacku_high $I32X4 x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 13107))) + (if-let (imm8x16 _ _ 0 1 _ _ 2 3 _ _ 4 5 _ _ 6 7) permute_mask) + (vec_unpacku_high $I16X8 x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 21845))) + (if-let (imm8x16 _ 0 _ 1 _ 2 _ 3 _ 4 _ 5 _ 6 _ 7) permute_mask) + (vec_unpacku_high $I8X16 x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 3855))) + (if-let (imm8x16 _ _ _ _ 16 17 18 19 _ _ _ _ 20 21 22 23) permute_mask) + (vec_unpacku_high $I32X4 y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 13107))) + (if-let (imm8x16 _ _ 16 17 _ _ 18 19 _ _ 20 21 _ _ 22 23) permute_mask) + (vec_unpacku_high $I16X8 y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 21845))) + (if-let (imm8x16 _ 16 _ 17 _ 18 _ 19 _ 20 _ 21 _ 22 _ 23) permute_mask) + (vec_unpacku_high $I8X16 y)) + +;; Special patterns that can be implemented via UNPACK LOW. 
+(rule (lower (shuffle x y (shuffle_mask permute_mask 3855))) + (if-let (imm8x16 _ _ _ _ 8 9 10 11 _ _ _ _ 12 13 14 15) permute_mask) + (vec_unpacku_low $I32X4 x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 13107))) + (if-let (imm8x16 _ _ 8 9 _ _ 10 11 _ _ 12 13 _ _ 14 15) permute_mask) + (vec_unpacku_low $I16X8 x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 21845))) + (if-let (imm8x16 _ 8 _ 9 _ 10 _ 11 _ 12 _ 13 _ 14 _ 15) permute_mask) + (vec_unpacku_low $I8X16 x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 3855))) + (if-let (imm8x16 _ _ _ _ 24 25 26 27 _ _ _ _ 28 29 30 31) permute_mask) + (vec_unpacku_low $I32X4 y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 13107))) + (if-let (imm8x16 _ _ 24 25 _ _ 26 27 _ _ 28 29 _ _ 30 31) permute_mask) + (vec_unpacku_low $I16X8 y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 21845))) + (if-let (imm8x16 _ 24 _ 25 _ 26 _ 27 _ 28 _ 29 _ 30 _ 31) permute_mask) + (vec_unpacku_low $I8X16 y)) + +;; Special patterns that can be implemented via PERMUTE DOUBLEWORD IMMEDIATE. +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 0 1 2 3 4 5 6 7 24 25 26 27 28 29 30 31) permute_mask) + (vec_permute_dw_imm $I8X16 x 0 y 1)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23) permute_mask) + (vec_permute_dw_imm $I8X16 x 1 y 0)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 16 17 18 19 20 21 22 23 8 9 10 11 12 13 14 15) permute_mask) + (vec_permute_dw_imm $I8X16 y 0 x 1)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 24 25 26 27 28 29 30 31 0 1 2 3 4 5 6 7) permute_mask) + (vec_permute_dw_imm $I8X16 y 1 x 0)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) permute_mask) + (vec_permute_dw_imm $I8X16 x 0 x 1)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 8 9 10 11 12 13 14 15 0 1 2 3 4 5 6 7) permute_mask) + (vec_permute_dw_imm $I8X16 x 1 x 0)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) permute_mask) + (vec_permute_dw_imm $I8X16 y 0 y 1)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 24 25 26 27 28 29 30 31 16 17 18 19 20 21 22 23) permute_mask) + (vec_permute_dw_imm $I8X16 y 1 y 0)) + + +;;;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; We need to modify the lane mask at runtime in two ways: +;; - convert from little-endian to big-endian lane numbering +;; - handle mask elements outside the range 0..15 by zeroing the lane +;; +;; To do so efficiently, we compute: +;; permute-lane-element := umax (239, ~ swizzle-lane-element) +;; which has the following effect: +;; elements 0 .. 15 --> 255 .. 240 (i.e. 31 .. 16 mod 32) +;; everything else --> 239 (i.e. 15 mod 32) +;; +;; Then, we can use a single permute instruction with +;; a zero vector as first operand (covering lane 15) +;; the input vector as second operand (covering lanes 16 .. 31) +;; to implement the required swizzle semantics. 
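+;;
+;; As a worked example of the formula above: a swizzle lane index of 3 becomes
+;;    umax (239, ~3) = 252,  and  252 mod 32 = 28,
+;; which selects byte 28 of the concatenated operands, i.e. big-endian byte 12
+;; of the input vector (little-endian lane 3), as required.  An out-of-range
+;; index such as 20 becomes umax (239, ~20) = 239, which selects byte 15 of
+;; the zero vector, so that result lane is zeroed.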
+ +(rule (lower (has_type (ty_vec128 ty) (swizzle x y))) + (vec_permute ty (vec_imm ty 0) x + (vec_umax $I8X16 (vec_imm_splat $I8X16 239) + (vec_not $I8X16 y)))) ;;;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1249,35 +2231,39 @@ (rule (lower (has_type $R64 (load flags @ (littleendian) addr offset))) (loadrev64 (lower_address flags addr offset))) -;; Load 32-bit big-endian floating-point values. +;; Load 32-bit big-endian floating-point values (as vector lane). (rule (lower (has_type $F32 (load flags @ (bigendian) addr offset))) - (fpu_load32 (lower_address flags addr offset))) + (vec_load_lane_undef $F32X4 (lower_address flags addr offset) 0)) -;; Load 32-bit little-endian floating-point values (z15 instruction). -(rule (lower (has_type (and (vxrs_ext2_enabled) $F32) - (load flags @ (littleendian) addr offset))) - (fpu_loadrev32 (lower_address flags addr offset))) +;; Load 32-bit little-endian floating-point values (as vector lane). +(rule (lower (has_type $F32 (load flags @ (littleendian) addr offset))) + (vec_load_lane_little_undef $F32X4 (lower_address flags addr offset) 0)) -;; Load 32-bit little-endian floating-point values (via GPR on z14). -(rule (lower (has_type (and (vxrs_ext2_disabled) $F32) - (load flags @ (littleendian) addr offset))) - (let ((gpr Reg (loadrev32 (lower_address flags addr offset)))) - (mov_to_fpr32 gpr))) - -;; Load 64-bit big-endian floating-point values. +;; Load 64-bit big-endian floating-point values (as vector lane). (rule (lower (has_type $F64 (load flags @ (bigendian) addr offset))) - (fpu_load64 (lower_address flags addr offset))) + (vec_load_lane_undef $F64X2 (lower_address flags addr offset) 0)) -;; Load 64-bit little-endian floating-point values (z15 instruction). -(rule (lower (has_type (and (vxrs_ext2_enabled) $F64) - (load flags @ (littleendian) addr offset))) - (fpu_loadrev64 (lower_address flags addr offset))) +;; Load 64-bit little-endian floating-point values (as vector lane). +(rule (lower (has_type $F64 (load flags @ (littleendian) addr offset))) + (vec_load_lane_little_undef $F64X2 (lower_address flags addr offset) 0)) -;; Load 64-bit little-endian floating-point values (via GPR on z14). -(rule (lower (has_type (and (vxrs_ext2_disabled) $F64) - (load flags @ (littleendian) addr offset))) - (let ((gpr Reg (loadrev64 (lower_address flags addr offset)))) - (mov_to_fpr64 gpr))) +;; Load 128-bit big-endian vector values. +(rule (lower (has_type (ty_vec128 ty) (load flags @ (bigendian) addr offset))) + (vec_load ty (lower_address flags addr offset))) + +;; Load 128-bit little-endian vector values (z15 instruction). +(rule (lower (has_type (and (vxrs_ext2_enabled) (ty_vec128 ty)) + (load flags @ (littleendian) addr offset))) + (vec_loadrev ty (lower_address flags addr offset))) + +;; Load 128-bit little-endian vector values (via GPRs on z14). +(rule (lower (has_type (and (vxrs_ext2_disabled) (ty_vec128 ty)) + (load flags @ (littleendian) addr offset))) + (let ((lo_addr MemArg (lower_address_bias flags addr offset 0)) + (hi_addr MemArg (lower_address_bias flags addr offset 8)) + (lo_val Reg (loadrev64 lo_addr)) + (hi_val Reg (loadrev64 hi_addr))) + (mov_to_vec128 ty hi_val lo_val))) ;;;; Rules for `uload8` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1380,6 +2366,69 @@ (sext64_reg $I32 reg32))) +;;;; Rules for `uloadNxM` and `sloadNxM` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Unsigned 8->16 bit extension, big-endian source value. 
+(rule (lower (has_type $I16X8 (uload8x8 flags @ (bigendian) addr offset))) + (vec_unpacku_high $I8X16 + (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0))) + +;; Unsigned 8->16 bit extension, little-endian source value. +(rule (lower (has_type $I16X8 (uload8x8 flags @ (littleendian) addr offset))) + (vec_unpacku_high $I8X16 + (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0))) + +;; Signed 8->16 bit extension, big-endian source value. +(rule (lower (has_type $I16X8 (sload8x8 flags @ (bigendian) addr offset))) + (vec_unpacks_high $I8X16 + (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0))) + +;; Signed 8->16 bit extension, little-endian source value. +(rule (lower (has_type $I16X8 (sload8x8 flags @ (littleendian) addr offset))) + (vec_unpacks_high $I8X16 + (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0))) + +;; Unsigned 16->32 bit extension, big-endian source value. +(rule (lower (has_type $I32X4 (uload16x4 flags @ (bigendian) addr offset))) + (vec_unpacku_high $I16X8 + (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0))) + +;; Unsigned 16->32 bit extension, little-endian source value. +(rule (lower (has_type $I32X4 (uload16x4 flags @ (littleendian) addr offset))) + (vec_unpacku_high $I16X8 + (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0))) + +;; Signed 16->32 bit extension, big-endian source value. +(rule (lower (has_type $I32X4 (sload16x4 flags @ (bigendian) addr offset))) + (vec_unpacks_high $I16X8 + (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0))) + +;; Signed 16->32 bit extension, little-endian source value. +(rule (lower (has_type $I32X4 (sload16x4 flags @ (littleendian) addr offset))) + (vec_unpacks_high $I16X8 + (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0))) + +;; Unsigned 32->64 bit extension, big-endian source value. +(rule (lower (has_type $I64X2 (uload32x2 flags @ (bigendian) addr offset))) + (vec_unpacku_high $I32X4 + (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0))) + +;; Unsigned 32->64 bit extension, little-endian source value. +(rule (lower (has_type $I64X2 (uload32x2 flags @ (littleendian) addr offset))) + (vec_unpacku_high $I32X4 + (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0))) + +;; Signed 32->64 bit extension, big-endian source value. +(rule (lower (has_type $I64X2 (sload32x2 flags @ (bigendian) addr offset))) + (vec_unpacks_high $I32X4 + (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0))) + +;; Signed 32->64 bit extension, little-endian source value. +(rule (lower (has_type $I64X2 (sload32x2 flags @ (littleendian) addr offset))) + (vec_unpacks_high $I32X4 + (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0))) + + ;;;; Rules for `store` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; The actual store logic for integer types is identical for the `store`, @@ -1405,41 +2454,49 @@ (rule (lower (store flags val @ (value_type $R64) addr offset)) (side_effect (istore64_impl flags val addr offset))) -;; Store 32-bit big-endian floating-point type. +;; Store 32-bit big-endian floating-point type (as vector lane). (rule (lower (store flags @ (bigendian) val @ (value_type $F32) addr offset)) - (side_effect (fpu_store32 (put_in_reg val) - (lower_address flags addr offset)))) + (side_effect (vec_store_lane $F32X4 val + (lower_address flags addr offset) 0))) -;; Store 32-bit little-endian floating-point type (z15 instruction). 
+;; Store 32-bit little-endian floating-point type (as vector lane). (rule (lower (store flags @ (littleendian) - val @ (value_type (and $F32 (vxrs_ext2_enabled))) addr offset)) - (side_effect (fpu_storerev32 (put_in_reg val) - (lower_address flags addr offset)))) + val @ (value_type $F32) addr offset)) + (side_effect (vec_store_lane_little $F32X4 val + (lower_address flags addr offset) 0))) + +;; Store 64-bit big-endian floating-point type (as vector lane). +(rule (lower (store flags @ (bigendian) + val @ (value_type $F64) addr offset)) + (side_effect (vec_store_lane $F64X2 val + (lower_address flags addr offset) 0))) -;; Store 32-bit little-endian floating-point type (via GPR on z14). +;; Store 64-bit little-endian floating-point type (as vector lane). (rule (lower (store flags @ (littleendian) - val @ (value_type (and $F32 (vxrs_ext2_disabled))) addr offset)) - (let ((gpr Reg (mov_from_fpr32 (put_in_reg val)))) - (side_effect (storerev32 gpr (lower_address flags addr offset))))) + val @ (value_type $F64) addr offset)) + (side_effect (vec_store_lane_little $F64X2 val + (lower_address flags addr offset) 0))) -;; Store 64-bit big-endian floating-point type. +;; Store 128-bit big-endian vector type. (rule (lower (store flags @ (bigendian) - val @ (value_type $F64) addr offset)) - (side_effect (fpu_store64 (put_in_reg val) - (lower_address flags addr offset)))) + val @ (value_type (ty_vec128 ty)) addr offset)) + (side_effect (vec_store val (lower_address flags addr offset)))) -;; Store 64-bit little-endian floating-point type (z15 instruction). +;; Store 128-bit little-endian vector type (z15 instruction). (rule (lower (store flags @ (littleendian) - val @ (value_type (and $F64 (vxrs_ext2_enabled))) addr offset)) - (side_effect (fpu_storerev64 (put_in_reg val) - (lower_address flags addr offset)))) + val @ (value_type (and (ty_vec128 ty) (vxrs_ext2_enabled))) addr offset)) + (side_effect (vec_storerev val (lower_address flags addr offset)))) -;; Store 64-bit little-endian floating-point type (via GPR on z14). +;; Store 128-bit little-endian vector type (via GPRs on z14). (rule (lower (store flags @ (littleendian) - val @ (value_type (and $F64 (vxrs_ext2_disabled))) addr offset)) - (let ((gpr Reg (mov_from_fpr64 (put_in_reg val)))) - (side_effect (storerev64 gpr (lower_address flags addr offset))))) + val @ (value_type (and (ty_vec128 ty) (vxrs_ext2_disabled))) addr offset)) + (let ((lo_addr MemArg (lower_address_bias flags addr offset 0)) + (hi_addr MemArg (lower_address_bias flags addr offset 8)) + (lo_val Reg (vec_extract_lane $I64X2 val 1 (zero_reg))) + (hi_val Reg (vec_extract_lane $I64X2 val 0 (zero_reg)))) + (side_effect (side_effect_concat (storerev64 lo_val lo_addr) + (storerev64 hi_val hi_addr))))) ;;;; Rules for 8-bit integer stores ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1947,7 +3004,7 @@ ;; Main `icmp` entry point. Generate a `ProducesBool` capturing the ;; integer comparison and immediately lower it to a 0/1 integer result. ;; In this case, it is safe to sink memory loads. -(rule (lower (has_type ty (icmp int_cc x y))) +(rule (lower (has_type (fits_in_64 ty) (icmp int_cc x y))) (lower_bool ty (icmp_val $true int_cc x y))) @@ -2033,12 +3090,36 @@ (rule (icmpu_val $true x @ (value_type (fits_in_64 ty)) (sinkable_uload32 y)) (icmpu_mem_zext32 ty x (sink_uload32 y))) +;; Vector `icmp` produces a boolean vector. +;; We need to handle the various IntCC flags separately here. 
+ +(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.Equal) x y))) + (vec_cmpeq ty x y)) +(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.NotEqual) x y))) + (vec_not ty (vec_cmpeq ty x y))) +(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.SignedGreaterThan) x y))) + (vec_cmph ty x y)) +(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.SignedLessThanOrEqual) x y))) + (vec_not ty (vec_cmph ty x y))) +(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.SignedLessThan) x y))) + (vec_cmph ty y x)) +(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.SignedGreaterThanOrEqual) x y))) + (vec_not ty (vec_cmph ty y x))) +(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.UnsignedGreaterThan) x y))) + (vec_cmphl ty x y)) +(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.UnsignedLessThanOrEqual) x y))) + (vec_not ty (vec_cmphl ty x y))) +(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.UnsignedLessThan) x y))) + (vec_cmphl ty y x)) +(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.UnsignedGreaterThanOrEqual) x y))) + (vec_not ty (vec_cmphl ty y x))) + ;;;; Rules for `fcmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Main `fcmp` entry point. Generate a `ProducesBool` capturing the ;; integer comparison and immediately lower it to a 0/1 integer result. -(rule (lower (has_type ty (fcmp float_cc x y))) +(rule (lower (has_type (fits_in_64 ty) (fcmp float_cc x y))) (lower_bool ty (fcmp_val float_cc x y))) ;; Return a `ProducesBool` to implement any floating-point comparison. @@ -2047,6 +3128,217 @@ (bool (fcmp_reg ty x y) (floatcc_as_cond float_cc))) +;; Vector `fcmp` produces a boolean vector. +;; We need to handle the various FloatCC flags separately here. + +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.Equal) x y))) + (vec_fcmpeq ty x y)) +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.NotEqual) x y))) + (vec_not ty (vec_fcmpeq ty x y))) +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.GreaterThan) x y))) + (vec_fcmph ty x y)) +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.UnorderedOrLessThanOrEqual) x y))) + (vec_not ty (vec_fcmph ty x y))) +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.GreaterThanOrEqual) x y))) + (vec_fcmphe ty x y)) +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.UnorderedOrLessThan) x y))) + (vec_not ty (vec_fcmphe ty x y))) +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.LessThan) x y))) + (vec_fcmph ty y x)) +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.UnorderedOrGreaterThanOrEqual) x y))) + (vec_not ty (vec_fcmph ty y x))) +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.LessThanOrEqual) x y))) + (vec_fcmphe ty y x)) +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.UnorderedOrGreaterThan) x y))) + (vec_not ty (vec_fcmphe ty y x))) +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.Ordered) x y))) + (vec_or ty (vec_fcmphe ty x y) (vec_fcmphe ty y x))) +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.Unordered) x y))) + (vec_not_or ty (vec_fcmphe ty x y) (vec_fcmphe ty y x))) +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.OrderedNotEqual) x y))) + (vec_or ty (vec_fcmph ty x y) (vec_fcmph ty y x))) +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.UnorderedOrEqual) x y))) + (vec_not_or ty (vec_fcmph ty x y) (vec_fcmph ty y x))) + + +;;;; Rules for `vall_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Main `vall_true` entry point. Generate a `ProducesBool` capturing the +;; comparison and immediately lower it to a 0/1 integer result. 
+(rule (lower (has_type (fits_in_64 ty) (vall_true x))) + (lower_bool ty (vall_true_val x))) + +;; Return a `ProducesBool` to implement `vall_true`. +(decl vall_true_val (Value) ProducesBool) +(rule (vall_true_val x @ (value_type ty)) + (bool (vec_cmpeqs ty x (vec_imm ty 0)) + (floatcc_as_cond (FloatCC.Unordered)))) + +;; Short-circuit `vall_true` on the result of a `icmp`. +(rule (vall_true_val (has_type ty (icmp (IntCC.Equal) x y))) + (bool (vec_cmpeqs ty x y) + (floatcc_as_cond (FloatCC.Equal)))) +(rule (vall_true_val (has_type ty (icmp (IntCC.NotEqual) x y))) + (bool (vec_cmpeqs ty x y) + (floatcc_as_cond (FloatCC.Unordered)))) +(rule (vall_true_val (has_type ty (icmp (IntCC.SignedGreaterThan) x y))) + (bool (vec_cmphs ty x y) + (floatcc_as_cond (FloatCC.Equal)))) +(rule (vall_true_val (has_type ty (icmp (IntCC.SignedLessThanOrEqual) x y))) + (bool (vec_cmphs ty x y) + (floatcc_as_cond (FloatCC.Unordered)))) +(rule (vall_true_val (has_type ty (icmp (IntCC.SignedLessThan) x y))) + (bool (vec_cmphs ty y x) + (floatcc_as_cond (FloatCC.Equal)))) +(rule (vall_true_val (has_type ty (icmp (IntCC.SignedGreaterThanOrEqual) x y))) + (bool (vec_cmphs ty y x) + (floatcc_as_cond (FloatCC.Unordered)))) +(rule (vall_true_val (has_type ty (icmp (IntCC.UnsignedGreaterThan) x y))) + (bool (vec_cmphls ty x y) + (floatcc_as_cond (FloatCC.Equal)))) +(rule (vall_true_val (has_type ty (icmp (IntCC.UnsignedLessThanOrEqual) x y))) + (bool (vec_cmphls ty x y) + (floatcc_as_cond (FloatCC.Unordered)))) +(rule (vall_true_val (has_type ty (icmp (IntCC.UnsignedLessThan) x y))) + (bool (vec_cmphls ty y x) + (floatcc_as_cond (FloatCC.Equal)))) +(rule (vall_true_val (has_type ty (icmp (IntCC.UnsignedGreaterThanOrEqual) x y))) + (bool (vec_cmphls ty y x) + (floatcc_as_cond (FloatCC.Unordered)))) + +;; Short-circuit `vall_true` on the result of a `fcmp` where possible. +(rule (vall_true_val (has_type ty (fcmp (FloatCC.Equal) x y))) + (bool (vec_fcmpeqs ty x y) + (floatcc_as_cond (FloatCC.Equal)))) +(rule (vall_true_val (has_type ty (fcmp (FloatCC.NotEqual) x y))) + (bool (vec_fcmpeqs ty x y) + (floatcc_as_cond (FloatCC.Unordered)))) +(rule (vall_true_val (has_type ty (fcmp (FloatCC.GreaterThan) x y))) + (bool (vec_fcmphs ty x y) + (floatcc_as_cond (FloatCC.Equal)))) +(rule (vall_true_val (has_type ty (fcmp (FloatCC.UnorderedOrLessThanOrEqual) x y))) + (bool (vec_fcmphs ty x y) + (floatcc_as_cond (FloatCC.Unordered)))) +(rule (vall_true_val (has_type ty (fcmp (FloatCC.GreaterThanOrEqual) x y))) + (bool (vec_fcmphes ty x y) + (floatcc_as_cond (FloatCC.Equal)))) +(rule (vall_true_val (has_type ty (fcmp (FloatCC.UnorderedOrLessThan) x y))) + (bool (vec_fcmphes ty x y) + (floatcc_as_cond (FloatCC.Unordered)))) +(rule (vall_true_val (has_type ty (fcmp (FloatCC.LessThan) x y))) + (bool (vec_fcmphs ty y x) + (floatcc_as_cond (FloatCC.Equal)))) +(rule (vall_true_val (has_type ty (fcmp (FloatCC.UnorderedOrGreaterThanOrEqual) x y))) + (bool (vec_fcmphs ty y x) + (floatcc_as_cond (FloatCC.Unordered)))) +(rule (vall_true_val (has_type ty (fcmp (FloatCC.LessThanOrEqual) x y))) + (bool (vec_fcmphes ty y x) + (floatcc_as_cond (FloatCC.Equal)))) +(rule (vall_true_val (has_type ty (fcmp (FloatCC.UnorderedOrGreaterThan) x y))) + (bool (vec_fcmphes ty y x) + (floatcc_as_cond (FloatCC.Unordered)))) + + +;;;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Main `vany_true` entry point. Generate a `ProducesBool` capturing the +;; comparison and immediately lower it to a 0/1 integer result. 
+(rule (lower (has_type (fits_in_64 ty) (vany_true x))) + (lower_bool ty (vany_true_val x))) + +;; Return a `ProducesBool` to implement `vany_true`. +(decl vany_true_val (Value) ProducesBool) +(rule (vany_true_val x @ (value_type ty)) + (bool (vec_cmpeqs ty x (vec_imm ty 0)) + (floatcc_as_cond (FloatCC.NotEqual)))) + +;; Short-circuit `vany_true` on the result of a `icmp`. +(rule (vany_true_val (has_type ty (icmp (IntCC.Equal) x y))) + (bool (vec_cmpeqs ty x y) + (floatcc_as_cond (FloatCC.Ordered)))) +(rule (vany_true_val (has_type ty (icmp (IntCC.NotEqual) x y))) + (bool (vec_cmpeqs ty x y) + (floatcc_as_cond (FloatCC.NotEqual)))) +(rule (vany_true_val (has_type ty (icmp (IntCC.SignedGreaterThan) x y))) + (bool (vec_cmphs ty x y) + (floatcc_as_cond (FloatCC.Ordered)))) +(rule (vany_true_val (has_type ty (icmp (IntCC.SignedLessThanOrEqual) x y))) + (bool (vec_cmphs ty x y) + (floatcc_as_cond (FloatCC.NotEqual)))) +(rule (vany_true_val (has_type ty (icmp (IntCC.SignedLessThan) x y))) + (bool (vec_cmphs ty y x) + (floatcc_as_cond (FloatCC.Ordered)))) +(rule (vany_true_val (has_type ty (icmp (IntCC.SignedGreaterThanOrEqual) x y))) + (bool (vec_cmphs ty y x) + (floatcc_as_cond (FloatCC.NotEqual)))) +(rule (vany_true_val (has_type ty (icmp (IntCC.UnsignedGreaterThan) x y))) + (bool (vec_cmphls ty x y) + (floatcc_as_cond (FloatCC.Ordered)))) +(rule (vany_true_val (has_type ty (icmp (IntCC.UnsignedLessThanOrEqual) x y))) + (bool (vec_cmphls ty x y) + (floatcc_as_cond (FloatCC.NotEqual)))) +(rule (vany_true_val (has_type ty (icmp (IntCC.UnsignedLessThan) x y))) + (bool (vec_cmphls ty y x) + (floatcc_as_cond (FloatCC.Ordered)))) +(rule (vany_true_val (has_type ty (icmp (IntCC.UnsignedGreaterThanOrEqual) x y))) + (bool (vec_cmphls ty y x) + (floatcc_as_cond (FloatCC.NotEqual)))) + +;; Short-circuit `vany_true` on the result of a `fcmp` where possible. 
+(rule (vany_true_val (has_type ty (fcmp (FloatCC.Equal) x y))) + (bool (vec_fcmpeqs ty x y) + (floatcc_as_cond (FloatCC.Ordered)))) +(rule (vany_true_val (has_type ty (fcmp (FloatCC.NotEqual) x y))) + (bool (vec_fcmpeqs ty x y) + (floatcc_as_cond (FloatCC.NotEqual)))) +(rule (vany_true_val (has_type ty (fcmp (FloatCC.GreaterThan) x y))) + (bool (vec_fcmphs ty x y) + (floatcc_as_cond (FloatCC.Ordered)))) +(rule (vany_true_val (has_type ty (fcmp (FloatCC.UnorderedOrLessThanOrEqual) x y))) + (bool (vec_fcmphs ty x y) + (floatcc_as_cond (FloatCC.NotEqual)))) +(rule (vany_true_val (has_type ty (fcmp (FloatCC.GreaterThanOrEqual) x y))) + (bool (vec_fcmphes ty x y) + (floatcc_as_cond (FloatCC.Ordered)))) +(rule (vany_true_val (has_type ty (fcmp (FloatCC.UnorderedOrLessThan) x y))) + (bool (vec_fcmphes ty x y) + (floatcc_as_cond (FloatCC.NotEqual)))) +(rule (vany_true_val (has_type ty (fcmp (FloatCC.LessThan) x y))) + (bool (vec_fcmphs ty y x) + (floatcc_as_cond (FloatCC.Ordered)))) +(rule (vany_true_val (has_type ty (fcmp (FloatCC.UnorderedOrGreaterThanOrEqual) x y))) + (bool (vec_fcmphs ty y x) + (floatcc_as_cond (FloatCC.NotEqual)))) +(rule (vany_true_val (has_type ty (fcmp (FloatCC.LessThanOrEqual) x y))) + (bool (vec_fcmphes ty y x) + (floatcc_as_cond (FloatCC.Ordered)))) +(rule (vany_true_val (has_type ty (fcmp (FloatCC.UnorderedOrGreaterThan) x y))) + (bool (vec_fcmphes ty y x) + (floatcc_as_cond (FloatCC.NotEqual)))) + + +;;;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (vhigh_bits x @ (value_type (multi_lane 8 16)))) + (let ((mask Reg (vec_imm $I8X16 (imm8x16 0 8 16 24 32 40 48 56 + 64 72 80 88 96 104 112 120)))) + (vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg)))) + +(rule (lower (vhigh_bits x @ (value_type (multi_lane 16 8)))) + (let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128 + 0 16 32 48 64 80 96 112)))) + (vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg)))) + +(rule (lower (vhigh_bits x @ (value_type (multi_lane 32 4)))) + (let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128 + 128 128 128 128 0 32 64 96)))) + (vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg)))) + +(rule (lower (vhigh_bits x @ (value_type (multi_lane 64 2)))) + (let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128 + 128 128 128 128 128 128 0 64)))) + (vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg)))) + ;;;; Rules for `is_null` and `is_invalid` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cranelift/codegen/src/isa/s390x/lower.rs b/cranelift/codegen/src/isa/s390x/lower.rs index 2c87621aae32..df93c47023dd 100644 --- a/cranelift/codegen/src/isa/s390x/lower.rs +++ b/cranelift/codegen/src/isa/s390x/lower.rs @@ -43,15 +43,28 @@ impl LowerBackend for S390xBackend { | Opcode::Bconst | Opcode::F32const | Opcode::F64const + | Opcode::Vconst | Opcode::Null | Opcode::Iadd | Opcode::IaddIfcout | Opcode::Isub + | Opcode::UaddSat + | Opcode::SaddSat + | Opcode::UsubSat + | Opcode::SsubSat + | Opcode::IaddPairwise + | Opcode::Imin + | Opcode::Umin + | Opcode::Imax + | Opcode::Umax + | Opcode::AvgRound | Opcode::Iabs | Opcode::Ineg | Opcode::Imul | Opcode::Umulhi | Opcode::Smulhi + | Opcode::WideningPairwiseDotProductS + | Opcode::SqmulRoundSat | Opcode::Udiv | Opcode::Urem | Opcode::Sdiv @@ -64,6 +77,13 @@ impl LowerBackend for S390xBackend { | Opcode::Ireduce | Opcode::Uextend | Opcode::Sextend + | Opcode::Snarrow + | Opcode::Unarrow + | Opcode::Uunarrow + | 
Opcode::SwidenLow + | Opcode::SwidenHigh + | Opcode::UwidenLow + | Opcode::UwidenHigh | Opcode::Bnot | Opcode::Band | Opcode::Bor @@ -72,6 +92,7 @@ impl LowerBackend for S390xBackend { | Opcode::BorNot | Opcode::BxorNot | Opcode::Bitselect + | Opcode::Vselect | Opcode::Breduce | Opcode::Bextend | Opcode::Bmask @@ -86,11 +107,15 @@ impl LowerBackend for S390xBackend { | Opcode::Fdiv | Opcode::Fmin | Opcode::Fmax + | Opcode::FminPseudo + | Opcode::FmaxPseudo | Opcode::Sqrt | Opcode::Fneg | Opcode::Fabs | Opcode::Fpromote | Opcode::Fdemote + | Opcode::FvpromoteLow + | Opcode::Fvdemote | Opcode::Ceil | Opcode::Floor | Opcode::Trunc @@ -99,11 +124,20 @@ impl LowerBackend for S390xBackend { | Opcode::Fcopysign | Opcode::FcvtFromUint | Opcode::FcvtFromSint + | Opcode::FcvtLowFromSint | Opcode::FcvtToUint | Opcode::FcvtToSint | Opcode::FcvtToUintSat | Opcode::FcvtToSintSat + | Opcode::Splat + | Opcode::Swizzle + | Opcode::Shuffle + | Opcode::Insertlane + | Opcode::Extractlane + | Opcode::ScalarToVector + | Opcode::VhighBits | Opcode::Bitcast + | Opcode::RawBitcast | Opcode::Load | Opcode::Uload8 | Opcode::Sload8 @@ -111,6 +145,12 @@ impl LowerBackend for S390xBackend { | Opcode::Sload16 | Opcode::Uload32 | Opcode::Sload32 + | Opcode::Uload8x8 + | Opcode::Sload8x8 + | Opcode::Uload16x4 + | Opcode::Sload16x4 + | Opcode::Uload32x2 + | Opcode::Sload32x2 | Opcode::Store | Opcode::Istore8 | Opcode::Istore16 @@ -122,6 +162,8 @@ impl LowerBackend for S390xBackend { | Opcode::Fence | Opcode::Icmp | Opcode::Fcmp + | Opcode::VanyTrue + | Opcode::VallTrue | Opcode::IsNull | Opcode::IsInvalid | Opcode::Select @@ -147,57 +189,15 @@ impl LowerBackend for S390xBackend { ) } - Opcode::UaddSat - | Opcode::SaddSat - | Opcode::UsubSat - | Opcode::SsubSat - | Opcode::Bitrev - | Opcode::FcvtLowFromSint + Opcode::Bitrev | Opcode::ConstAddr | Opcode::TlsValue | Opcode::GetPinnedReg | Opcode::SetPinnedReg | Opcode::Isplit | Opcode::Iconcat - | Opcode::RawBitcast - | Opcode::Splat - | Opcode::Swizzle - | Opcode::Insertlane - | Opcode::Extractlane - | Opcode::Imin - | Opcode::Umin - | Opcode::Imax - | Opcode::Umax - | Opcode::AvgRound - | Opcode::FminPseudo - | Opcode::FmaxPseudo - | Opcode::Uload8x8 - | Opcode::Sload8x8 - | Opcode::Uload16x4 - | Opcode::Sload16x4 - | Opcode::Uload32x2 - | Opcode::Sload32x2 - | Opcode::Vconst - | Opcode::Shuffle | Opcode::Vsplit | Opcode::Vconcat - | Opcode::Vselect - | Opcode::VanyTrue - | Opcode::VallTrue - | Opcode::VhighBits - | Opcode::ScalarToVector - | Opcode::Snarrow - | Opcode::Unarrow - | Opcode::Uunarrow - | Opcode::SwidenLow - | Opcode::SwidenHigh - | Opcode::UwidenLow - | Opcode::UwidenHigh - | Opcode::WideningPairwiseDotProductS - | Opcode::SqmulRoundSat - | Opcode::FvpromoteLow - | Opcode::Fvdemote - | Opcode::IaddPairwise | Opcode::DynamicStackLoad | Opcode::DynamicStackStore | Opcode::DynamicStackAddr diff --git a/cranelift/codegen/src/isa/s390x/lower/isle.rs b/cranelift/codegen/src/isa/s390x/lower/isle.rs index 2d41c6a88adc..4db95d40afb5 100644 --- a/cranelift/codegen/src/isa/s390x/lower/isle.rs +++ b/cranelift/codegen/src/isa/s390x/lower/isle.rs @@ -6,7 +6,7 @@ pub mod generated_code; // Types that the generated ISLE code uses via `use super::*`. 
 use crate::isa::s390x::abi::S390xMachineDeps;
 use crate::isa::s390x::inst::{
-    stack_reg, writable_gpr, zero_reg, CallIndInfo, CallInfo, Cond, Inst as MInst, MemArg,
+    stack_reg, writable_gpr, zero_reg, CallIndInfo, CallInfo, Cond, Inst as MInst, MemArg, UImm12,
     UImm16Shifted, UImm32Shifted,
 };
 use crate::isa::s390x::settings::Flags as IsaFlags;
@@ -91,6 +91,8 @@ where
             defs,
             clobbers,
             opcode: *opcode,
+            caller_callconv: self.lower_ctx.abi().call_conv(),
+            callee_callconv: abi.call_conv(),
         })
     }
 
@@ -102,6 +104,8 @@ where
             defs,
             clobbers,
             opcode: *opcode,
+            caller_callconv: self.lower_ctx.abi().call_conv(),
+            callee_callconv: abi.call_conv(),
         })
     }
 
@@ -195,6 +199,46 @@ where
         }
     }
 
+    #[inline]
+    fn u64_pair_split(&mut self, n: u128) -> (u64, u64) {
+        ((n >> 64) as u64, n as u64)
+    }
+
+    #[inline]
+    fn u64_pair_concat(&mut self, hi: u64, lo: u64) -> u128 {
+        (hi as u128) << 64 | (lo as u128)
+    }
+
+    #[inline]
+    fn u32_pair_split(&mut self, n: u64) -> (u32, u32) {
+        ((n >> 32) as u32, n as u32)
+    }
+
+    #[inline]
+    fn u32_pair_concat(&mut self, hi: u32, lo: u32) -> u64 {
+        (hi as u64) << 32 | (lo as u64)
+    }
+
+    #[inline]
+    fn u16_pair_split(&mut self, n: u32) -> (u16, u16) {
+        ((n >> 16) as u16, n as u16)
+    }
+
+    #[inline]
+    fn u16_pair_concat(&mut self, hi: u16, lo: u16) -> u32 {
+        (hi as u32) << 16 | (lo as u32)
+    }
+
+    #[inline]
+    fn u8_pair_split(&mut self, n: u16) -> (u8, u8) {
+        ((n >> 8) as u8, n as u8)
+    }
+
+    #[inline]
+    fn u8_pair_concat(&mut self, hi: u8, lo: u8) -> u16 {
+        (hi as u16) << 8 | (lo as u16)
+    }
+
     #[inline]
     fn u8_as_u16(&mut self, n: u8) -> u16 {
         n as u16
@@ -248,6 +292,15 @@ where
         }
     }
 
+    #[inline]
+    fn i16_from_u32(&mut self, n: u32) -> Option<i16> {
+        if let Ok(imm) = i16::try_from(n as i32) {
+            Some(imm)
+        } else {
+            None
+        }
+    }
+
     #[inline]
     fn uimm32shifted_from_u64(&mut self, n: u64) -> Option<UImm32Shifted> {
         UImm32Shifted::maybe_from_u64(n)
@@ -258,11 +311,49 @@
         UImm16Shifted::maybe_from_u64(n)
     }
 
+    #[inline]
+    fn be_lane_idx(&mut self, ty: Type, idx: u8) -> u8 {
+        ty.lane_count() as u8 - 1 - idx
+    }
+
+    #[inline]
+    fn lane_byte_mask(&mut self, ty: Type, idx: u8) -> u16 {
+        let lane_bytes = (ty.lane_bits() / 8) as u8;
+        let lane_mask = (1u16 << lane_bytes) - 1;
+        lane_mask << (16 - ((idx + 1) * lane_bytes))
+    }
+
+    #[inline]
+    fn shuffle_mask_from_u128(&mut self, idx: u128) -> (u128, u16) {
+        let bytes = idx.to_be_bytes();
+        let and_mask = bytes.iter().fold(0, |acc, &x| (acc << 1) | (x < 32) as u16);
+        let bytes = bytes.map(|x| {
+            if x < 16 {
+                15 - x
+            } else if x < 32 {
+                47 - x
+            } else {
+                128
+            }
+        });
+        let permute_mask = u128::from_be_bytes(bytes);
+        (permute_mask, and_mask)
+    }
+
     #[inline]
     fn u64_from_value(&mut self, val: Value) -> Option<u64> {
         let inst = self.lower_ctx.dfg().value_def(val).inst()?;
         let constant = self.lower_ctx.get_constant(inst)?;
-        Some(constant)
+        let ty = self.lower_ctx.output_ty(inst, 0);
+        Some(zero_extend_to_u64(constant, self.ty_bits(ty).unwrap()))
+    }
+
+    #[inline]
+    fn u64_from_inverted_value(&mut self, val: Value) -> Option<u64> {
+        let inst = self.lower_ctx.dfg().value_def(val).inst()?;
+        let constant = self.lower_ctx.get_constant(inst)?;
+        let ty = self.lower_ctx.output_ty(inst, 0);
+        Some(zero_extend_to_u64(!constant, self.ty_bits(ty).unwrap()))
     }
 
     #[inline]
@@ -349,22 +440,22 @@ where
 
     #[inline]
     fn uimm16shifted_from_inverted_value(&mut self, val: Value) -> Option<UImm16Shifted> {
-        let constant = self.u64_from_value(val)?;
-        let imm = UImm16Shifted::maybe_from_u64(!constant)?;
+        let constant = self.u64_from_inverted_value(val)?;
+        let imm = UImm16Shifted::maybe_from_u64(constant)?;
         Some(imm.negate_bits())
     }
 
     #[inline]
     fn uimm32shifted_from_inverted_value(&mut self, val: Value) -> Option<UImm32Shifted> {
-        let constant = self.u64_from_value(val)?;
-        let imm = UImm32Shifted::maybe_from_u64(!constant)?;
+        let constant = self.u64_from_inverted_value(val)?;
+        let imm = UImm32Shifted::maybe_from_u64(constant)?;
         Some(imm.negate_bits())
     }
 
     #[inline]
     fn mask_amt_imm(&mut self, ty: Type, amt: i64) -> u8 {
-        let mask = self.ty_bits(ty).unwrap() - 1;
-        (amt as u8) & mask
+        let mask = ty.lane_bits() - 1;
+        (amt as u8) & (mask as u8)
     }
 
     #[inline]
@@ -498,13 +589,18 @@ where
     }
 
     #[inline]
-    fn memarg_reg_plus_reg(&mut self, x: Reg, y: Reg, flags: MemFlags) -> MemArg {
-        MemArg::reg_plus_reg(x, y, flags)
+    fn memarg_reg_plus_reg(&mut self, x: Reg, y: Reg, bias: u8, flags: MemFlags) -> MemArg {
+        MemArg::BXD12 {
+            base: x,
+            index: y,
+            disp: UImm12::maybe_from_u64(bias as u64).unwrap(),
+            flags,
+        }
     }
 
     #[inline]
-    fn memarg_reg_plus_off(&mut self, reg: Reg, off: i64, flags: MemFlags) -> MemArg {
-        MemArg::reg_plus_off(reg, off, flags)
+    fn memarg_reg_plus_off(&mut self, reg: Reg, off: i64, bias: u8, flags: MemFlags) -> MemArg {
+        MemArg::reg_plus_off(reg, off + (bias as i64), flags)
    }
 
     #[inline]
@@ -586,6 +682,17 @@ where
     }
 }
 
+/// Zero-extend the low `from_bits` bits of `value` to a full u64.
+#[inline]
+fn zero_extend_to_u64(value: u64, from_bits: u8) -> u64 {
+    assert!(from_bits <= 64);
+    if from_bits >= 64 {
+        value
+    } else {
+        value & ((1u64 << from_bits) - 1)
+    }
+}
+
 /// Sign-extend the low `from_bits` bits of `value` to a full u64.
 #[inline]
 fn sign_extend_to_u64(value: u64, from_bits: u8) -> u64 {
diff --git a/cranelift/codegen/src/machinst/abi_impl.rs b/cranelift/codegen/src/machinst/abi_impl.rs
index 45bf8884b916..718871e82708 100644
--- a/cranelift/codegen/src/machinst/abi_impl.rs
+++ b/cranelift/codegen/src/machinst/abi_impl.rs
@@ -696,6 +696,11 @@ impl ABISig {
         let ret_arg = self.stack_ret_arg?;
         Some(self.args[ret_arg].clone())
     }
+
+    /// Get calling convention used.
+    pub fn call_conv(&self) -> isa::CallConv {
+        self.call_conv
+    }
 }
 
 /// ABI object for a function body.
diff --git a/cranelift/codegen/src/machinst/isle.rs b/cranelift/codegen/src/machinst/isle.rs
index 28005863df33..c1bdc418bd6a 100644
--- a/cranelift/codegen/src/machinst/isle.rs
+++ b/cranelift/codegen/src/machinst/isle.rs
@@ -7,7 +7,8 @@ use std::cell::Cell;
 
 pub use super::MachLabel;
 pub use crate::ir::{
-    ArgumentExtension, DynamicStackSlot, ExternalName, FuncRef, GlobalValue, SigRef, StackSlot,
+    ArgumentExtension, Constant, DynamicStackSlot, ExternalName, FuncRef, GlobalValue, Immediate,
+    SigRef, StackSlot,
 };
 pub use crate::isa::unwind::UnwindInst;
 pub use crate::machinst::{ABIArg, ABIArgSlot, ABISig, RealReg, Reg, RelocDistance, Writable};
@@ -539,6 +540,18 @@ macro_rules! isle_prelude_methods {
            }
        }
 
+        #[inline]
+        fn u128_from_immediate(&mut self, imm: Immediate) -> Option<u128> {
+            let bytes = self.lower_ctx.get_immediate_data(imm).as_slice();
+            Some(u128::from_le_bytes(bytes.try_into().ok()?))
+        }
+
+        #[inline]
+        fn u128_from_constant(&mut self, constant: Constant) -> Option<u128> {
+            let bytes = self.lower_ctx.get_constant_data(constant).as_slice();
+            Some(u128::from_le_bytes(bytes.try_into().ok()?))
+        }
+
         fn nonzero_u64_from_imm64(&mut self, val: Imm64) -> Option<u64> {
             match val.bits() {
                 0 => None,
diff --git a/cranelift/codegen/src/machinst/lower.rs b/cranelift/codegen/src/machinst/lower.rs
index a496f9a61657..42a4f74c240b 100644
--- a/cranelift/codegen/src/machinst/lower.rs
+++ b/cranelift/codegen/src/machinst/lower.rs
@@ -12,8 +12,8 @@ use crate::inst_predicates::{has_lowering_side_effect, is_constant_64bit};
 use crate::ir::{
     types::{FFLAGS, IFLAGS},
     ArgumentPurpose, Block, Constant, ConstantData, DataFlowGraph, ExternalName, Function,
-    GlobalValue, GlobalValueData, Inst, InstructionData, MemFlags, Opcode, Signature, SourceLoc,
-    Type, Value, ValueDef, ValueLabelAssignments, ValueLabelStart,
+    GlobalValue, GlobalValueData, Immediate, Inst, InstructionData, MemFlags, Opcode, Signature,
+    SourceLoc, Type, Value, ValueDef, ValueLabelAssignments, ValueLabelStart,
 };
 use crate::machinst::{
     non_writable_value_regs, writable_value_regs, ABICallee, BlockIndex, BlockLoweringOrder,
@@ -167,6 +167,8 @@ pub trait LowerCtx {
     /// for the input produced by the sunk instruction), otherwise the
     /// side-effect will occur twice.
     fn sink_inst(&mut self, ir_inst: Inst);
+    /// Retrieve immediate data given a handle.
+    fn get_immediate_data(&self, imm: Immediate) -> &ConstantData;
     /// Retrieve constant data given a handle.
     fn get_constant_data(&self, constant_handle: Constant) -> &ConstantData;
     /// Indicate that a constant should be emitted.
@@ -1448,6 +1450,10 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> {
         self.inst_sunk.insert(ir_inst);
     }
 
+    fn get_immediate_data(&self, imm: Immediate) -> &ConstantData {
+        self.f.dfg.immediates.get(imm).unwrap()
+    }
+
     fn get_constant_data(&self, constant_handle: Constant) -> &ConstantData {
         self.f.dfg.constants.get(constant_handle)
     }
diff --git a/cranelift/codegen/src/prelude.isle b/cranelift/codegen/src/prelude.isle
index ccaef3234197..abbac35bb922 100644
--- a/cranelift/codegen/src/prelude.isle
+++ b/cranelift/codegen/src/prelude.isle
@@ -657,6 +657,17 @@
 (decl reloc_distance_near () RelocDistance)
 (extern extractor reloc_distance_near reloc_distance_near)
 
+;; Accessor for `Immediate` as u128.
+
+(decl u128_from_immediate (u128) Immediate)
+(extern extractor u128_from_immediate u128_from_immediate)
+
+;; Accessor for `Constant` as u128.
+
+(decl u128_from_constant (u128) Constant)
+(extern extractor u128_from_constant u128_from_constant)
+
+
 ;;;; Helpers for tail recursion loops ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ;; A range of integers to loop through.
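;; Usage note for the two new prelude extractors: `u128_from_immediate` and
;; `u128_from_constant` expose an `Immediate` or `Constant` handle as a plain
;; 128-bit value, which is what lets the s390x lowering rules match `swizzle`
;; masks and `vconst` pool constants directly.  As a minimal illustrative
;; sketch only (the concrete s390x rules live in lower.isle above and may
;; differ in detail), a vconst lowering built on such an extractor could read:
;;
;;   (rule (lower (has_type (ty_vec128 ty) (vconst (u128_from_constant n))))
;;         (vec_imm ty n))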
diff --git a/cranelift/filetests/filetests/isa/s390x/condops.clif b/cranelift/filetests/filetests/isa/s390x/condops.clif index 5a73e0ae1e31..d84cd49c0500 100644 --- a/cranelift/filetests/filetests/isa/s390x/condops.clif +++ b/cranelift/filetests/filetests/isa/s390x/condops.clif @@ -43,3 +43,18 @@ block0(v0: i32, v1: i8, v2: i8): ; locre %r2, %r3 ; br %r14 +function %i(i32, i8x16, i8x16) -> i8x16 { +block0(v0: i32, v1: i8x16, v2: i8x16): + v3 = iconst.i32 42 + v4 = icmp.i32 eq v0, v3 + v5 = select.i8x16 v4, v1, v2 + return v5 +} + +; block0: +; vlr %v20, %v24 +; clfi %r2, 42 +; vlr %v24, %v25 +; jne 10 ; vlr %v24, %v20 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/floating-point.clif b/cranelift/filetests/filetests/isa/s390x/floating-point.clif index 4a8a84f37e6c..47e28b87d687 100644 --- a/cranelift/filetests/filetests/isa/s390x/floating-point.clif +++ b/cranelift/filetests/filetests/isa/s390x/floating-point.clif @@ -168,6 +168,46 @@ block0(v0: f64, v1: f64): ; wfmaxdb %f0, %f0, %f2, 1 ; br %r14 +function %fmin_pseudo_f32(f32, f32) -> f32 { +block0(v0: f32, v1: f32): + v2 = fmin_pseudo v0, v1 + return v2 +} + +; block0: +; wfminsb %f0, %f0, %f2, 3 +; br %r14 + +function %fmin_pseudo_f64(f64, f64) -> f64 { +block0(v0: f64, v1: f64): + v2 = fmin_pseudo v0, v1 + return v2 +} + +; block0: +; wfmindb %f0, %f0, %f2, 3 +; br %r14 + +function %fmax_pseudo_f32(f32, f32) -> f32 { +block0(v0: f32, v1: f32): + v2 = fmax_pseudo v0, v1 + return v2 +} + +; block0: +; wfmaxsb %f0, %f0, %f2, 3 +; br %r14 + +function %fmax_pseudo_f64(f64, f64) -> f64 { +block0(v0: f64, v1: f64): + v2 = fmax_pseudo v0, v1 + return v2 +} + +; block0: +; wfmaxdb %f0, %f0, %f2, 3 +; br %r14 + function %sqrt_f32(f32) -> f32 { block0(v0: f32): v1 = sqrt v0 diff --git a/cranelift/filetests/filetests/isa/s390x/fpmem-arch13.clif b/cranelift/filetests/filetests/isa/s390x/fpmem-arch13.clif index 76224768bc9f..736d72b7a1da 100644 --- a/cranelift/filetests/filetests/isa/s390x/fpmem-arch13.clif +++ b/cranelift/filetests/filetests/isa/s390x/fpmem-arch13.clif @@ -8,7 +8,7 @@ block0(v0: i64): } ; block0: -; vlebrg %f0, 0(%r2), 0 +; vlebrg %v0, 0(%r2), 0 ; br %r14 function %load_f32_little(i64) -> f32 { @@ -18,7 +18,7 @@ block0(v0: i64): } ; block0: -; vlebrf %f0, 0(%r2), 0 +; vlebrf %v0, 0(%r2), 0 ; br %r14 function %store_f64_little(f64, i64) { @@ -28,7 +28,7 @@ block0(v0: f64, v1: i64): } ; block0: -; vstebrg %f0, 0(%r2), 0 +; vstebrg %v0, 0(%r2), 0 ; br %r14 function %store_f32_little(f32, i64) { @@ -38,6 +38,6 @@ block0(v0: f32, v1: i64): } ; block0: -; vstebrf %f0, 0(%r2), 0 +; vstebrf %v0, 0(%r2), 0 ; br %r14 diff --git a/cranelift/filetests/filetests/isa/s390x/vec-arithmetic.clif b/cranelift/filetests/filetests/isa/s390x/vec-arithmetic.clif new file mode 100644 index 000000000000..334c43821b8a --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-arithmetic.clif @@ -0,0 +1,824 @@ +test compile precise-output +target s390x + +function %iadd_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = iadd.i64x2 v0, v1 + return v2 +} + +; block0: +; vag %v24, %v24, %v25 +; br %r14 + +function %iadd_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = iadd.i32x4 v0, v1 + return v2 +} + +; block0: +; vaf %v24, %v24, %v25 +; br %r14 + +function %iadd_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = iadd.i16x8 v0, v1 + return v2 +} + +; block0: +; vah %v24, %v24, %v25 +; br %r14 + +function %iadd_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = 
iadd.i8x16 v0, v1 + return v2 +} + +; block0: +; vab %v24, %v24, %v25 +; br %r14 + +function %isub_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = isub.i64x2 v0, v1 + return v2 +} + +; block0: +; vsg %v24, %v24, %v25 +; br %r14 + +function %isub_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = isub.i32x4 v0, v1 + return v2 +} + +; block0: +; vsf %v24, %v24, %v25 +; br %r14 + +function %isub_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = isub.i16x8 v0, v1 + return v2 +} + +; block0: +; vsh %v24, %v24, %v25 +; br %r14 + +function %isub_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = isub.i8x16 v0, v1 + return v2 +} + +; block0: +; vsb %v24, %v24, %v25 +; br %r14 + +function %iabs_i64x2(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iabs.i64x2 v0 + return v1 +} + +; block0: +; vlpg %v24, %v24 +; br %r14 + +function %iabs_i32x4(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iabs.i32x4 v0 + return v1 +} + +; block0: +; vlpf %v24, %v24 +; br %r14 + +function %iabs_i16x8(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iabs.i16x8 v0 + return v1 +} + +; block0: +; vlph %v24, %v24 +; br %r14 + +function %iabs_i8x16(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iabs.i8x16 v0 + return v1 +} + +; block0: +; vlpb %v24, %v24 +; br %r14 + +function %ineg_i64x2(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = ineg.i64x2 v0 + return v1 +} + +; block0: +; vlcg %v24, %v24 +; br %r14 + +function %ineg_i32x4(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = ineg.i32x4 v0 + return v1 +} + +; block0: +; vlcf %v24, %v24 +; br %r14 + +function %ineg_i16x8(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = ineg.i16x8 v0 + return v1 +} + +; block0: +; vlch %v24, %v24 +; br %r14 + +function %ineg_i8x16(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = ineg.i8x16 v0 + return v1 +} + +; block0: +; vlcb %v24, %v24 +; br %r14 + +function %umax_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = umax.i64x2 v0, v1 + return v2 +} + +; block0: +; vmxlg %v24, %v24, %v25 +; br %r14 + +function %umax_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = umax.i32x4 v0, v1 + return v2 +} + +; block0: +; vmxlf %v24, %v24, %v25 +; br %r14 + +function %umax_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = umax.i16x8 v0, v1 + return v2 +} + +; block0: +; vmxlh %v24, %v24, %v25 +; br %r14 + +function %umax_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = umax.i8x16 v0, v1 + return v2 +} + +; block0: +; vmxlb %v24, %v24, %v25 +; br %r14 + +function %umin_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = umin.i64x2 v0, v1 + return v2 +} + +; block0: +; vmnlg %v24, %v24, %v25 +; br %r14 + +function %umin_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = umin.i32x4 v0, v1 + return v2 +} + +; block0: +; vmnlf %v24, %v24, %v25 +; br %r14 + +function %umin_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = umin.i16x8 v0, v1 + return v2 +} + +; block0: +; vmnlh %v24, %v24, %v25 +; br %r14 + +function %umin_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = umin.i8x16 v0, v1 + return v2 +} + +; block0: +; vmnlb %v24, %v24, %v25 +; br %r14 + +function %imax_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = imax.i64x2 v0, v1 + return v2 +} + +; block0: +; vmxg %v24, %v24, %v25 +; br %r14 + +function %imax_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = imax.i32x4 v0, v1 + return v2 +} + +; block0: +; vmxf 
%v24, %v24, %v25 +; br %r14 + +function %imax_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = imax.i16x8 v0, v1 + return v2 +} + +; block0: +; vmxh %v24, %v24, %v25 +; br %r14 + +function %imax_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = imax.i8x16 v0, v1 + return v2 +} + +; block0: +; vmxb %v24, %v24, %v25 +; br %r14 + +function %imin_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = imin.i64x2 v0, v1 + return v2 +} + +; block0: +; vmng %v24, %v24, %v25 +; br %r14 + +function %imin_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = imin.i32x4 v0, v1 + return v2 +} + +; block0: +; vmnf %v24, %v24, %v25 +; br %r14 + +function %imin_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = imin.i16x8 v0, v1 + return v2 +} + +; block0: +; vmnh %v24, %v24, %v25 +; br %r14 + +function %imin_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = imin.i8x16 v0, v1 + return v2 +} + +; block0: +; vmnb %v24, %v24, %v25 +; br %r14 + +function %avg_round_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = avg_round.i64x2 v0, v1 + return v2 +} + +; block0: +; vavglg %v24, %v24, %v25 +; br %r14 + +function %avg_round_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = avg_round.i32x4 v0, v1 + return v2 +} + +; block0: +; vavglf %v24, %v24, %v25 +; br %r14 + +function %avg_round_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = avg_round.i16x8 v0, v1 + return v2 +} + +; block0: +; vavglh %v24, %v24, %v25 +; br %r14 + +function %avg_round_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = avg_round.i8x16 v0, v1 + return v2 +} + +; block0: +; vavglb %v24, %v24, %v25 +; br %r14 + +function %uadd_sat64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = uadd_sat.i64x2 v0, v1 + return v2 +} + +; block0: +; vag %v5, %v24, %v25 +; vchlg %v7, %v24, %v5 +; vo %v24, %v5, %v7 +; br %r14 + +function %uadd_sat32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = uadd_sat.i32x4 v0, v1 + return v2 +} + +; block0: +; vaf %v5, %v24, %v25 +; vchlf %v7, %v24, %v5 +; vo %v24, %v5, %v7 +; br %r14 + +function %uadd_sat16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = uadd_sat.i16x8 v0, v1 + return v2 +} + +; block0: +; vah %v5, %v24, %v25 +; vchlh %v7, %v24, %v5 +; vo %v24, %v5, %v7 +; br %r14 + +function %uadd_sat8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = uadd_sat.i8x16 v0, v1 + return v2 +} + +; block0: +; vab %v5, %v24, %v25 +; vchlb %v7, %v24, %v5 +; vo %v24, %v5, %v7 +; br %r14 + +function %sadd_sat32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = sadd_sat.i32x4 v0, v1 + return v2 +} + +; block0: +; vuphf %v5, %v24 +; vuphf %v7, %v25 +; vag %v17, %v5, %v7 +; vuplf %v19, %v24 +; vuplf %v21, %v25 +; vag %v23, %v19, %v21 +; vpksg %v24, %v17, %v23 +; br %r14 + +function %sadd_sat16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = sadd_sat.i16x8 v0, v1 + return v2 +} + +; block0: +; vuphh %v5, %v24 +; vuphh %v7, %v25 +; vaf %v17, %v5, %v7 +; vuplh %v19, %v24 +; vuplh %v21, %v25 +; vaf %v23, %v19, %v21 +; vpksf %v24, %v17, %v23 +; br %r14 + +function %sadd_sat8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = sadd_sat.i8x16 v0, v1 + return v2 +} + +; block0: +; vuphb %v5, %v24 +; vuphb %v7, %v25 +; vah %v17, %v5, %v7 +; vuplb %v19, %v24 +; vuplb %v21, %v25 +; vah %v23, %v19, %v21 +; vpksh %v24, %v17, %v23 +; br %r14 + +function 
%iadd_pairwise_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = iadd_pairwise.i32x4 v0, v1 + return v2 +} + +; block0: +; vrepib %v5, 32 +; vsrlb %v7, %v25, %v5 +; vaf %v17, %v25, %v7 +; vsrlb %v19, %v24, %v5 +; vaf %v21, %v24, %v19 +; vpkg %v24, %v17, %v21 +; br %r14 + +function %usub_sat64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = usub_sat.i64x2 v0, v1 + return v2 +} + +; block0: +; vsg %v5, %v24, %v25 +; vchlg %v7, %v24, %v25 +; vn %v24, %v5, %v7 +; br %r14 + +function %usub_sat32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = usub_sat.i32x4 v0, v1 + return v2 +} + +; block0: +; vsf %v5, %v24, %v25 +; vchlf %v7, %v24, %v25 +; vn %v24, %v5, %v7 +; br %r14 + +function %usub_sat16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = usub_sat.i16x8 v0, v1 + return v2 +} + +; block0: +; vsh %v5, %v24, %v25 +; vchlh %v7, %v24, %v25 +; vn %v24, %v5, %v7 +; br %r14 + +function %usub_sat8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = usub_sat.i8x16 v0, v1 + return v2 +} + +; block0: +; vsb %v5, %v24, %v25 +; vchlb %v7, %v24, %v25 +; vn %v24, %v5, %v7 +; br %r14 + +function %ssub_sat32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = ssub_sat.i32x4 v0, v1 + return v2 +} + +; block0: +; vuphf %v5, %v24 +; vuphf %v7, %v25 +; vsg %v17, %v5, %v7 +; vuplf %v19, %v24 +; vuplf %v21, %v25 +; vsg %v23, %v19, %v21 +; vpksg %v24, %v17, %v23 +; br %r14 + +function %ssub_sat16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = ssub_sat.i16x8 v0, v1 + return v2 +} + +; block0: +; vuphh %v5, %v24 +; vuphh %v7, %v25 +; vsf %v17, %v5, %v7 +; vuplh %v19, %v24 +; vuplh %v21, %v25 +; vsf %v23, %v19, %v21 +; vpksf %v24, %v17, %v23 +; br %r14 + +function %ssub_sat8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = ssub_sat.i8x16 v0, v1 + return v2 +} + +; block0: +; vuphb %v5, %v24 +; vuphb %v7, %v25 +; vsh %v17, %v5, %v7 +; vuplb %v19, %v24 +; vuplb %v21, %v25 +; vsh %v23, %v19, %v21 +; vpksh %v24, %v17, %v23 +; br %r14 + +function %iadd_pairwise_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = iadd_pairwise.i32x4 v0, v1 + return v2 +} + +; block0: +; vrepib %v5, 32 +; vsrlb %v7, %v25, %v5 +; vaf %v17, %v25, %v7 +; vsrlb %v19, %v24, %v5 +; vaf %v21, %v24, %v19 +; vpkg %v24, %v17, %v21 +; br %r14 + +function %iadd_pairwise_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = iadd_pairwise.i16x8 v0, v1 + return v2 +} + +; block0: +; vrepib %v5, 16 +; vsrlb %v7, %v25, %v5 +; vah %v17, %v25, %v7 +; vsrlb %v19, %v24, %v5 +; vah %v21, %v24, %v19 +; vpkf %v24, %v17, %v21 +; br %r14 + +function %iadd_pairwise_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = iadd_pairwise.i8x16 v0, v1 + return v2 +} + +; block0: +; vrepib %v5, 8 +; vsrlb %v7, %v25, %v5 +; vab %v17, %v25, %v7 +; vsrlb %v19, %v24, %v5 +; vab %v21, %v24, %v19 +; vpkh %v24, %v17, %v21 +; br %r14 + +function %imul_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = imul.i64x2 v0, v1 + return v2 +} + +; block0: +; vlgvg %r3, %v24, 0 +; vlgvg %r5, %v25, 0 +; msgr %r3, %r5 +; vlgvg %r5, %v24, 1 +; vlgvg %r4, %v25, 1 +; msgr %r5, %r4 +; vlvgp %v24, %r3, %r5 +; br %r14 + +function %imul_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = imul.i32x4 v0, v1 + return v2 +} + +; block0: +; vmlf %v24, %v24, %v25 +; br %r14 + +function %imul_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = imul.i16x8 v0, v1 + return v2 +} + +; 
block0: +; vmlhw %v24, %v24, %v25 +; br %r14 + +function %imul_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = imul.i8x16 v0, v1 + return v2 +} + +; block0: +; vmlb %v24, %v24, %v25 +; br %r14 + +function %umulhi_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = umulhi.i64x2 v0, v1 + return v2 +} + +; block0: +; vlgvg %r3, %v24, 0 +; vlgvg %r1, %v25, 0 +; mlgr %r0, %r3 +; lgr %r2, %r0 +; vlgvg %r3, %v24, 1 +; vlgvg %r1, %v25, 1 +; mlgr %r0, %r3 +; vlvgp %v24, %r2, %r0 +; br %r14 + +function %umulhi_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = umulhi.i32x4 v0, v1 + return v2 +} + +; block0: +; vmlhf %v24, %v24, %v25 +; br %r14 + +function %umulhi_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = umulhi.i16x8 v0, v1 + return v2 +} + +; block0: +; vmlhh %v24, %v24, %v25 +; br %r14 + +function %umulhi_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = umulhi.i8x16 v0, v1 + return v2 +} + +; block0: +; vmlhb %v24, %v24, %v25 +; br %r14 + +function %smulhi_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = smulhi.i64x2 v0, v1 + return v2 +} + +; block0: +; vlgvg %r3, %v24, 0 +; vlgvg %r5, %v25, 0 +; mgrk %r0, %r3, %r5 +; lgr %r3, %r0 +; vlgvg %r2, %v24, 1 +; vlgvg %r4, %v25, 1 +; mgrk %r0, %r2, %r4 +; lgr %r4, %r3 +; vlvgp %v24, %r4, %r0 +; br %r14 + +function %smulhi_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = smulhi.i32x4 v0, v1 + return v2 +} + +; block0: +; vmhf %v24, %v24, %v25 +; br %r14 + +function %smulhi_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = smulhi.i16x8 v0, v1 + return v2 +} + +; block0: +; vmhh %v24, %v24, %v25 +; br %r14 + +function %smulhi_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = smulhi.i8x16 v0, v1 + return v2 +} + +; block0: +; vmhb %v24, %v24, %v25 +; br %r14 + +function %widening_pairwise_dot_product_s_i16x8(i16x8, i16x8) -> i32x4 { +block0(v0: i16x8, v1: i16x8): + v2 = widening_pairwise_dot_product_s v0, v1 + return v2 +} + +; block0: +; vmeh %v5, %v24, %v25 +; vmoh %v7, %v24, %v25 +; vaf %v24, %v5, %v7 +; br %r14 + +function %sqmul_round_sat(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = sqmul_round_sat.i16x8 v0, v1 + return v2 +} + +; block0: +; vuphh %v5, %v24 +; vuphh %v7, %v25 +; vmlf %v17, %v5, %v7 +; vgmf %v19, 17, 17 +; vaf %v21, %v17, %v19 +; vesraf %v23, %v21, 15 +; vuplh %v26, %v24 +; vuplh %v27, %v25 +; vmlf %v29, %v26, %v27 +; vgmf %v31, 17, 17 +; vaf %v1, %v29, %v31 +; vesraf %v3, %v1, 15 +; vpksf %v24, %v23, %v3 +; br %r14 + +function %sqmul_round_sat(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = sqmul_round_sat.i32x4 v0, v1 + return v2 +} + +; block0: +; vuphf %v5, %v24 +; vuphf %v7, %v25 +; lgdr %r3, %f5 +; lgdr %r5, %f7 +; msgr %r3, %r5 +; vlgvg %r5, %v5, 1 +; vlgvg %r4, %v7, 1 +; msgr %r5, %r4 +; vlvgp %v29, %r3, %r5 +; vgmg %v31, 33, 33 +; vag %v1, %v29, %v31 +; vesrag %v3, %v1, 31 +; vuplf %v5, %v24 +; vuplf %v7, %v25 +; lgdr %r3, %f5 +; lgdr %r5, %f7 +; msgr %r3, %r5 +; vlgvg %r5, %v5, 1 +; vlgvg %r4, %v7, 1 +; msgr %r5, %r4 +; vlvgp %v29, %r3, %r5 +; vgmg %v31, 33, 33 +; vag %v1, %v29, %v31 +; vesrag %v4, %v1, 31 +; vpksg %v24, %v3, %v4 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-bitops.clif b/cranelift/filetests/filetests/isa/s390x/vec-bitops.clif new file mode 100644 index 000000000000..a5cff95c475c --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-bitops.clif @@ -0,0 +1,43 @@ +test 
compile precise-output +target s390x + +function %popcnt_i64x2(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = popcnt.i64x2 v0 + return v1 +} + +; block0: +; vpopctg %v24, %v24 +; br %r14 + +function %popcnt_i32x4(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = popcnt.i32x4 v0 + return v1 +} + +; block0: +; vpopctf %v24, %v24 +; br %r14 + +function %popcnt_i16x8(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = popcnt.i16x8 v0 + return v1 +} + +; block0: +; vpopcth %v24, %v24 +; br %r14 + +function %popcnt_i8x16(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = popcnt.i8x16 v0 + return v1 +} + +; block0: +; vpopctb %v24, %v24 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-bitwise.clif b/cranelift/filetests/filetests/isa/s390x/vec-bitwise.clif new file mode 100644 index 000000000000..8722a78703b3 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-bitwise.clif @@ -0,0 +1,364 @@ + +test compile precise-output +target s390x + +function %band_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = band.i64x2 v0, v1 + return v2 +} + +; block0: +; vn %v24, %v24, %v25 +; br %r14 + +function %band_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = band.i32x4 v0, v1 + return v2 +} + +; block0: +; vn %v24, %v24, %v25 +; br %r14 + +function %band_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = band.i16x8 v0, v1 + return v2 +} + +; block0: +; vn %v24, %v24, %v25 +; br %r14 + +function %band_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = band.i8x16 v0, v1 + return v2 +} + +; block0: +; vn %v24, %v24, %v25 +; br %r14 + +function %bor_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bor.i64x2 v0, v1 + return v2 +} + +; block0: +; vo %v24, %v24, %v25 +; br %r14 + +function %bor_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bor.i32x4 v0, v1 + return v2 +} + +; block0: +; vo %v24, %v24, %v25 +; br %r14 + +function %bor_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bor.i16x8 v0, v1 + return v2 +} + +; block0: +; vo %v24, %v24, %v25 +; br %r14 + +function %bor_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = bor.i8x16 v0, v1 + return v2 +} + +; block0: +; vo %v24, %v24, %v25 +; br %r14 + +function %bxor_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bxor.i64x2 v0, v1 + return v2 +} + +; block0: +; vx %v24, %v24, %v25 +; br %r14 + +function %bxor_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bxor.i32x4 v0, v1 + return v2 +} + +; block0: +; vx %v24, %v24, %v25 +; br %r14 + +function %bxor_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bxor.i16x8 v0, v1 + return v2 +} + +; block0: +; vx %v24, %v24, %v25 +; br %r14 + +function %bxor_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = bxor.i8x16 v0, v1 + return v2 +} + +; block0: +; vx %v24, %v24, %v25 +; br %r14 + +function %band_not_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = band_not.i64x2 v0, v1 + return v2 +} + +; block0: +; vnc %v24, %v24, %v25 +; br %r14 + +function %band_not_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = band_not.i32x4 v0, v1 + return v2 +} + +; block0: +; vnc %v24, %v24, %v25 +; br %r14 + +function %band_not_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = band_not.i16x8 v0, v1 + return v2 +} + +; block0: +; vnc %v24, %v24, %v25 +; br %r14 + +function %band_not_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: 
i8x16): + v2 = band_not.i8x16 v0, v1 + return v2 +} + +; block0: +; vnc %v24, %v24, %v25 +; br %r14 + +function %bor_not_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bor_not.i64x2 v0, v1 + return v2 +} + +; block0: +; voc %v24, %v24, %v25 +; br %r14 + +function %bor_not_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bor_not.i32x4 v0, v1 + return v2 +} + +; block0: +; voc %v24, %v24, %v25 +; br %r14 + +function %bor_not_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bor_not.i16x8 v0, v1 + return v2 +} + +; block0: +; voc %v24, %v24, %v25 +; br %r14 + +function %bor_not_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = bor_not.i8x16 v0, v1 + return v2 +} + +; block0: +; voc %v24, %v24, %v25 +; br %r14 + +function %bxor_not_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bxor_not.i64x2 v0, v1 + return v2 +} + +; block0: +; vnx %v24, %v24, %v25 +; br %r14 + +function %bxor_not_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bxor_not.i32x4 v0, v1 + return v2 +} + +; block0: +; vnx %v24, %v24, %v25 +; br %r14 + +function %bxor_not_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bxor_not.i16x8 v0, v1 + return v2 +} + +; block0: +; vnx %v24, %v24, %v25 +; br %r14 + +function %bxor_not_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = bxor_not.i8x16 v0, v1 + return v2 +} + +; block0: +; vnx %v24, %v24, %v25 +; br %r14 + +function %bnot_i64x2(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = bnot.i64x2 v0 + return v1 +} + +; block0: +; vno %v24, %v24, %v24 +; br %r14 + +function %bnot_i32x4(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = bnot.i32x4 v0 + return v1 +} + +; block0: +; vno %v24, %v24, %v24 +; br %r14 + +function %bnot_i16x8(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = bnot.i16x8 v0 + return v1 +} + +; block0: +; vno %v24, %v24, %v24 +; br %r14 + +function %bnot_i8x16(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = bnot.i8x16 v0 + return v1 +} + +; block0: +; vno %v24, %v24, %v24 +; br %r14 + +function %bitselect_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bitselect.i64x2 v0, v1, v2 + return v3 +} + +; block0: +; vsel %v24, %v25, %v26, %v24 +; br %r14 + +function %bitselect_i32x4(i32x4, i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4, v2: i32x4): + v3 = bitselect.i32x4 v0, v1, v2 + return v3 +} + +; block0: +; vsel %v24, %v25, %v26, %v24 +; br %r14 + +function %bitselect_i16x8(i16x8, i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8, v2: i16x8): + v3 = bitselect.i16x8 v0, v1, v2 + return v3 +} + +; block0: +; vsel %v24, %v25, %v26, %v24 +; br %r14 + +function %bitselect_i8x16(i8x16, i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16, v2: i8x16): + v3 = bitselect.i8x16 v0, v1, v2 + return v3 +} + +; block0: +; vsel %v24, %v25, %v26, %v24 +; br %r14 + +function %vselect_i64x2(b64x2, i64x2, i64x2) -> i64x2 { +block0(v0: b64x2, v1: i64x2, v2: i64x2): + v3 = vselect.i64x2 v0, v1, v2 + return v3 +} + +; block0: +; vsel %v24, %v25, %v26, %v24 +; br %r14 + +function %vselect_i32x4(b32x4, i32x4, i32x4) -> i32x4 { +block0(v0: b32x4, v1: i32x4, v2: i32x4): + v3 = vselect.i32x4 v0, v1, v2 + return v3 +} + +; block0: +; vsel %v24, %v25, %v26, %v24 +; br %r14 + +function %vselect_i16x8(b16x8, i16x8, i16x8) -> i16x8 { +block0(v0: b16x8, v1: i16x8, v2: i16x8): + v3 = vselect.i16x8 v0, v1, v2 + return v3 +} + +; block0: +; vsel %v24, %v25, %v26, %v24 +; br %r14 + +function %vselect_i8x16(b8x16, 
i8x16, i8x16) -> i8x16 { +block0(v0: b8x16, v1: i8x16, v2: i8x16): + v3 = vselect.i8x16 v0, v1, v2 + return v3 +} + +; block0: +; vsel %v24, %v25, %v26, %v24 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-constants.clif b/cranelift/filetests/filetests/isa/s390x/vec-constants.clif new file mode 100644 index 000000000000..b5a6969f2b3e --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-constants.clif @@ -0,0 +1,213 @@ +test compile precise-output +target s390x + +function %vconst_i64x2_zero() -> i64x2 { +block0: + v1 = vconst.i64x2 [0 0] + return v1 +} + +; block0: +; vgbm %v24, 0 +; br %r14 + +function %vconst_i64x2_splat1() -> i64x2 { +block0: + v1 = vconst.i64x2 [32767 32767] + return v1 +} + +; block0: +; vrepig %v24, 32767 +; br %r14 + +function %vconst_i64x2_splat2() -> i64x2 { +block0: + v1 = vconst.i64x2 [-32768 -32768] + return v1 +} + +; block0: +; vrepig %v24, -32768 +; br %r14 + +function %vconst_i64x2_splat3() -> i64x2 { +block0: + v1 = vconst.i64x2 [32768 32768] + return v1 +} + +; block0: +; bras %r1, 12 ; data.u64 0x0000000000008000 ; vlrepg %v24, 0(%r1) +; br %r14 + +function %vconst_i64x2_splat4() -> i64x2 { +block0: + v1 = vconst.i64x2 [-32769 -32769] + return v1 +} + +; block0: +; bras %r1, 12 ; data.u64 0xffffffffffff7fff ; vlrepg %v24, 0(%r1) +; br %r14 + +function %vconst_i64x2_mixed() -> i64x2 { +block0: + v1 = vconst.i64x2 [1 2] + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x00000000000000020000000000000001 ; vl %v24, 0(%r1) +; br %r14 + +function %vconst_i32x4_zero() -> i32x4 { +block0: + v1 = vconst.i32x4 [0 0 0 0] + return v1 +} + +; block0: +; vgbm %v24, 0 +; br %r14 + +function %vconst_i32x4_splat1() -> i32x4 { +block0: + v1 = vconst.i32x4 [32767 32767 32767 32767] + return v1 +} + +; block0: +; vrepif %v24, 32767 +; br %r14 + +function %vconst_i32x4_splat2() -> i32x4 { +block0: + v1 = vconst.i32x4 [-32768 -32768 -32768 -32768] + return v1 +} + +; block0: +; vrepif %v24, -32768 +; br %r14 + +function %vconst_i32x4_splat3() -> i32x4 { +block0: + v1 = vconst.i32x4 [32768 32768 32768 32768] + return v1 +} + +; block0: +; bras %r1, 8 ; data.u32 0x00008000 ; vlrepf %v24, 0(%r1) +; br %r14 + +function %vconst_i32x4_splat4() -> i32x4 { +block0: + v1 = vconst.i32x4 [-32769 -32769 -32769 -32769] + return v1 +} + +; block0: +; bras %r1, 8 ; data.u32 0xffff7fff ; vlrepf %v24, 0(%r1) +; br %r14 + +function %vconst_i32x4_splat_i64() -> i32x4 { +block0: + v1 = vconst.i32x4 [1 2 1 2] + return v1 +} + +; block0: +; bras %r1, 12 ; data.u64 0x0000000200000001 ; vlrepg %v24, 0(%r1) +; br %r14 + +function %vconst_i32x4_mixed() -> i32x4 { +block0: + v1 = vconst.i32x4 [1 2 3 4] + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x00000004000000030000000200000001 ; vl %v24, 0(%r1) +; br %r14 + +function %vconst_i16x8_zero() -> i16x8 { +block0: + v1 = vconst.i16x8 [0 0 0 0 0 0 0 0] + return v1 +} + +; block0: +; vgbm %v24, 0 +; br %r14 + +function %vconst_i16x8_splat1() -> i16x8 { +block0: + v1 = vconst.i16x8 [32767 32767 32767 32767 32767 32767 32767 32767] + return v1 +} + +; block0: +; vrepih %v24, 32767 +; br %r14 + +function %vconst_i16x8_splat2() -> i16x8 { +block0: + v1 = vconst.i16x8 [-32768 -32768 -32768 -32768 -32768 -32768 -32768 -32768] + return v1 +} + +; block0: +; vrepih %v24, -32768 +; br %r14 + +function %vconst_i16x8_mixed() -> i16x8 { +block0: + v1 = vconst.i16x8 [1 2 3 4 5 6 7 8] + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x00080007000600050004000300020001 ; vl %v24, 0(%r1) +; br %r14 + +function 
%vconst_i8x16_zero() -> i8x16 { +block0: + v1 = vconst.i8x16 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + return v1 +} + +; block0: +; vgbm %v24, 0 +; br %r14 + +function %vconst_i8x16_splat1() -> i8x16 { +block0: + v1 = vconst.i8x16 [127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127] + return v1 +} + +; block0: +; vrepib %v24, 127 +; br %r14 + +function %vconst_i8x16_splat2() -> i8x16 { +block0: + v1 = vconst.i8x16 [-128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128] + return v1 +} + +; block0: +; vrepib %v24, 128 +; br %r14 + +function %vconst_i8x16_mixed() -> i8x16 { +block0: + v1 = vconst.i8x16 [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x100f0e0d0c0b0a090807060504030201 ; vl %v24, 0(%r1) +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-conversions.clif b/cranelift/filetests/filetests/isa/s390x/vec-conversions.clif new file mode 100644 index 000000000000..b137c8cd214b --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-conversions.clif @@ -0,0 +1,222 @@ +test compile precise-output +target s390x + +function %snarrow_i64x2_i32x4(i64x2, i64x2) -> i32x4 { +block0(v0: i64x2, v1: i64x2): + v2 = snarrow.i64x2 v0, v1 + return v2 +} + +; block0: +; vpksg %v24, %v25, %v24 +; br %r14 + +function %snarrow_i32x4_i16x8(i32x4, i32x4) -> i16x8 { +block0(v0: i32x4, v1: i32x4): + v2 = snarrow.i32x4 v0, v1 + return v2 +} + +; block0: +; vpksf %v24, %v25, %v24 +; br %r14 + +function %snarrow_i16x8_i8x16(i16x8, i16x8) -> i8x16 { +block0(v0: i16x8, v1: i16x8): + v2 = snarrow.i16x8 v0, v1 + return v2 +} + +; block0: +; vpksh %v24, %v25, %v24 +; br %r14 + +function %unarrow_i64x2_i32x4(i64x2, i64x2) -> i32x4 { +block0(v0: i64x2, v1: i64x2): + v2 = unarrow.i64x2 v0, v1 + return v2 +} + +; block0: +; vgbm %v5, 0 +; vmxg %v7, %v25, %v5 +; vmxg %v17, %v24, %v5 +; vpklsg %v24, %v7, %v17 +; br %r14 + +function %unarrow_i32x4_i16x8(i32x4, i32x4) -> i16x8 { +block0(v0: i32x4, v1: i32x4): + v2 = unarrow.i32x4 v0, v1 + return v2 +} + +; block0: +; vgbm %v5, 0 +; vmxf %v7, %v25, %v5 +; vmxf %v17, %v24, %v5 +; vpklsf %v24, %v7, %v17 +; br %r14 + +function %unarrow_i16x8_i8x16(i16x8, i16x8) -> i8x16 { +block0(v0: i16x8, v1: i16x8): + v2 = unarrow.i16x8 v0, v1 + return v2 +} + +; block0: +; vgbm %v5, 0 +; vmxh %v7, %v25, %v5 +; vmxh %v17, %v24, %v5 +; vpklsh %v24, %v7, %v17 +; br %r14 + +function %uunarrow_i64x2_i32x4(i64x2, i64x2) -> i32x4 { +block0(v0: i64x2, v1: i64x2): + v2 = uunarrow.i64x2 v0, v1 + return v2 +} + +; block0: +; vpklsg %v24, %v25, %v24 +; br %r14 + +function %uunarrow_i32x4_i16x8(i32x4, i32x4) -> i16x8 { +block0(v0: i32x4, v1: i32x4): + v2 = uunarrow.i32x4 v0, v1 + return v2 +} + +; block0: +; vpklsf %v24, %v25, %v24 +; br %r14 + +function %uunarrow_i16x8_i8x16(i16x8, i16x8) -> i8x16 { +block0(v0: i16x8, v1: i16x8): + v2 = uunarrow.i16x8 v0, v1 + return v2 +} + +; block0: +; vpklsh %v24, %v25, %v24 +; br %r14 + +function %swiden_low_i32x4_i64x2(i32x4) -> i64x2 { +block0(v0: i32x4): + v1 = swiden_low.i32x4 v0 + return v1 +} + +; block0: +; vuplf %v24, %v24 +; br %r14 + +function %swiden_low_i16x8_i32x4(i16x8) -> i32x4 { +block0(v0: i16x8): + v1 = swiden_low.i16x8 v0 + return v1 +} + +; block0: +; vuplh %v24, %v24 +; br %r14 + +function %swiden_low_i8x16_i16x8(i8x16) -> i16x8 { +block0(v0: i8x16): + v1 = swiden_low.i8x16 v0 + return v1 +} + +; block0: +; vuplb %v24, %v24 +; br %r14 + +function %swiden_high_i32x4_i64x2(i32x4) -> i64x2 { +block0(v0: i32x4): + v1 = swiden_high.i32x4 v0 + 
return v1 +} + +; block0: +; vuphf %v24, %v24 +; br %r14 + +function %swiden_high_i16x8_i32x4(i16x8) -> i32x4 { +block0(v0: i16x8): + v1 = swiden_high.i16x8 v0 + return v1 +} + +; block0: +; vuphh %v24, %v24 +; br %r14 + +function %swiden_high_i8x16_i16x8(i8x16) -> i16x8 { +block0(v0: i8x16): + v1 = swiden_high.i8x16 v0 + return v1 +} + +; block0: +; vuphb %v24, %v24 +; br %r14 + +function %uwiden_low_i32x4_i64x2(i32x4) -> i64x2 { +block0(v0: i32x4): + v1 = uwiden_low.i32x4 v0 + return v1 +} + +; block0: +; vupllf %v24, %v24 +; br %r14 + +function %uwiden_low_i16x8_i32x4(i16x8) -> i32x4 { +block0(v0: i16x8): + v1 = uwiden_low.i16x8 v0 + return v1 +} + +; block0: +; vupllh %v24, %v24 +; br %r14 + +function %uwiden_low_i8x16_i16x8(i8x16) -> i16x8 { +block0(v0: i8x16): + v1 = uwiden_low.i8x16 v0 + return v1 +} + +; block0: +; vupllb %v24, %v24 +; br %r14 + +function %uwiden_high_i32x4_i64x2(i32x4) -> i64x2 { +block0(v0: i32x4): + v1 = uwiden_high.i32x4 v0 + return v1 +} + +; block0: +; vuplhf %v24, %v24 +; br %r14 + +function %uwiden_high_i16x8_i32x4(i16x8) -> i32x4 { +block0(v0: i16x8): + v1 = uwiden_high.i16x8 v0 + return v1 +} + +; block0: +; vuplhh %v24, %v24 +; br %r14 + +function %uwiden_high_i8x16_i16x8(i8x16) -> i16x8 { +block0(v0: i8x16): + v1 = uwiden_high.i8x16 v0 + return v1 +} + +; block0: +; vuplhb %v24, %v24 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-fcmp.clif b/cranelift/filetests/filetests/isa/s390x/vec-fcmp.clif new file mode 100644 index 000000000000..32aeab3bd15d --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-fcmp.clif @@ -0,0 +1,309 @@ +test compile precise-output +target s390x + +function %fcmp_eq_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 eq v0, v1 + return v2 +} + +; block0: +; vfcedb %v24, %v24, %v25 +; br %r14 + +function %fcmp_ne_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 ne v0, v1 + return v2 +} + +; block0: +; vfcedb %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %fcmp_gt_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 gt v0, v1 + return v2 +} + +; block0: +; vfchdb %v24, %v24, %v25 +; br %r14 + +function %fcmp_lt_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 lt v0, v1 + return v2 +} + +; block0: +; vfchdb %v24, %v25, %v24 +; br %r14 + +function %fcmp_ge_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 ge v0, v1 + return v2 +} + +; block0: +; vfchedb %v24, %v24, %v25 +; br %r14 + +function %fcmp_le_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 le v0, v1 + return v2 +} + +; block0: +; vfchedb %v24, %v25, %v24 +; br %r14 + +function %fcmp_ueq_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 ueq v0, v1 + return v2 +} + +; block0: +; vfchdb %v5, %v24, %v25 +; vfchdb %v7, %v25, %v24 +; vno %v24, %v5, %v7 +; br %r14 + +function %fcmp_one_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 one v0, v1 + return v2 +} + +; block0: +; vfchdb %v5, %v24, %v25 +; vfchdb %v7, %v25, %v24 +; vo %v24, %v5, %v7 +; br %r14 + +function %fcmp_ugt_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 ugt v0, v1 + return v2 +} + +; block0: +; vfchedb %v5, %v25, %v24 +; vno %v24, %v5, %v5 +; br %r14 + +function %fcmp_ult_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 ult v0, v1 + return v2 +} + +; block0: +; 
vfchedb %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %fcmp_uge_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 uge v0, v1 + return v2 +} + +; block0: +; vfchdb %v5, %v25, %v24 +; vno %v24, %v5, %v5 +; br %r14 + +function %fcmp_ule_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 ule v0, v1 + return v2 +} + +; block0: +; vfchdb %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %fcmp_ord_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 ord v0, v1 + return v2 +} + +; block0: +; vfchedb %v5, %v24, %v25 +; vfchedb %v7, %v25, %v24 +; vo %v24, %v5, %v7 +; br %r14 + +function %fcmp_uno_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 uno v0, v1 + return v2 +} + +; block0: +; vfchedb %v5, %v24, %v25 +; vfchedb %v7, %v25, %v24 +; vno %v24, %v5, %v7 +; br %r14 + +function %fcmp_eq_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcmp.f32x4 eq v0, v1 + return v2 +} + +; block0: +; vfcesb %v24, %v24, %v25 +; br %r14 + +function %fcmp_ne_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcmp.f32x4 ne v0, v1 + return v2 +} + +; block0: +; vfcesb %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %fcmp_gt_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcmp.f32x4 gt v0, v1 + return v2 +} + +; block0: +; vfchsb %v24, %v24, %v25 +; br %r14 + +function %fcmp_lt_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcmp.f32x4 lt v0, v1 + return v2 +} + +; block0: +; vfchsb %v24, %v25, %v24 +; br %r14 + +function %fcmp_ge_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcmp.f32x4 ge v0, v1 + return v2 +} + +; block0: +; vfchesb %v24, %v24, %v25 +; br %r14 + +function %fcmp_le_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcmp.f32x4 le v0, v1 + return v2 +} + +; block0: +; vfchesb %v24, %v25, %v24 +; br %r14 + +function %fcmp_ueq_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcmp.f32x4 ueq v0, v1 + return v2 +} + +; block0: +; vfchsb %v5, %v24, %v25 +; vfchsb %v7, %v25, %v24 +; vno %v24, %v5, %v7 +; br %r14 + +function %fcmp_one_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcmp.f32x4 one v0, v1 + return v2 +} + +; block0: +; vfchsb %v5, %v24, %v25 +; vfchsb %v7, %v25, %v24 +; vo %v24, %v5, %v7 +; br %r14 + +function %fcmp_ugt_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcmp.f32x4 ugt v0, v1 + return v2 +} + +; block0: +; vfchesb %v5, %v25, %v24 +; vno %v24, %v5, %v5 +; br %r14 + +function %fcmp_ult_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcmp.f32x4 ult v0, v1 + return v2 +} + +; block0: +; vfchesb %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %fcmp_uge_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcmp.f32x4 uge v0, v1 + return v2 +} + +; block0: +; vfchsb %v5, %v25, %v24 +; vno %v24, %v5, %v5 +; br %r14 + +function %fcmp_ule_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcmp.f32x4 ule v0, v1 + return v2 +} + +; block0: +; vfchsb %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %fcmp_ord_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcmp.f32x4 ord v0, v1 + return v2 +} + +; block0: +; vfchesb %v5, %v24, %v25 +; vfchesb %v7, %v25, %v24 +; vo %v24, %v5, %v7 +; br %r14 + +function %fcmp_uno_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: 
f32x4): + v2 = fcmp.f32x4 uno v0, v1 + return v2 +} + +; block0: +; vfchesb %v5, %v24, %v25 +; vfchesb %v7, %v25, %v24 +; vno %v24, %v5, %v7 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-fp-arch13.clif b/cranelift/filetests/filetests/isa/s390x/vec-fp-arch13.clif new file mode 100644 index 000000000000..4c00c348d458 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-fp-arch13.clif @@ -0,0 +1,90 @@ +test compile precise-output +target s390x arch13 + +function %fcvt_from_uint_i32x4_f32x4(i32x4) -> f32x4 { +block0(v0: i32x4): + v1 = fcvt_from_uint.f32x4 v0 + return v1 +} + +; block0: +; vcelfb %v24, %v24, 0, 4 +; br %r14 + +function %fcvt_from_sint_i32x4_f32x4(i32x4) -> f32x4 { +block0(v0: i32x4): + v1 = fcvt_from_sint.f32x4 v0 + return v1 +} + +; block0: +; vcefb %v24, %v24, 0, 4 +; br %r14 + +function %fcvt_from_uint_i64x2_f64x2(i64x2) -> f64x2 { +block0(v0: i64x2): + v1 = fcvt_from_uint.f64x2 v0 + return v1 +} + +; block0: +; vcdlgb %v24, %v24, 0, 4 +; br %r14 + +function %fcvt_from_sint_i64x2_f64x2(i64x2) -> f64x2 { +block0(v0: i64x2): + v1 = fcvt_from_sint.f64x2 v0 + return v1 +} + +; block0: +; vcdgb %v24, %v24, 0, 4 +; br %r14 + + +function %fcvt_to_uint_sat_f32x4_i32x4(f32x4) -> i32x4 { +block0(v0: f32x4): + v1 = fcvt_to_uint_sat.i32x4 v0 + return v1 +} + +; block0: +; vclfeb %v24, %v24, 0, 5 +; br %r14 + +function %fcvt_to_sint_sat_f32x4_i32x4(f32x4) -> i32x4 { +block0(v0: f32x4): + v1 = fcvt_to_sint_sat.i32x4 v0 + return v1 +} + +; block0: +; vcfeb %v3, %v24, 0, 5 +; vgbm %v5, 0 +; vfcesb %v7, %v24, %v24 +; vsel %v24, %v3, %v5, %v7 +; br %r14 + +function %fcvt_to_uint_sat_f64x2_i64x2(f64x2) -> i64x2 { +block0(v0: f64x2): + v1 = fcvt_to_uint_sat.i64x2 v0 + return v1 +} + +; block0: +; vclgdb %v24, %v24, 0, 5 +; br %r14 + +function %fcvt_to_sint_sat_f64x2_i64x2(f64x2) -> i64x2 { +block0(v0: f64x2): + v1 = fcvt_to_sint_sat.i64x2 v0 + return v1 +} + +; block0: +; vcgdb %v3, %v24, 0, 5 +; vgbm %v5, 0 +; vfcedb %v7, %v24, %v24 +; vsel %v24, %v3, %v5, %v7 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-fp.clif b/cranelift/filetests/filetests/isa/s390x/vec-fp.clif new file mode 100644 index 000000000000..fc356d57a762 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-fp.clif @@ -0,0 +1,533 @@ +test compile precise-output +target s390x + +function %vconst_f32x4_zero() -> f32x4 { +block0: + v1 = vconst.f32x4 [0x0.0 0x0.0 0x0.0 0x0.0] + return v1 +} + +; block0: +; vgbm %v24, 0 +; br %r14 + +function %vconst_f64x2_zero() -> f64x2 { +block0: + v1 = vconst.f64x2 [0x0.0 0x0.0] + return v1 +} + +; block0: +; vgbm %v24, 0 +; br %r14 + +function %vconst_f32x4_mixed() -> f32x4 { +block0: + v1 = vconst.f32x4 [0x1.0 0x2.0 0x3.0 0x4.0] + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x4080000040400000400000003f800000 ; vl %v24, 0(%r1) +; br %r14 + +function %vconst_f64x2_mixed() -> f64x2 { +block0: + v1 = vconst.f64x2 [0x1.0 0x2.0] + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x40000000000000003ff0000000000000 ; vl %v24, 0(%r1) +; br %r14 + +function %fadd_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fadd v0, v1 + return v2 +} + +; block0: +; vfasb %v24, %v24, %v25 +; br %r14 + +function %fadd_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fadd v0, v1 + return v2 +} + +; block0: +; vfadb %v24, %v24, %v25 +; br %r14 + +function %fsub_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fsub v0, v1 + return v2 +} + +; block0: +; vfssb %v24, %v24, %v25 +; 
br %r14 + +function %fsub_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fsub v0, v1 + return v2 +} + +; block0: +; vfsdb %v24, %v24, %v25 +; br %r14 + +function %fmul_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fmul v0, v1 + return v2 +} + +; block0: +; vfmsb %v24, %v24, %v25 +; br %r14 + +function %fmul_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fmul v0, v1 + return v2 +} + +; block0: +; vfmdb %v24, %v24, %v25 +; br %r14 + +function %fdiv_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fdiv v0, v1 + return v2 +} + +; block0: +; vfdsb %v24, %v24, %v25 +; br %r14 + +function %fdiv_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fdiv v0, v1 + return v2 +} + +; block0: +; vfddb %v24, %v24, %v25 +; br %r14 + +function %fmin_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fmin v0, v1 + return v2 +} + +; block0: +; vfminsb %v24, %v24, %v25, 1 +; br %r14 + +function %fmin_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fmin v0, v1 + return v2 +} + +; block0: +; vfmindb %v24, %v24, %v25, 1 +; br %r14 + +function %fmax_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fmax v0, v1 + return v2 +} + +; block0: +; vfmaxsb %v24, %v24, %v25, 1 +; br %r14 + +function %fmax_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fmax v0, v1 + return v2 +} + +; block0: +; vfmaxdb %v24, %v24, %v25, 1 +; br %r14 + +function %fmin_pseudo_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fmin_pseudo v0, v1 + return v2 +} + +; block0: +; vfminsb %v24, %v24, %v25, 3 +; br %r14 + +function %fmin_pseudo_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fmin_pseudo v0, v1 + return v2 +} + +; block0: +; vfmindb %v24, %v24, %v25, 3 +; br %r14 + +function %fmax_pseudo_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fmax_pseudo v0, v1 + return v2 +} + +; block0: +; vfmaxsb %v24, %v24, %v25, 3 +; br %r14 + +function %fmax_pseudo_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fmax_pseudo v0, v1 + return v2 +} + +; block0: +; vfmaxdb %v24, %v24, %v25, 3 +; br %r14 + +function %sqrt_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = sqrt v0 + return v1 +} + +; block0: +; vfsqsb %v24, %v24 +; br %r14 + +function %sqrt_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = sqrt v0 + return v1 +} + +; block0: +; vfsqdb %v24, %v24 +; br %r14 + +function %fabs_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = fabs v0 + return v1 +} + +; block0: +; vflpsb %v24, %v24 +; br %r14 + +function %fabs_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = fabs v0 + return v1 +} + +; block0: +; vflpdb %v24, %v24 +; br %r14 + +function %fneg_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = fneg v0 + return v1 +} + +; block0: +; vflcsb %v24, %v24 +; br %r14 + +function %fneg_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = fneg v0 + return v1 +} + +; block0: +; vflcdb %v24, %v24 +; br %r14 + +function %fvpromote_low_f32x4(f32x4) -> f64x2 { +block0(v0: f32x4): + v1 = fvpromote_low v0 + return v1 +} + +; block0: +; vmrlf %v3, %v24, %v24 +; vldeb %v24, %v3 +; br %r14 + +function %fvdemote_f64x2(f64x2) -> f32x4 { +block0(v0: f64x2): + v1 = fvdemote v0 + return v1 +} + +; block0: +; vledb %v3, %v24, 0, 0 +; vgbm %v5, 0 +; bras %r1, 20 ; data.u128 0x10101010101010100001020308090a0b ; vl %v7, 0(%r1) +; vperm %v24, %v3, %v5, %v7 +; br %r14 + +function %ceil_f32x4(f32x4) -> f32x4 { 
+block0(v0: f32x4): + v1 = ceil v0 + return v1 +} + +; block0: +; vfisb %v24, %v24, 0, 6 +; br %r14 + +function %ceil_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = ceil v0 + return v1 +} + +; block0: +; vfidb %v24, %v24, 0, 6 +; br %r14 + +function %floor_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = floor v0 + return v1 +} + +; block0: +; vfisb %v24, %v24, 0, 7 +; br %r14 + +function %floor_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = floor v0 + return v1 +} + +; block0: +; vfidb %v24, %v24, 0, 7 +; br %r14 + +function %trunc_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = trunc v0 + return v1 +} + +; block0: +; vfisb %v24, %v24, 0, 5 +; br %r14 + +function %trunc_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = trunc v0 + return v1 +} + +; block0: +; vfidb %v24, %v24, 0, 5 +; br %r14 + +function %nearest_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = nearest v0 + return v1 +} + +; block0: +; vfisb %v24, %v24, 0, 4 +; br %r14 + +function %nearest_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = nearest v0 + return v1 +} + +; block0: +; vfidb %v24, %v24, 0, 4 +; br %r14 + +function %fma_f32x4(f32x4, f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4, v2: f32x4): + v3 = fma v0, v1, v2 + return v3 +} + +; block0: +; vfmasb %v24, %v24, %v25, %v26 +; br %r14 + +function %fma_f64x2(f64x2, f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2, v2: f64x2): + v3 = fma v0, v1, v2 + return v3 +} + +; block0: +; vfmadb %v24, %v24, %v25, %v26 +; br %r14 + +function %fcopysign_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcopysign v0, v1 + return v2 +} + +; block0: +; vgmf %v5, 1, 31 +; vsel %v24, %v24, %v25, %v5 +; br %r14 + +function %fcopysign_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcopysign v0, v1 + return v2 +} + +; block0: +; vgmg %v5, 1, 63 +; vsel %v24, %v24, %v25, %v5 +; br %r14 + +function %fcvt_from_uint_i32x4_f32x4(i32x4) -> f32x4 { +block0(v0: i32x4): + v1 = fcvt_from_uint.f32x4 v0 + return v1 +} + +; block0: +; vuplhf %v3, %v24 +; vcdlgb %v5, %v3, 0, 3 +; vledb %v7, %v5, 0, 4 +; vupllf %v17, %v24 +; vcdlgb %v19, %v17, 0, 3 +; vledb %v21, %v19, 0, 4 +; bras %r1, 20 ; data.u128 0x0001020308090a0b1011121318191a1b ; vl %v23, 0(%r1) +; vperm %v24, %v7, %v21, %v23 +; br %r14 + +function %fcvt_from_sint_i32x4_f32x4(i32x4) -> f32x4 { +block0(v0: i32x4): + v1 = fcvt_from_sint.f32x4 v0 + return v1 +} + +; block0: +; vuphf %v3, %v24 +; vcdgb %v5, %v3, 0, 3 +; vledb %v7, %v5, 0, 4 +; vuplf %v17, %v24 +; vcdgb %v19, %v17, 0, 3 +; vledb %v21, %v19, 0, 4 +; bras %r1, 20 ; data.u128 0x0001020308090a0b1011121318191a1b ; vl %v23, 0(%r1) +; vperm %v24, %v7, %v21, %v23 +; br %r14 + +function %fcvt_from_uint_i64x2_f64x2(i64x2) -> f64x2 { +block0(v0: i64x2): + v1 = fcvt_from_uint.f64x2 v0 + return v1 +} + +; block0: +; vcdlgb %v24, %v24, 0, 4 +; br %r14 + +function %fcvt_from_sint_i64x2_f64x2(i64x2) -> f64x2 { +block0(v0: i64x2): + v1 = fcvt_from_sint.f64x2 v0 + return v1 +} + +; block0: +; vcdgb %v24, %v24, 0, 4 +; br %r14 + + +function %fcvt_low_from_sint_i32x4_f64x2(i32x4) -> f64x2 { +block0(v0: i32x4): + v1 = fcvt_low_from_sint.f64x2 v0 + return v1 +} + +; block0: +; vuplf %v3, %v24 +; vcdgb %v24, %v3, 0, 4 +; br %r14 + +function %fcvt_to_uint_sat_f32x4_i32x4(f32x4) -> i32x4 { +block0(v0: f32x4): + v1 = fcvt_to_uint_sat.i32x4 v0 + return v1 +} + +; block0: +; vmrhf %v3, %v24, %v24 +; vldeb %v5, %v3 +; vclgdb %v7, %v5, 0, 5 +; vmrlf %v17, %v24, %v24 +; vldeb %v19, %v17 +; vclgdb %v21, %v19, 0, 5 +; vpklsg %v24, %v7, %v21 +; 
br %r14 + +function %fcvt_to_sint_sat_f32x4_i32x4(f32x4) -> i32x4 { +block0(v0: f32x4): + v1 = fcvt_to_sint_sat.i32x4 v0 + return v1 +} + +; block0: +; vmrhf %v3, %v24, %v24 +; vldeb %v5, %v3 +; vcgdb %v7, %v5, 0, 5 +; vmrlf %v17, %v24, %v24 +; vldeb %v19, %v17 +; vcgdb %v21, %v19, 0, 5 +; vpksg %v23, %v7, %v21 +; vgbm %v25, 0 +; vfcesb %v27, %v24, %v24 +; vsel %v24, %v23, %v25, %v27 +; br %r14 + +function %fcvt_to_uint_sat_f64x2_i64x2(f64x2) -> i64x2 { +block0(v0: f64x2): + v1 = fcvt_to_uint_sat.i64x2 v0 + return v1 +} + +; block0: +; vclgdb %v24, %v24, 0, 5 +; br %r14 + +function %fcvt_to_sint_sat_f64x2_i64x2(f64x2) -> i64x2 { +block0(v0: f64x2): + v1 = fcvt_to_sint_sat.i64x2 v0 + return v1 +} + +; block0: +; vcgdb %v3, %v24, 0, 5 +; vgbm %v5, 0 +; vfcedb %v7, %v24, %v24 +; vsel %v24, %v3, %v5, %v7 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-icmp.clif b/cranelift/filetests/filetests/isa/s390x/vec-icmp.clif new file mode 100644 index 000000000000..fe9e6fead830 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-icmp.clif @@ -0,0 +1,423 @@ +test compile precise-output +target s390x + +function %icmp_eq_i64x2(i64x2, i64x2) -> b64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp.i64x2 eq v0, v1 + return v2 +} + +; block0: +; vceqg %v24, %v24, %v25 +; br %r14 + +function %icmp_ne_i64x2(i64x2, i64x2) -> b64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp.i64x2 ne v0, v1 + return v2 +} + +; block0: +; vceqg %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_sgt_i64x2(i64x2, i64x2) -> b64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp.i64x2 sgt v0, v1 + return v2 +} + +; block0: +; vchg %v24, %v24, %v25 +; br %r14 + +function %icmp_slt_i64x2(i64x2, i64x2) -> b64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp.i64x2 slt v0, v1 + return v2 +} + +; block0: +; vchg %v24, %v25, %v24 +; br %r14 + +function %icmp_sge_i64x2(i64x2, i64x2) -> b64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp.i64x2 sge v0, v1 + return v2 +} + +; block0: +; vchg %v5, %v25, %v24 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_sle_i64x2(i64x2, i64x2) -> b64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp.i64x2 sle v0, v1 + return v2 +} + +; block0: +; vchg %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_ugt_i64x2(i64x2, i64x2) -> b64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp.i64x2 ugt v0, v1 + return v2 +} + +; block0: +; vchlg %v24, %v24, %v25 +; br %r14 + +function %icmp_ult_i64x2(i64x2, i64x2) -> b64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp.i64x2 ult v0, v1 + return v2 +} + +; block0: +; vchlg %v24, %v25, %v24 +; br %r14 + +function %icmp_uge_i64x2(i64x2, i64x2) -> b64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp.i64x2 uge v0, v1 + return v2 +} + +; block0: +; vchlg %v5, %v25, %v24 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_ule_i64x2(i64x2, i64x2) -> b64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp.i64x2 ule v0, v1 + return v2 +} + +; block0: +; vchlg %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_eq_i32x4(i32x4, i32x4) -> b32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = icmp.i32x4 eq v0, v1 + return v2 +} + +; block0: +; vceqf %v24, %v24, %v25 +; br %r14 + +function %icmp_ne_i32x4(i32x4, i32x4) -> b32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = icmp.i32x4 ne v0, v1 + return v2 +} + +; block0: +; vceqf %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_sgt_i32x4(i32x4, i32x4) -> b32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = icmp.i32x4 sgt v0, v1 + return v2 +} + +; block0: +; vchf %v24, %v24, %v25 
+; br %r14 + +function %icmp_slt_i32x4(i32x4, i32x4) -> b32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = icmp.i32x4 slt v0, v1 + return v2 +} + +; block0: +; vchf %v24, %v25, %v24 +; br %r14 + +function %icmp_sge_i32x4(i32x4, i32x4) -> b32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = icmp.i32x4 sge v0, v1 + return v2 +} + +; block0: +; vchf %v5, %v25, %v24 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_sle_i32x4(i32x4, i32x4) -> b32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = icmp.i32x4 sle v0, v1 + return v2 +} + +; block0: +; vchf %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_ugt_i32x4(i32x4, i32x4) -> b32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = icmp.i32x4 ugt v0, v1 + return v2 +} + +; block0: +; vchlf %v24, %v24, %v25 +; br %r14 + +function %icmp_ult_i32x4(i32x4, i32x4) -> b32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = icmp.i32x4 ult v0, v1 + return v2 +} + +; block0: +; vchlf %v24, %v25, %v24 +; br %r14 + +function %icmp_uge_i32x4(i32x4, i32x4) -> b32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = icmp.i32x4 uge v0, v1 + return v2 +} + +; block0: +; vchlf %v5, %v25, %v24 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_ule_i32x4(i32x4, i32x4) -> b32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = icmp.i32x4 ule v0, v1 + return v2 +} + +; block0: +; vchlf %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_eq_i16x8(i16x8, i16x8) -> b16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = icmp.i16x8 eq v0, v1 + return v2 +} + +; block0: +; vceqh %v24, %v24, %v25 +; br %r14 + +function %icmp_ne_i16x8(i16x8, i16x8) -> b16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = icmp.i16x8 ne v0, v1 + return v2 +} + +; block0: +; vceqh %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_sgt_i16x8(i16x8, i16x8) -> b16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = icmp.i16x8 sgt v0, v1 + return v2 +} + +; block0: +; vchh %v24, %v24, %v25 +; br %r14 + +function %icmp_slt_i16x8(i16x8, i16x8) -> b16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = icmp.i16x8 slt v0, v1 + return v2 +} + +; block0: +; vchh %v24, %v25, %v24 +; br %r14 + +function %icmp_sge_i16x8(i16x8, i16x8) -> b16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = icmp.i16x8 sge v0, v1 + return v2 +} + +; block0: +; vchh %v5, %v25, %v24 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_sle_i16x8(i16x8, i16x8) -> b16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = icmp.i16x8 sle v0, v1 + return v2 +} + +; block0: +; vchh %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_ugt_i16x8(i16x8, i16x8) -> b16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = icmp.i16x8 ugt v0, v1 + return v2 +} + +; block0: +; vchlh %v24, %v24, %v25 +; br %r14 + +function %icmp_ult_i16x8(i16x8, i16x8) -> b16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = icmp.i16x8 ult v0, v1 + return v2 +} + +; block0: +; vchlh %v24, %v25, %v24 +; br %r14 + +function %icmp_uge_i16x8(i16x8, i16x8) -> b16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = icmp.i16x8 uge v0, v1 + return v2 +} + +; block0: +; vchlh %v5, %v25, %v24 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_ule_i16x8(i16x8, i16x8) -> b16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = icmp.i16x8 ule v0, v1 + return v2 +} + +; block0: +; vchlh %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_eq_i8x16(i8x16, i8x16) -> b8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = icmp.i8x16 eq v0, v1 + return v2 +} + +; block0: +; vceqb %v24, %v24, %v25 +; br %r14 + +function %icmp_ne_i8x16(i8x16, i8x16) -> b8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = icmp.i8x16 ne v0, v1 + return v2 +} + +; block0: +; vceqb %v5, %v24, %v25 
+; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_sgt_i8x16(i8x16, i8x16) -> b8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = icmp.i8x16 sgt v0, v1 + return v2 +} + +; block0: +; vchb %v24, %v24, %v25 +; br %r14 + +function %icmp_slt_i8x16(i8x16, i8x16) -> b8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = icmp.i8x16 slt v0, v1 + return v2 +} + +; block0: +; vchb %v24, %v25, %v24 +; br %r14 + +function %icmp_sge_i8x16(i8x16, i8x16) -> b8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = icmp.i8x16 sge v0, v1 + return v2 +} + +; block0: +; vchb %v5, %v25, %v24 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_sle_i8x16(i8x16, i8x16) -> b8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = icmp.i8x16 sle v0, v1 + return v2 +} + +; block0: +; vchb %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_ugt_i8x16(i8x16, i8x16) -> b8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = icmp.i8x16 ugt v0, v1 + return v2 +} + +; block0: +; vchlb %v24, %v24, %v25 +; br %r14 + +function %icmp_ult_i8x16(i8x16, i8x16) -> b8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = icmp.i8x16 ult v0, v1 + return v2 +} + +; block0: +; vchlb %v24, %v25, %v24 +; br %r14 + +function %icmp_uge_i8x16(i8x16, i8x16) -> b8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = icmp.i8x16 uge v0, v1 + return v2 +} + +; block0: +; vchlb %v5, %v25, %v24 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_ule_i8x16(i8x16, i8x16) -> b8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = icmp.i8x16 ule v0, v1 + return v2 +} + +; block0: +; vchlb %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-lane-arch13.clif b/cranelift/filetests/filetests/isa/s390x/vec-lane-arch13.clif new file mode 100644 index 000000000000..5ee1ef906fa6 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-lane-arch13.clif @@ -0,0 +1,807 @@ +test compile precise-output +target s390x arch13 + +function %insertlane_i64x2_mem_0(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = load.i64 v1 + v3 = insertlane.i64x2 v0, v2, 0 + return v3 +} + +; block0: +; vleg %v24, 0(%r2), 1 +; br %r14 + +function %insertlane_i64x2_mem_1(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = load.i64 v1 + v3 = insertlane.i64x2 v0, v2, 1 + return v3 +} + +; block0: +; vleg %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i64x2_mem_little_0(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = load.i64 little v1 + v3 = insertlane.i64x2 v0, v2, 0 + return v3 +} + +; block0: +; vlebrg %v24, 0(%r2), 1 +; br %r14 + +function %insertlane_i64x2_mem_little_1(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = load.i64 little v1 + v3 = insertlane.i64x2 v0, v2, 1 + return v3 +} + +; block0: +; vlebrg %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i32x4_mem_0(i32x4, i64) -> i32x4 { +block0(v0: i32x4, v1: i64): + v2 = load.i32 v1 + v3 = insertlane.i32x4 v0, v2, 0 + return v3 +} + +; block0: +; vlef %v24, 0(%r2), 3 +; br %r14 + +function %insertlane_i32x4_mem_3(i32x4, i64) -> i32x4 { +block0(v0: i32x4, v1: i64): + v2 = load.i32 v1 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; vlef %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i32x4_mem_little_0(i32x4, i64) -> i32x4 { +block0(v0: i32x4, v1: i64): + v2 = load.i32 little v1 + v3 = insertlane.i32x4 v0, v2, 0 + return v3 +} + +; block0: +; vlebrf %v24, 0(%r2), 3 +; br %r14 + +function %insertlane_i32x4_mem_little_3(i32x4, i64) -> i32x4 { +block0(v0: i32x4, v1: i64): + v2 = load.i32 little v1 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; vlebrf %v24, 
0(%r2), 0 +; br %r14 + +function %insertlane_i16x8_mem_0(i16x8, i64) -> i16x8 { +block0(v0: i16x8, v1: i64): + v2 = load.i16 v1 + v3 = insertlane.i16x8 v0, v2, 0 + return v3 +} + +; block0: +; vleh %v24, 0(%r2), 7 +; br %r14 + +function %insertlane_i16x8_mem_7(i16x8, i64) -> i16x8 { +block0(v0: i16x8, v1: i64): + v2 = load.i16 v1 + v3 = insertlane.i16x8 v0, v2, 7 + return v3 +} + +; block0: +; vleh %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i16x8_mem_little_0(i16x8, i64) -> i16x8 { +block0(v0: i16x8, v1: i64): + v2 = load.i16 little v1 + v3 = insertlane.i16x8 v0, v2, 0 + return v3 +} + +; block0: +; vlebrh %v24, 0(%r2), 7 +; br %r14 + +function %insertlane_i16x8_mem_little_7(i16x8, i64) -> i16x8 { +block0(v0: i16x8, v1: i64): + v2 = load.i16 little v1 + v3 = insertlane.i16x8 v0, v2, 7 + return v3 +} + +; block0: +; vlebrh %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i8x16_mem_0(i8x16, i64) -> i8x16 { +block0(v0: i8x16, v1: i64): + v2 = load.i8 v1 + v3 = insertlane.i8x16 v0, v2, 0 + return v3 +} + +; block0: +; vleb %v24, 0(%r2), 15 +; br %r14 + +function %insertlane_i8x16_mem_15(i8x16, i64) -> i8x16 { +block0(v0: i8x16, v1: i64): + v2 = load.i8 v1 + v3 = insertlane.i8x16 v0, v2, 15 + return v3 +} + +; block0: +; vleb %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i8x16_mem_little_0(i8x16, i64) -> i8x16 { +block0(v0: i8x16, v1: i64): + v2 = load.i8 little v1 + v3 = insertlane.i8x16 v0, v2, 0 + return v3 +} + +; block0: +; vleb %v24, 0(%r2), 15 +; br %r14 + +function %insertlane_i8x16_mem_little_15(i8x16, i64) -> i8x16 { +block0(v0: i8x16, v1: i64): + v2 = load.i8 little v1 + v3 = insertlane.i8x16 v0, v2, 15 + return v3 +} + +; block0: +; vleb %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_f64x2_mem_0(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = load.f64 v1 + v3 = insertlane.f64x2 v0, v2, 0 + return v3 +} + +; block0: +; vleg %v24, 0(%r2), 1 +; br %r14 + +function %insertlane_f64x2_mem_1(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = load.f64 v1 + v3 = insertlane.f64x2 v0, v2, 1 + return v3 +} + +; block0: +; vleg %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_f64x2_mem_little_0(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = load.f64 little v1 + v3 = insertlane.f64x2 v0, v2, 0 + return v3 +} + +; block0: +; vlebrg %v24, 0(%r2), 1 +; br %r14 + +function %insertlane_f64x2_mem_little_1(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = load.f64 little v1 + v3 = insertlane.f64x2 v0, v2, 1 + return v3 +} + +; block0: +; vlebrg %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_f32x4_mem_0(f32x4, i64) -> f32x4 { +block0(v0: f32x4, v1: i64): + v2 = load.f32 v1 + v3 = insertlane.f32x4 v0, v2, 0 + return v3 +} + +; block0: +; vlef %v24, 0(%r2), 3 +; br %r14 + +function %insertlane_i32x4_mem_3(i32x4, i64) -> i32x4 { +block0(v0: i32x4, v1: i64): + v2 = load.i32 v1 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; vlef %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_f32x4_mem_little_0(f32x4, i64) -> f32x4 { +block0(v0: f32x4, v1: i64): + v2 = load.f32 little v1 + v3 = insertlane.f32x4 v0, v2, 0 + return v3 +} + +; block0: +; vlebrf %v24, 0(%r2), 3 +; br %r14 + +function %insertlane_i32x4_mem_little_3(i32x4, i64) -> i32x4 { +block0(v0: i32x4, v1: i64): + v2 = load.i32 little v1 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; vlebrf %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i64x2_mem_0(i64x2, i64) { +block0(v0: i64x2, v1: i64): + v2 = extractlane.i64x2 v0, 0 + store v2, v1 + return +} + +; 
block0: +; vsteg %v24, 0(%r2), 1 +; br %r14 + +function %extractlane_i64x2_mem_1(i64x2, i64) { +block0(v0: i64x2, v1: i64): + v2 = extractlane.i64x2 v0, 1 + store v2, v1 + return +} + +; block0: +; vsteg %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i64x2_mem_little_0(i64x2, i64) { +block0(v0: i64x2, v1: i64): + v2 = extractlane.i64x2 v0, 0 + store little v2, v1 + return +} + +; block0: +; vstebrg %v24, 0(%r2), 1 +; br %r14 + +function %extractlane_i64x2_mem_little_1(i64x2, i64) { +block0(v0: i64x2, v1: i64): + v2 = extractlane.i64x2 v0, 1 + store little v2, v1 + return +} + +; block0: +; vstebrg %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i32x4_mem_0(i32x4, i64) { +block0(v0: i32x4, v1: i64): + v2 = extractlane.i32x4 v0, 0 + store v2, v1 + return +} + +; block0: +; vstef %v24, 0(%r2), 3 +; br %r14 + +function %extractlane_i32x4_mem_3(i32x4, i64) { +block0(v0: i32x4, v1: i64): + v2 = extractlane.i32x4 v0, 3 + store v2, v1 + return +} + +; block0: +; vstef %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i32x4_mem_little_0(i32x4, i64) { +block0(v0: i32x4, v1: i64): + v2 = extractlane.i32x4 v0, 0 + store little v2, v1 + return +} + +; block0: +; vstebrf %v24, 0(%r2), 3 +; br %r14 + +function %extractlane_i32x4_mem_little_3(i32x4, i64) { +block0(v0: i32x4, v1: i64): + v2 = extractlane.i32x4 v0, 3 + store little v2, v1 + return +} + +; block0: +; vstebrf %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i16x8_mem_0(i16x8, i64) { +block0(v0: i16x8, v1: i64): + v2 = extractlane.i16x8 v0, 0 + store v2, v1 + return +} + +; block0: +; vsteh %v24, 0(%r2), 7 +; br %r14 + +function %extractlane_i16x8_mem_7(i16x8, i64) { +block0(v0: i16x8, v1: i64): + v2 = extractlane.i16x8 v0, 7 + store v2, v1 + return +} + +; block0: +; vsteh %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i16x8_mem_little_0(i16x8, i64) { +block0(v0: i16x8, v1: i64): + v2 = extractlane.i16x8 v0, 0 + store little v2, v1 + return +} + +; block0: +; vstebrh %v24, 0(%r2), 7 +; br %r14 + +function %extractlane_i16x8_mem_little_7(i16x8, i64) { +block0(v0: i16x8, v1: i64): + v2 = extractlane.i16x8 v0, 7 + store little v2, v1 + return +} + +; block0: +; vstebrh %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i8x16_mem_0(i8x16, i64) { +block0(v0: i8x16, v1: i64): + v2 = extractlane.i8x16 v0, 0 + store v2, v1 + return +} + +; block0: +; vsteb %v24, 0(%r2), 15 +; br %r14 + +function %extractlane_i8x16_mem_15(i8x16, i64) { +block0(v0: i8x16, v1: i64): + v2 = extractlane.i8x16 v0, 15 + store v2, v1 + return +} + +; block0: +; vsteb %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i8x16_mem_little_0(i8x16, i64) { +block0(v0: i8x16, v1: i64): + v2 = extractlane.i8x16 v0, 0 + store little v2, v1 + return +} + +; block0: +; vsteb %v24, 0(%r2), 15 +; br %r14 + +function %extractlane_i8x16_mem_little_15(i8x16, i64) { +block0(v0: i8x16, v1: i64): + v2 = extractlane.i8x16 v0, 15 + store little v2, v1 + return +} + +; block0: +; vsteb %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_f64x2_mem_0(f64x2, i64) { +block0(v0: f64x2, v1: i64): + v2 = extractlane.f64x2 v0, 0 + store v2, v1 + return +} + +; block0: +; vsteg %v24, 0(%r2), 1 +; br %r14 + +function %extractlane_f64x2_mem_1(f64x2, i64) { +block0(v0: f64x2, v1: i64): + v2 = extractlane.f64x2 v0, 1 + store v2, v1 + return +} + +; block0: +; vsteg %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_f64x2_mem_little_0(f64x2, i64) { +block0(v0: f64x2, v1: i64): + v2 = extractlane.f64x2 v0, 0 + store little v2, v1 + return +} + +; block0: +; vstebrg %v24, 0(%r2), 1 +; br 
%r14 + +function %extractlane_f64x2_mem_little_1(f64x2, i64) { +block0(v0: f64x2, v1: i64): + v2 = extractlane.f64x2 v0, 1 + store little v2, v1 + return +} + +; block0: +; vstebrg %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_f32x4_mem_0(f32x4, i64) { +block0(v0: f32x4, v1: i64): + v2 = extractlane.f32x4 v0, 0 + store v2, v1 + return +} + +; block0: +; vstef %v24, 0(%r2), 3 +; br %r14 + +function %extractlane_f32x4_mem_3(f32x4, i64) { +block0(v0: f32x4, v1: i64): + v2 = extractlane.f32x4 v0, 3 + store v2, v1 + return +} + +; block0: +; vstef %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_f32x4_mem_little_0(f32x4, i64) { +block0(v0: f32x4, v1: i64): + v2 = extractlane.f32x4 v0, 0 + store little v2, v1 + return +} + +; block0: +; vstebrf %v24, 0(%r2), 3 +; br %r14 + +function %extractlane_f32x4_mem_little_3(f32x4, i64) { +block0(v0: f32x4, v1: i64): + v2 = extractlane.f32x4 v0, 3 + store little v2, v1 + return +} + +; block0: +; vstebrf %v24, 0(%r2), 0 +; br %r14 + +function %splat_i64x2_mem(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64 v0 + v2 = splat.i64x2 v1 + return v2 +} + +; block0: +; vlrepg %v24, 0(%r2) +; br %r14 + +function %splat_i64x2_mem_little(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64 little v0 + v2 = splat.i64x2 v1 + return v2 +} + +; block0: +; vlbrrepg %v24, 0(%r2) +; br %r14 + +function %splat_i32x4_mem(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32 v0 + v2 = splat.i32x4 v1 + return v2 +} + +; block0: +; vlrepf %v24, 0(%r2) +; br %r14 + +function %splat_i32x4_mem_little(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32 little v0 + v2 = splat.i32x4 v1 + return v2 +} + +; block0: +; vlbrrepf %v24, 0(%r2) +; br %r14 + +function %splat_i16x8_mem(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16 v0 + v2 = splat.i16x8 v1 + return v2 +} + +; block0: +; vlreph %v24, 0(%r2) +; br %r14 + +function %splat_i16x8_mem_little(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16 little v0 + v2 = splat.i16x8 v1 + return v2 +} + +; block0: +; vlbrreph %v24, 0(%r2) +; br %r14 + +function %splat_i8x16_mem(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8 v0 + v2 = splat.i8x16 v1 + return v2 +} + +; block0: +; vlrepb %v24, 0(%r2) +; br %r14 + +function %splat_i8x16_mem_little(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8 little v0 + v2 = splat.i8x16 v1 + return v2 +} + +; block0: +; vlrepb %v24, 0(%r2) +; br %r14 + +function %splat_f64x2_mem(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64 v0 + v2 = splat.f64x2 v1 + return v2 +} + +; block0: +; vlrepg %v24, 0(%r2) +; br %r14 + +function %splat_f64x2_mem_little(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64 little v0 + v2 = splat.f64x2 v1 + return v2 +} + +; block0: +; vlbrrepg %v24, 0(%r2) +; br %r14 + +function %splat_f32x4_mem(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32 v0 + v2 = splat.f32x4 v1 + return v2 +} + +; block0: +; vlrepf %v24, 0(%r2) +; br %r14 + +function %splat_f32x4_mem_little(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32 little v0 + v2 = splat.f32x4 v1 + return v2 +} + +; block0: +; vlbrrepf %v24, 0(%r2) +; br %r14 + +function %scalar_to_vector_i64x2_mem(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64 v0 + v2 = scalar_to_vector.i64x2 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleg %v24, 0(%r2), 1 +; br %r14 + +function %scalar_to_vector_i64x2_mem_little(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64 little v0 + v2 = scalar_to_vector.i64x2 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlebrg %v24, 0(%r2), 1 +; br %r14 + +function %scalar_to_vector_i32x4_mem(i64) -> i32x4 { 
+block0(v0: i64): + v1 = load.i32 v0 + v2 = scalar_to_vector.i32x4 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlef %v24, 0(%r2), 3 +; br %r14 + +function %scalar_to_vector_i32x4_mem_little(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32 little v0 + v2 = scalar_to_vector.i32x4 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlebrf %v24, 0(%r2), 3 +; br %r14 + +function %scalar_to_vector_i16x8_mem(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16 v0 + v2 = scalar_to_vector.i16x8 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleh %v24, 0(%r2), 7 +; br %r14 + +function %scalar_to_vector_i16x8_mem_little(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16 little v0 + v2 = scalar_to_vector.i16x8 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlebrh %v24, 0(%r2), 7 +; br %r14 + +function %scalar_to_vector_i8x16_mem(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8 v0 + v2 = scalar_to_vector.i8x16 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleb %v24, 0(%r2), 15 +; br %r14 + +function %scalar_to_vector_i8x16_mem_little(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8 little v0 + v2 = scalar_to_vector.i8x16 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleb %v24, 0(%r2), 15 +; br %r14 + +function %scalar_to_vector_f64x2_mem(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64 v0 + v2 = scalar_to_vector.f64x2 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleg %v24, 0(%r2), 1 +; br %r14 + +function %scalar_to_vector_f64x2_mem_little(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64 little v0 + v2 = scalar_to_vector.f64x2 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlebrg %v24, 0(%r2), 1 +; br %r14 + +function %scalar_to_vector_f32x4_mem(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32 v0 + v2 = scalar_to_vector.f32x4 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlef %v24, 0(%r2), 3 +; br %r14 + +function %scalar_to_vector_f32x4_mem_little(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32 little v0 + v2 = scalar_to_vector.f32x4 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlebrf %v24, 0(%r2), 3 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-lane.clif b/cranelift/filetests/filetests/isa/s390x/vec-lane.clif new file mode 100644 index 000000000000..7efa4e3b719a --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-lane.clif @@ -0,0 +1,1964 @@ +test compile precise-output +target s390x + +function %insertlane_i64x2_0(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = insertlane.i64x2 v0, v1, 0 + return v2 +} + +; block0: +; vlvgg %v24, %r2, 1 +; br %r14 + +function %insertlane_i64x2_1(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = insertlane.i64x2 v0, v1, 1 + return v2 +} + +; block0: +; vlvgg %v24, %r2, 0 +; br %r14 + +function %insertlane_i64x2_imm_0(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i64 123 + v2 = insertlane.i64x2 v0, v1, 0 + return v2 +} + +; block0: +; vleig %v24, 123, 1 +; br %r14 + +function %insertlane_i64x2_imm_1(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i64 123 + v2 = insertlane.i64x2 v0, v1, 1 + return v2 +} + +; block0: +; vleig %v24, 123, 0 +; br %r14 + +function %insertlane_i64x2_lane_0_0(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = extractlane.i64x2 v1, 0 + v3 = insertlane.i64x2 v0, v2, 0 + return v3 +} + +; block0: +; vpdi %v24, %v24, %v25, 1 +; br %r14 + +function %insertlane_i64x2_lane_0_1(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = extractlane.i64x2 v1, 0 + v3 = insertlane.i64x2 v0, v2, 1 + return v3 +} + +; block0: +; vpdi 
%v24, %v25, %v24, 5 +; br %r14 + +function %insertlane_i64x2_lane_1_0(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = extractlane.i64x2 v1, 1 + v3 = insertlane.i64x2 v0, v2, 0 + return v3 +} + +; block0: +; vpdi %v24, %v24, %v25, 0 +; br %r14 + +function %insertlane_i64x2_lane_1_1(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = extractlane.i64x2 v1, 1 + v3 = insertlane.i64x2 v0, v2, 1 + return v3 +} + +; block0: +; vpdi %v24, %v25, %v24, 1 +; br %r14 + +function %insertlane_i64x2_mem_0(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = load.i64 v1 + v3 = insertlane.i64x2 v0, v2, 0 + return v3 +} + +; block0: +; vleg %v24, 0(%r2), 1 +; br %r14 + +function %insertlane_i64x2_mem_1(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = load.i64 v1 + v3 = insertlane.i64x2 v0, v2, 1 + return v3 +} + +; block0: +; vleg %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i64x2_mem_little_0(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = load.i64 little v1 + v3 = insertlane.i64x2 v0, v2, 0 + return v3 +} + +; block0: +; lrvg %r3, 0(%r2) +; vlvgg %v24, %r3, 1 +; br %r14 + +function %insertlane_i64x2_mem_little_1(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = load.i64 little v1 + v3 = insertlane.i64x2 v0, v2, 1 + return v3 +} + +; block0: +; lrvg %r3, 0(%r2) +; vlvgg %v24, %r3, 0 +; br %r14 + +function %insertlane_i32x4_0(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = insertlane.i32x4 v0, v1, 0 + return v2 +} + +; block0: +; vlvgf %v24, %r2, 3 +; br %r14 + +function %insertlane_i32x4_3(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = insertlane.i32x4 v0, v1, 3 + return v2 +} + +; block0: +; vlvgf %v24, %r2, 0 +; br %r14 + +function %insertlane_i32x4_imm_0(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 123 + v2 = insertlane.i32x4 v0, v1, 0 + return v2 +} + +; block0: +; vleif %v24, 123, 3 +; br %r14 + +function %insertlane_i32x4_imm_3(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 123 + v2 = insertlane.i32x4 v0, v1, 3 + return v2 +} + +; block0: +; vleif %v24, 123, 0 +; br %r14 + +function %insertlane_i32x4_lane_0_0(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = extractlane.i32x4 v1, 0 + v3 = insertlane.i32x4 v0, v2, 0 + return v3 +} + +; block0: +; vgbm %v5, 15 +; vsel %v24, %v25, %v24, %v5 +; br %r14 + +function %insertlane_i32x4_lane_0_3(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = extractlane.i32x4 v1, 0 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; vrepf %v5, %v25, 3 +; vgbm %v7, 61440 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_i32x4_lane_3_0(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = extractlane.i32x4 v1, 3 + v3 = insertlane.i32x4 v0, v2, 0 + return v3 +} + +; block0: +; vrepf %v5, %v25, 0 +; vgbm %v7, 15 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_i32x4_lane_3_3(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = extractlane.i32x4 v1, 3 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; vgbm %v5, 61440 +; vsel %v24, %v25, %v24, %v5 +; br %r14 + +function %insertlane_i32x4_mem_0(i32x4, i64) -> i32x4 { +block0(v0: i32x4, v1: i64): + v2 = load.i32 v1 + v3 = insertlane.i32x4 v0, v2, 0 + return v3 +} + +; block0: +; vlef %v24, 0(%r2), 3 +; br %r14 + +function %insertlane_i32x4_mem_3(i32x4, i64) -> i32x4 { +block0(v0: i32x4, v1: i64): + v2 = load.i32 v1 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; vlef %v24, 0(%r2), 0 +; br %r14 + +function 
%insertlane_i32x4_mem_little_0(i32x4, i64) -> i32x4 { +block0(v0: i32x4, v1: i64): + v2 = load.i32 little v1 + v3 = insertlane.i32x4 v0, v2, 0 + return v3 +} + +; block0: +; lrv %r3, 0(%r2) +; vlvgf %v24, %r3, 3 +; br %r14 + +function %insertlane_i32x4_mem_little_3(i32x4, i64) -> i32x4 { +block0(v0: i32x4, v1: i64): + v2 = load.i32 little v1 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; lrv %r3, 0(%r2) +; vlvgf %v24, %r3, 0 +; br %r14 + +function %insertlane_i16x8_0(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = insertlane.i16x8 v0, v1, 0 + return v2 +} + +; block0: +; vlvgh %v24, %r2, 7 +; br %r14 + +function %insertlane_i16x8_7(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = insertlane.i16x8 v0, v1, 7 + return v2 +} + +; block0: +; vlvgh %v24, %r2, 0 +; br %r14 + +function %insertlane_i16x8_imm_0(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i16 123 + v2 = insertlane.i16x8 v0, v1, 0 + return v2 +} + +; block0: +; vleih %v24, 123, 7 +; br %r14 + +function %insertlane_i16x8_imm_7(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i16 123 + v2 = insertlane.i16x8 v0, v1, 7 + return v2 +} + +; block0: +; vleih %v24, 123, 0 +; br %r14 + +function %insertlane_i16x8_lane_0_0(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = extractlane.i16x8 v1, 0 + v3 = insertlane.i16x8 v0, v2, 0 + return v3 +} + +; block0: +; vgbm %v5, 3 +; vsel %v24, %v25, %v24, %v5 +; br %r14 + +function %insertlane_i16x8_lane_0_7(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = extractlane.i16x8 v1, 0 + v3 = insertlane.i16x8 v0, v2, 7 + return v3 +} + +; block0: +; vreph %v5, %v25, 7 +; vgbm %v7, 49152 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_i16x8_lane_7_0(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = extractlane.i16x8 v1, 7 + v3 = insertlane.i16x8 v0, v2, 0 + return v3 +} + +; block0: +; vreph %v5, %v25, 0 +; vgbm %v7, 3 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_i16x8_lane_7_7(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = extractlane.i16x8 v1, 7 + v3 = insertlane.i16x8 v0, v2, 7 + return v3 +} + +; block0: +; vgbm %v5, 49152 +; vsel %v24, %v25, %v24, %v5 +; br %r14 + +function %insertlane_i16x8_mem_0(i16x8, i64) -> i16x8 { +block0(v0: i16x8, v1: i64): + v2 = load.i16 v1 + v3 = insertlane.i16x8 v0, v2, 0 + return v3 +} + +; block0: +; vleh %v24, 0(%r2), 7 +; br %r14 + +function %insertlane_i16x8_mem_7(i16x8, i64) -> i16x8 { +block0(v0: i16x8, v1: i64): + v2 = load.i16 v1 + v3 = insertlane.i16x8 v0, v2, 7 + return v3 +} + +; block0: +; vleh %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i16x8_mem_little_0(i16x8, i64) -> i16x8 { +block0(v0: i16x8, v1: i64): + v2 = load.i16 little v1 + v3 = insertlane.i16x8 v0, v2, 0 + return v3 +} + +; block0: +; lrvh %r3, 0(%r2) +; vlvgh %v24, %r3, 7 +; br %r14 + +function %insertlane_i16x8_mem_little_7(i16x8, i64) -> i16x8 { +block0(v0: i16x8, v1: i64): + v2 = load.i16 little v1 + v3 = insertlane.i16x8 v0, v2, 7 + return v3 +} + +; block0: +; lrvh %r3, 0(%r2) +; vlvgh %v24, %r3, 0 +; br %r14 + +function %insertlane_i8x16_0(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = insertlane.i8x16 v0, v1, 0 + return v2 +} + +; block0: +; vlvgb %v24, %r2, 15 +; br %r14 + +function %insertlane_i8x16_15(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = insertlane.i8x16 v0, v1, 15 + return v2 +} + +; block0: +; vlvgb %v24, %r2, 0 +; br %r14 + +function %insertlane_i8x16_imm_0(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i8 
123 + v2 = insertlane.i8x16 v0, v1, 0 + return v2 +} + +; block0: +; vleib %v24, 123, 15 +; br %r14 + +function %insertlane_i8x16_imm_15(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i8 123 + v2 = insertlane.i8x16 v0, v1, 15 + return v2 +} + +; block0: +; vleib %v24, 123, 0 +; br %r14 + +function %insertlane_i8x16_lane_0_0(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = extractlane.i8x16 v1, 0 + v3 = insertlane.i8x16 v0, v2, 0 + return v3 +} + +; block0: +; vgbm %v5, 1 +; vsel %v24, %v25, %v24, %v5 +; br %r14 + +function %insertlane_i8x16_lane_0_15(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = extractlane.i8x16 v1, 0 + v3 = insertlane.i8x16 v0, v2, 15 + return v3 +} + +; block0: +; vrepb %v5, %v25, 15 +; vgbm %v7, 32768 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_i8x16_lane_15_0(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = extractlane.i8x16 v1, 15 + v3 = insertlane.i8x16 v0, v2, 0 + return v3 +} + +; block0: +; vrepb %v5, %v25, 0 +; vgbm %v7, 1 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_i8x16_lane_15_15(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = extractlane.i8x16 v1, 15 + v3 = insertlane.i8x16 v0, v2, 15 + return v3 +} + +; block0: +; vgbm %v5, 32768 +; vsel %v24, %v25, %v24, %v5 +; br %r14 + +function %insertlane_i8x16_mem_0(i8x16, i64) -> i8x16 { +block0(v0: i8x16, v1: i64): + v2 = load.i8 v1 + v3 = insertlane.i8x16 v0, v2, 0 + return v3 +} + +; block0: +; vleb %v24, 0(%r2), 15 +; br %r14 + +function %insertlane_i8x16_mem_15(i8x16, i64) -> i8x16 { +block0(v0: i8x16, v1: i64): + v2 = load.i8 v1 + v3 = insertlane.i8x16 v0, v2, 15 + return v3 +} + +; block0: +; vleb %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i8x16_mem_little_0(i8x16, i64) -> i8x16 { +block0(v0: i8x16, v1: i64): + v2 = load.i8 little v1 + v3 = insertlane.i8x16 v0, v2, 0 + return v3 +} + +; block0: +; vleb %v24, 0(%r2), 15 +; br %r14 + +function %insertlane_i8x16_mem_little_15(i8x16, i64) -> i8x16 { +block0(v0: i8x16, v1: i64): + v2 = load.i8 little v1 + v3 = insertlane.i8x16 v0, v2, 15 + return v3 +} + +; block0: +; vleb %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_f64x2_0(f64x2, f64) -> f64x2 { +block0(v0: f64x2, v1: f64): + v2 = insertlane.f64x2 v0, v1, 0 + return v2 +} + +; block0: +; vpdi %v24, %v24, %v0, 0 +; br %r14 + +function %insertlane_f64x2_1(f64x2, f64) -> f64x2 { +block0(v0: f64x2, v1: f64): + v2 = insertlane.f64x2 v0, v1, 1 + return v2 +} + +; block0: +; vpdi %v24, %v0, %v24, 1 +; br %r14 + +function %insertlane_f64x2_lane_0_0(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = extractlane.f64x2 v1, 0 + v3 = insertlane.f64x2 v0, v2, 0 + return v3 +} + +; block0: +; vpdi %v24, %v24, %v25, 1 +; br %r14 + +function %insertlane_f64x2_lane_0_1(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = extractlane.f64x2 v1, 0 + v3 = insertlane.f64x2 v0, v2, 1 + return v3 +} + +; block0: +; vpdi %v24, %v25, %v24, 5 +; br %r14 + +function %insertlane_f64x2_lane_1_0(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = extractlane.f64x2 v1, 1 + v3 = insertlane.f64x2 v0, v2, 0 + return v3 +} + +; block0: +; vpdi %v24, %v24, %v25, 0 +; br %r14 + +function %insertlane_f64x2_lane_1_1(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = extractlane.f64x2 v1, 1 + v3 = insertlane.f64x2 v0, v2, 1 + return v3 +} + +; block0: +; vpdi %v24, %v25, %v24, 1 +; br %r14 + +function %insertlane_f64x2_mem_0(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = load.f64 
v1 + v3 = insertlane.f64x2 v0, v2, 0 + return v3 +} + +; block0: +; vleg %v24, 0(%r2), 1 +; br %r14 + +function %insertlane_f64x2_mem_1(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = load.f64 v1 + v3 = insertlane.f64x2 v0, v2, 1 + return v3 +} + +; block0: +; vleg %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_f64x2_mem_little_0(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = load.f64 little v1 + v3 = insertlane.f64x2 v0, v2, 0 + return v3 +} + +; block0: +; lrvg %r3, 0(%r2) +; vlvgg %v24, %r3, 1 +; br %r14 + +function %insertlane_f64x2_mem_little_1(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = load.f64 little v1 + v3 = insertlane.f64x2 v0, v2, 1 + return v3 +} + +; block0: +; lrvg %r3, 0(%r2) +; vlvgg %v24, %r3, 0 +; br %r14 + +function %insertlane_f32x4_0(f32x4, f32) -> f32x4 { +block0(v0: f32x4, v1: f32): + v2 = insertlane.f32x4 v0, v1, 0 + return v2 +} + +; block0: +; vrepf %v5, %v0, 0 +; vgbm %v7, 15 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_f32x4_3(f32x4, f32) -> f32x4 { +block0(v0: f32x4, v1: f32): + v2 = insertlane.f32x4 v0, v1, 3 + return v2 +} + +; block0: +; vgbm %v5, 61440 +; vsel %v24, %v0, %v24, %v5 +; br %r14 + +function %insertlane_f32x4_lane_0_0(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = extractlane.f32x4 v1, 0 + v3 = insertlane.f32x4 v0, v2, 0 + return v3 +} + +; block0: +; vgbm %v5, 15 +; vsel %v24, %v25, %v24, %v5 +; br %r14 + +function %insertlane_f32x4_lane_0_3(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = extractlane.f32x4 v1, 0 + v3 = insertlane.f32x4 v0, v2, 3 + return v3 +} + +; block0: +; vrepf %v5, %v25, 3 +; vgbm %v7, 61440 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_f32x4_lane_3_0(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = extractlane.f32x4 v1, 3 + v3 = insertlane.f32x4 v0, v2, 0 + return v3 +} + +; block0: +; vrepf %v5, %v25, 0 +; vgbm %v7, 15 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_f32x4_lane_3_3(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = extractlane.f32x4 v1, 3 + v3 = insertlane.f32x4 v0, v2, 3 + return v3 +} + +; block0: +; vgbm %v5, 61440 +; vsel %v24, %v25, %v24, %v5 +; br %r14 + +function %insertlane_f32x4_mem_0(f32x4, i64) -> f32x4 { +block0(v0: f32x4, v1: i64): + v2 = load.f32 v1 + v3 = insertlane.f32x4 v0, v2, 0 + return v3 +} + +; block0: +; vlef %v24, 0(%r2), 3 +; br %r14 + +function %insertlane_i32x4_mem_3(i32x4, i64) -> i32x4 { +block0(v0: i32x4, v1: i64): + v2 = load.i32 v1 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; vlef %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_f32x4_mem_little_0(f32x4, i64) -> f32x4 { +block0(v0: f32x4, v1: i64): + v2 = load.f32 little v1 + v3 = insertlane.f32x4 v0, v2, 0 + return v3 +} + +; block0: +; lrv %r3, 0(%r2) +; vlvgf %v24, %r3, 3 +; br %r14 + +function %insertlane_i32x4_mem_little_3(i32x4, i64) -> i32x4 { +block0(v0: i32x4, v1: i64): + v2 = load.i32 little v1 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; lrv %r3, 0(%r2) +; vlvgf %v24, %r3, 0 +; br %r14 + +function %extractlane_i64x2_0(i64x2) -> i64 { +block0(v0: i64x2): + v1 = extractlane.i64x2 v0, 0 + return v1 +} + +; block0: +; vlgvg %r2, %v24, 1 +; br %r14 + +function %extractlane_i64x2_1(i64x2) -> i64 { +block0(v0: i64x2): + v1 = extractlane.i64x2 v0, 1 + return v1 +} + +; block0: +; vlgvg %r2, %v24, 0 +; br %r14 + +function %extractlane_i64x2_mem_0(i64x2, i64) { +block0(v0: i64x2, v1: i64): + v2 = extractlane.i64x2 v0, 0 + store v2, 
v1 + return +} + +; block0: +; vsteg %v24, 0(%r2), 1 +; br %r14 + +function %extractlane_i64x2_mem_1(i64x2, i64) { +block0(v0: i64x2, v1: i64): + v2 = extractlane.i64x2 v0, 1 + store v2, v1 + return +} + +; block0: +; vsteg %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i64x2_mem_little_0(i64x2, i64) { +block0(v0: i64x2, v1: i64): + v2 = extractlane.i64x2 v0, 0 + store little v2, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; strvg %r3, 0(%r2) +; br %r14 + +function %extractlane_i64x2_mem_little_1(i64x2, i64) { +block0(v0: i64x2, v1: i64): + v2 = extractlane.i64x2 v0, 1 + store little v2, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 0 +; strvg %r3, 0(%r2) +; br %r14 + +function %extractlane_i32x4_0(i32x4) -> i32 { +block0(v0: i32x4): + v1 = extractlane.i32x4 v0, 0 + return v1 +} + +; block0: +; vlgvf %r2, %v24, 3 +; br %r14 + +function %extractlane_i32x4_3(i32x4) -> i32 { +block0(v0: i32x4): + v1 = extractlane.i32x4 v0, 3 + return v1 +} + +; block0: +; vlgvf %r2, %v24, 0 +; br %r14 + +function %extractlane_i32x4_mem_0(i32x4, i64) { +block0(v0: i32x4, v1: i64): + v2 = extractlane.i32x4 v0, 0 + store v2, v1 + return +} + +; block0: +; vstef %v24, 0(%r2), 3 +; br %r14 + +function %extractlane_i32x4_mem_3(i32x4, i64) { +block0(v0: i32x4, v1: i64): + v2 = extractlane.i32x4 v0, 3 + store v2, v1 + return +} + +; block0: +; vstef %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i32x4_mem_little_0(i32x4, i64) { +block0(v0: i32x4, v1: i64): + v2 = extractlane.i32x4 v0, 0 + store little v2, v1 + return +} + +; block0: +; vlgvf %r3, %v24, 3 +; strv %r3, 0(%r2) +; br %r14 + +function %extractlane_i32x4_mem_little_3(i32x4, i64) { +block0(v0: i32x4, v1: i64): + v2 = extractlane.i32x4 v0, 3 + store little v2, v1 + return +} + +; block0: +; vlgvf %r3, %v24, 0 +; strv %r3, 0(%r2) +; br %r14 + +function %extractlane_i16x8_0(i16x8) -> i16 { +block0(v0: i16x8): + v1 = extractlane.i16x8 v0, 0 + return v1 +} + +; block0: +; vlgvh %r2, %v24, 7 +; br %r14 + +function %extractlane_i16x8_7(i16x8) -> i16 { +block0(v0: i16x8): + v1 = extractlane.i16x8 v0, 7 + return v1 +} + +; block0: +; vlgvh %r2, %v24, 0 +; br %r14 + +function %extractlane_i16x8_mem_0(i16x8, i64) { +block0(v0: i16x8, v1: i64): + v2 = extractlane.i16x8 v0, 0 + store v2, v1 + return +} + +; block0: +; vsteh %v24, 0(%r2), 7 +; br %r14 + +function %extractlane_i16x8_mem_7(i16x8, i64) { +block0(v0: i16x8, v1: i64): + v2 = extractlane.i16x8 v0, 7 + store v2, v1 + return +} + +; block0: +; vsteh %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i16x8_mem_little_0(i16x8, i64) { +block0(v0: i16x8, v1: i64): + v2 = extractlane.i16x8 v0, 0 + store little v2, v1 + return +} + +; block0: +; vlgvh %r3, %v24, 7 +; strvh %r3, 0(%r2) +; br %r14 + +function %extractlane_i16x8_mem_little_7(i16x8, i64) { +block0(v0: i16x8, v1: i64): + v2 = extractlane.i16x8 v0, 7 + store little v2, v1 + return +} + +; block0: +; vlgvh %r3, %v24, 0 +; strvh %r3, 0(%r2) +; br %r14 + +function %extractlane_i8x16_0(i8x16) -> i8 { +block0(v0: i8x16): + v1 = extractlane.i8x16 v0, 0 + return v1 +} + +; block0: +; vlgvb %r2, %v24, 15 +; br %r14 + +function %extractlane_i8x16_15(i8x16) -> i8 { +block0(v0: i8x16): + v1 = extractlane.i8x16 v0, 15 + return v1 +} + +; block0: +; vlgvb %r2, %v24, 0 +; br %r14 + +function %extractlane_i8x16_mem_0(i8x16, i64) { +block0(v0: i8x16, v1: i64): + v2 = extractlane.i8x16 v0, 0 + store v2, v1 + return +} + +; block0: +; vsteb %v24, 0(%r2), 15 +; br %r14 + +function %extractlane_i8x16_mem_15(i8x16, i64) { +block0(v0: i8x16, v1: i64): + v2 = 
extractlane.i8x16 v0, 15 + store v2, v1 + return +} + +; block0: +; vsteb %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i8x16_mem_little_0(i8x16, i64) { +block0(v0: i8x16, v1: i64): + v2 = extractlane.i8x16 v0, 0 + store little v2, v1 + return +} + +; block0: +; vsteb %v24, 0(%r2), 15 +; br %r14 + +function %extractlane_i8x16_mem_little_15(i8x16, i64) { +block0(v0: i8x16, v1: i64): + v2 = extractlane.i8x16 v0, 15 + store little v2, v1 + return +} + +; block0: +; vsteb %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_f64x2_0(f64x2) -> f64 { +block0(v0: f64x2): + v1 = extractlane.f64x2 v0, 0 + return v1 +} + +; block0: +; vrepg %v0, %v24, 1 +; br %r14 + +function %extractlane_f64x2_1(f64x2) -> f64 { +block0(v0: f64x2): + v1 = extractlane.f64x2 v0, 1 + return v1 +} + +; block0: +; vrepg %v0, %v24, 0 +; br %r14 + +function %extractlane_f64x2_mem_0(f64x2, i64) { +block0(v0: f64x2, v1: i64): + v2 = extractlane.f64x2 v0, 0 + store v2, v1 + return +} + +; block0: +; vsteg %v24, 0(%r2), 1 +; br %r14 + +function %extractlane_f64x2_mem_1(f64x2, i64) { +block0(v0: f64x2, v1: i64): + v2 = extractlane.f64x2 v0, 1 + store v2, v1 + return +} + +; block0: +; vsteg %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_f64x2_mem_little_0(f64x2, i64) { +block0(v0: f64x2, v1: i64): + v2 = extractlane.f64x2 v0, 0 + store little v2, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; strvg %r3, 0(%r2) +; br %r14 + +function %extractlane_f64x2_mem_little_1(f64x2, i64) { +block0(v0: f64x2, v1: i64): + v2 = extractlane.f64x2 v0, 1 + store little v2, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 0 +; strvg %r3, 0(%r2) +; br %r14 + +function %extractlane_f32x4_0(f32x4) -> f32 { +block0(v0: f32x4): + v1 = extractlane.f32x4 v0, 0 + return v1 +} + +; block0: +; vrepf %v0, %v24, 3 +; br %r14 + +function %extractlane_f32x4_3(f32x4) -> f32 { +block0(v0: f32x4): + v1 = extractlane.f32x4 v0, 3 + return v1 +} + +; block0: +; vrepf %v0, %v24, 0 +; br %r14 + +function %extractlane_f32x4_mem_0(f32x4, i64) { +block0(v0: f32x4, v1: i64): + v2 = extractlane.f32x4 v0, 0 + store v2, v1 + return +} + +; block0: +; vstef %v24, 0(%r2), 3 +; br %r14 + +function %extractlane_f32x4_mem_3(f32x4, i64) { +block0(v0: f32x4, v1: i64): + v2 = extractlane.f32x4 v0, 3 + store v2, v1 + return +} + +; block0: +; vstef %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_f32x4_mem_little_0(f32x4, i64) { +block0(v0: f32x4, v1: i64): + v2 = extractlane.f32x4 v0, 0 + store little v2, v1 + return +} + +; block0: +; vlgvf %r3, %v24, 3 +; strv %r3, 0(%r2) +; br %r14 + +function %extractlane_f32x4_mem_little_3(f32x4, i64) { +block0(v0: f32x4, v1: i64): + v2 = extractlane.f32x4 v0, 3 + store little v2, v1 + return +} + +; block0: +; vlgvf %r3, %v24, 0 +; strv %r3, 0(%r2) +; br %r14 + +function %splat_i64x2(i64) -> i64x2 { +block0(v0: i64): + v1 = splat.i64x2 v0 + return v1 +} + +; block0: +; ldgr %f3, %r2 +; vrepg %v24, %v3, 0 +; br %r14 + +function %splat_i64x2_imm() -> i64x2 { +block0: + v0 = iconst.i64 123 + v1 = splat.i64x2 v0 + return v1 +} + +; block0: +; vrepig %v24, 123 +; br %r14 + +function %splat_i64x2_lane_0(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = extractlane.i64x2 v0, 0 + v2 = splat.i64x2 v1 + return v2 +} + +; block0: +; vrepg %v24, %v24, 1 +; br %r14 + +function %splat_i64x2_lane_1(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = extractlane.i64x2 v0, 1 + v2 = splat.i64x2 v1 + return v2 +} + +; block0: +; vrepg %v24, %v24, 0 +; br %r14 + +function %splat_i64x2_mem(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64 v0 + v2 = 
splat.i64x2 v1 + return v2 +} + +; block0: +; vlrepg %v24, 0(%r2) +; br %r14 + +function %splat_i64x2_mem_little(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64 little v0 + v2 = splat.i64x2 v1 + return v2 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; vrepg %v24, %v5, 0 +; br %r14 + +function %splat_i32x4(i32) -> i32x4 { +block0(v0: i32): + v1 = splat.i32x4 v0 + return v1 +} + +; block0: +; vlvgf %v3, %r2, 0 +; vrepf %v24, %v3, 0 +; br %r14 + +function %splat_i32x4_imm() -> i32x4 { +block0: + v0 = iconst.i32 123 + v1 = splat.i32x4 v0 + return v1 +} + +; block0: +; vrepif %v24, 123 +; br %r14 + +function %splat_i32x4_lane_0(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = extractlane.i32x4 v0, 0 + v2 = splat.i32x4 v1 + return v2 +} + +; block0: +; vrepf %v24, %v24, 3 +; br %r14 + +function %splat_i32x4_lane_3(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = extractlane.i32x4 v0, 3 + v2 = splat.i32x4 v1 + return v2 +} + +; block0: +; vrepf %v24, %v24, 0 +; br %r14 + +function %splat_i32x4_mem(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32 v0 + v2 = splat.i32x4 v1 + return v2 +} + +; block0: +; vlrepf %v24, 0(%r2) +; br %r14 + +function %splat_i32x4_mem_little(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32 little v0 + v2 = splat.i32x4 v1 + return v2 +} + +; block0: +; lrv %r5, 0(%r2) +; vlvgf %v5, %r5, 0 +; vrepf %v24, %v5, 0 +; br %r14 + +function %splat_i16x8(i16) -> i16x8 { +block0(v0: i16): + v1 = splat.i16x8 v0 + return v1 +} + +; block0: +; vlvgh %v3, %r2, 0 +; vreph %v24, %v3, 0 +; br %r14 + +function %splat_i16x8_imm() -> i16x8 { +block0: + v0 = iconst.i16 123 + v1 = splat.i16x8 v0 + return v1 +} + +; block0: +; vrepih %v24, 123 +; br %r14 + +function %splat_i16x8_lane_0(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = extractlane.i16x8 v0, 0 + v2 = splat.i16x8 v1 + return v2 +} + +; block0: +; vreph %v24, %v24, 7 +; br %r14 + +function %splat_i16x8_lane_7(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = extractlane.i16x8 v0, 7 + v2 = splat.i16x8 v1 + return v2 +} + +; block0: +; vreph %v24, %v24, 0 +; br %r14 + +function %splat_i16x8_mem(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16 v0 + v2 = splat.i16x8 v1 + return v2 +} + +; block0: +; vlreph %v24, 0(%r2) +; br %r14 + +function %splat_i16x8_mem_little(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16 little v0 + v2 = splat.i16x8 v1 + return v2 +} + +; block0: +; lrvh %r5, 0(%r2) +; vlvgh %v5, %r5, 0 +; vreph %v24, %v5, 0 +; br %r14 + +function %splat_i8x16(i8) -> i8x16 { +block0(v0: i8): + v1 = splat.i8x16 v0 + return v1 +} + +; block0: +; vlvgb %v3, %r2, 0 +; vrepb %v24, %v3, 0 +; br %r14 + +function %splat_i8x16_imm() -> i8x16 { +block0: + v0 = iconst.i8 123 + v1 = splat.i8x16 v0 + return v1 +} + +; block0: +; vrepib %v24, 123 +; br %r14 + +function %splat_i8x16_lane_0(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = extractlane.i8x16 v0, 0 + v2 = splat.i8x16 v1 + return v2 +} + +; block0: +; vrepb %v24, %v24, 15 +; br %r14 + +function %splat_i8x16_lane_15(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = extractlane.i8x16 v0, 15 + v2 = splat.i8x16 v1 + return v2 +} + +; block0: +; vrepb %v24, %v24, 0 +; br %r14 + +function %splat_i8x16_mem(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8 v0 + v2 = splat.i8x16 v1 + return v2 +} + +; block0: +; vlrepb %v24, 0(%r2) +; br %r14 + +function %splat_i8x16_mem_little(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8 little v0 + v2 = splat.i8x16 v1 + return v2 +} + +; block0: +; vlrepb %v24, 0(%r2) +; br %r14 + +function %splat_f64x2(f64) -> f64x2 { +block0(v0: f64): + v1 = splat.f64x2 v0 + 
return v1 +} + +; block0: +; vrepg %v24, %v0, 0 +; br %r14 + +function %splat_f64x2_lane_0(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = extractlane.f64x2 v0, 0 + v2 = splat.f64x2 v1 + return v2 +} + +; block0: +; vrepg %v24, %v24, 1 +; br %r14 + +function %splat_f64x2_lane_1(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = extractlane.f64x2 v0, 1 + v2 = splat.f64x2 v1 + return v2 +} + +; block0: +; vrepg %v24, %v24, 0 +; br %r14 + +function %splat_f64x2_mem(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64 v0 + v2 = splat.f64x2 v1 + return v2 +} + +; block0: +; vlrepg %v24, 0(%r2) +; br %r14 + +function %splat_f64x2_mem_little(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64 little v0 + v2 = splat.f64x2 v1 + return v2 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; vrepg %v24, %v5, 0 +; br %r14 + +function %splat_f32x4(f32) -> f32x4 { +block0(v0: f32): + v1 = splat.f32x4 v0 + return v1 +} + +; block0: +; vrepf %v24, %v0, 0 +; br %r14 + +function %splat_f32x4_lane_0(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = extractlane.f32x4 v0, 0 + v2 = splat.f32x4 v1 + return v2 +} + +; block0: +; vrepf %v24, %v24, 3 +; br %r14 + +function %splat_i32x4_lane_3(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = extractlane.i32x4 v0, 3 + v2 = splat.i32x4 v1 + return v2 +} + +; block0: +; vrepf %v24, %v24, 0 +; br %r14 + +function %splat_f32x4_mem(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32 v0 + v2 = splat.f32x4 v1 + return v2 +} + +; block0: +; vlrepf %v24, 0(%r2) +; br %r14 + +function %splat_f32x4_mem_little(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32 little v0 + v2 = splat.f32x4 v1 + return v2 +} + +; block0: +; lrv %r5, 0(%r2) +; vlvgf %v5, %r5, 0 +; vrepf %v24, %v5, 0 +; br %r14 + +function %scalar_to_vector_i64x2(i64) -> i64x2 { +block0(v0: i64): + v1 = scalar_to_vector.i64x2 v0 + return v1 +} + +; block0: +; vgbm %v24, 0 +; vlvgg %v24, %r2, 1 +; br %r14 + +function %scalar_to_vector_i64x2_imm() -> i64x2 { +block0: + v0 = iconst.i64 123 + v1 = scalar_to_vector.i64x2 v0 + return v1 +} + +; block0: +; vgbm %v24, 0 +; vleig %v24, 123, 1 +; br %r14 + +function %scalar_to_vector_i64x2_lane_0(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = extractlane.i64x2 v0, 0 + v2 = scalar_to_vector.i64x2 v1 + return v2 +} + +; block0: +; vgbm %v3, 0 +; vpdi %v24, %v3, %v24, 1 +; br %r14 + +function %scalar_to_vector_i64x2_lane_1(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = extractlane.i64x2 v0, 1 + v2 = scalar_to_vector.i64x2 v1 + return v2 +} + +; block0: +; vgbm %v3, 0 +; vpdi %v24, %v3, %v24, 0 +; br %r14 + +function %scalar_to_vector_i64x2_mem(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64 v0 + v2 = scalar_to_vector.i64x2 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleg %v24, 0(%r2), 1 +; br %r14 + +function %scalar_to_vector_i64x2_mem_little(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64 little v0 + v2 = scalar_to_vector.i64x2 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; lrvg %r3, 0(%r2) +; vlvgg %v24, %r3, 1 +; br %r14 + +function %scalar_to_vector_i32x4(i32) -> i32x4 { +block0(v0: i32): + v1 = scalar_to_vector.i32x4 v0 + return v1 +} + +; block0: +; vgbm %v24, 0 +; vlvgf %v24, %r2, 3 +; br %r14 + +function %scalar_to_vector_i32x4_imm() -> i32x4 { +block0: + v0 = iconst.i32 123 + v1 = scalar_to_vector.i32x4 v0 + return v1 +} + +; block0: +; vgbm %v24, 0 +; vleif %v24, 123, 3 +; br %r14 + +function %scalar_to_vector_i32x4_lane_0(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = extractlane.i32x4 v0, 0 + v2 = scalar_to_vector.i32x4 v1 + return v2 +} + +; block0: +; vgbm %v3, 15 +; vn %v24, 
%v24, %v3 +; br %r14 + +function %scalar_to_vector_i32x4_lane_3(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = extractlane.i32x4 v0, 3 + v2 = scalar_to_vector.i32x4 v1 + return v2 +} + +; block0: +; vrepf %v3, %v24, 0 +; vgbm %v5, 15 +; vn %v24, %v3, %v5 +; br %r14 + +function %scalar_to_vector_i32x4_mem(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32 v0 + v2 = scalar_to_vector.i32x4 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlef %v24, 0(%r2), 3 +; br %r14 + +function %scalar_to_vector_i32x4_mem_little(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32 little v0 + v2 = scalar_to_vector.i32x4 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; lrv %r3, 0(%r2) +; vlvgf %v24, %r3, 3 +; br %r14 + +function %scalar_to_vector_i16x8(i16) -> i16x8 { +block0(v0: i16): + v1 = scalar_to_vector.i16x8 v0 + return v1 +} + +; block0: +; vgbm %v24, 0 +; vlvgh %v24, %r2, 7 +; br %r14 + +function %scalar_to_vector_i16x8_imm() -> i16x8 { +block0: + v0 = iconst.i16 123 + v1 = scalar_to_vector.i16x8 v0 + return v1 +} + +; block0: +; vgbm %v24, 0 +; vleih %v24, 123, 7 +; br %r14 + +function %scalar_to_vector_i16x8_lane_0(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = extractlane.i16x8 v0, 0 + v2 = scalar_to_vector.i16x8 v1 + return v2 +} + +; block0: +; vgbm %v3, 3 +; vn %v24, %v24, %v3 +; br %r14 + +function %scalar_to_vector_i16x8_lane_7(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = extractlane.i16x8 v0, 7 + v2 = scalar_to_vector.i16x8 v1 + return v2 +} + +; block0: +; vreph %v3, %v24, 0 +; vgbm %v5, 3 +; vn %v24, %v3, %v5 +; br %r14 + +function %scalar_to_vector_i16x8_mem(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16 v0 + v2 = scalar_to_vector.i16x8 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleh %v24, 0(%r2), 7 +; br %r14 + +function %scalar_to_vector_i16x8_mem_little(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16 little v0 + v2 = scalar_to_vector.i16x8 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; lrvh %r3, 0(%r2) +; vlvgh %v24, %r3, 7 +; br %r14 + +function %scalar_to_vector_i8x16(i8) -> i8x16 { +block0(v0: i8): + v1 = scalar_to_vector.i8x16 v0 + return v1 +} + +; block0: +; vgbm %v24, 0 +; vlvgb %v24, %r2, 15 +; br %r14 + +function %scalar_to_vector_i8x16_imm() -> i8x16 { +block0: + v0 = iconst.i8 123 + v1 = scalar_to_vector.i8x16 v0 + return v1 +} + +; block0: +; vgbm %v24, 0 +; vleib %v24, 123, 15 +; br %r14 + +function %scalar_to_vector_i8x16_lane_0(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = extractlane.i8x16 v0, 0 + v2 = scalar_to_vector.i8x16 v1 + return v2 +} + +; block0: +; vgbm %v3, 1 +; vn %v24, %v24, %v3 +; br %r14 + +function %scalar_to_vector_i8x16_lane_15(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = extractlane.i8x16 v0, 15 + v2 = scalar_to_vector.i8x16 v1 + return v2 +} + +; block0: +; vrepb %v3, %v24, 0 +; vgbm %v5, 1 +; vn %v24, %v3, %v5 +; br %r14 + +function %scalar_to_vector_i8x16_mem(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8 v0 + v2 = scalar_to_vector.i8x16 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleb %v24, 0(%r2), 15 +; br %r14 + +function %scalar_to_vector_i8x16_mem_little(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8 little v0 + v2 = scalar_to_vector.i8x16 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleb %v24, 0(%r2), 15 +; br %r14 + +function %scalar_to_vector_f64x2(f64) -> f64x2 { +block0(v0: f64): + v1 = scalar_to_vector.f64x2 v0 + return v1 +} + +; block0: +; vgbm %v3, 0 +; vpdi %v24, %v3, %v0, 0 +; br %r14 + +function %scalar_to_vector_f64x2_lane_0(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = extractlane.f64x2 v0, 0 + v2 = 
scalar_to_vector.f64x2 v1 + return v2 +} + +; block0: +; vgbm %v3, 0 +; vpdi %v24, %v3, %v24, 1 +; br %r14 + +function %scalar_to_vector_f64x2_lane_1(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = extractlane.f64x2 v0, 1 + v2 = scalar_to_vector.f64x2 v1 + return v2 +} + +; block0: +; vgbm %v3, 0 +; vpdi %v24, %v3, %v24, 0 +; br %r14 + +function %scalar_to_vector_f64x2_mem(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64 v0 + v2 = scalar_to_vector.f64x2 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleg %v24, 0(%r2), 1 +; br %r14 + +function %scalar_to_vector_f64x2_mem_little(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64 little v0 + v2 = scalar_to_vector.f64x2 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; lrvg %r3, 0(%r2) +; vlvgg %v24, %r3, 1 +; br %r14 + +function %scalar_to_vector_f32x4(f32) -> f32x4 { +block0(v0: f32): + v1 = scalar_to_vector.f32x4 v0 + return v1 +} + +; block0: +; vrepf %v3, %v0, 0 +; vgbm %v5, 15 +; vn %v24, %v3, %v5 +; br %r14 + +function %scalar_to_vector_f32x4_lane_0(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = extractlane.f32x4 v0, 0 + v2 = scalar_to_vector.f32x4 v1 + return v2 +} + +; block0: +; vgbm %v3, 15 +; vn %v24, %v24, %v3 +; br %r14 + +function %scalar_to_vector_f32x4_lane_3(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = extractlane.f32x4 v0, 3 + v2 = scalar_to_vector.f32x4 v1 + return v2 +} + +; block0: +; vrepf %v3, %v24, 0 +; vgbm %v5, 15 +; vn %v24, %v3, %v5 +; br %r14 + +function %scalar_to_vector_f32x4_mem(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32 v0 + v2 = scalar_to_vector.f32x4 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlef %v24, 0(%r2), 3 +; br %r14 + +function %scalar_to_vector_f32x4_mem_little(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32 little v0 + v2 = scalar_to_vector.f32x4 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; lrv %r3, 0(%r2) +; vlvgf %v24, %r3, 3 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-logical.clif b/cranelift/filetests/filetests/isa/s390x/vec-logical.clif new file mode 100644 index 000000000000..b0375f81dc10 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-logical.clif @@ -0,0 +1,675 @@ +test compile precise-output +target s390x + +function %vany_true_i64x2(i64x2) -> b1 { +block0(v0: i64x2): + v1 = vany_true v0 + return v1 +} + +; block0: +; vgbm %v3, 0 +; vceqgs %v5, %v24, %v3 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vany_true_i32x4(i32x4) -> b1 { +block0(v0: i32x4): + v1 = vany_true v0 + return v1 +} + +; block0: +; vgbm %v3, 0 +; vceqfs %v5, %v24, %v3 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vany_true_i16x8(i16x8) -> b1 { +block0(v0: i16x8): + v1 = vany_true v0 + return v1 +} + +; block0: +; vgbm %v3, 0 +; vceqhs %v5, %v24, %v3 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vany_true_i8x16(i8x16) -> b1 { +block0(v0: i8x16): + v1 = vany_true v0 + return v1 +} + +; block0: +; vgbm %v3, 0 +; vceqbs %v5, %v24, %v3 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vall_true_i64x2(i64x2) -> b1 { +block0(v0: i64x2): + v1 = vall_true v0 + return v1 +} + +; block0: +; vgbm %v3, 0 +; vceqgs %v5, %v24, %v3 +; lhi %r2, 0 +; lochio %r2, 1 +; br %r14 + +function %vall_true_i32x4(i32x4) -> b1 { +block0(v0: i32x4): + v1 = vall_true v0 + return v1 +} + +; block0: +; vgbm %v3, 0 +; vceqfs %v5, %v24, %v3 +; lhi %r2, 0 +; lochio %r2, 1 +; br %r14 + +function %vall_true_i16x8(i16x8) -> b1 { +block0(v0: i16x8): + v1 = vall_true v0 + return v1 +} + +; block0: +; vgbm %v3, 0 +; vceqhs %v5, %v24, %v3 +; lhi %r2, 0 +; lochio %r2, 1 
+; br %r14 + +function %vall_true_i8x16(i8x16) -> b1 { +block0(v0: i8x16): + v1 = vall_true v0 + return v1 +} + +; block0: +; vgbm %v3, 0 +; vceqbs %v5, %v24, %v3 +; lhi %r2, 0 +; lochio %r2, 1 +; br %r14 + +function %vany_true_icmp_eq_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp eq v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vceqgs %v5, %v24, %v25 +; lhi %r2, 0 +; lochino %r2, 1 +; br %r14 + +function %vany_true_icmp_ne_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp ne v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vceqgs %v5, %v24, %v25 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vany_true_icmp_sgt_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp sgt v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vchgs %v5, %v24, %v25 +; lhi %r2, 0 +; lochino %r2, 1 +; br %r14 + +function %vany_true_icmp_sle_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp sle v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vchgs %v5, %v24, %v25 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vany_true_icmp_slt_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp slt v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vchgs %v5, %v25, %v24 +; lhi %r2, 0 +; lochino %r2, 1 +; br %r14 + +function %vany_true_icmp_sge_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp sge v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vchgs %v5, %v25, %v24 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vany_true_icmp_ugt_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp ugt v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vchlgs %v5, %v24, %v25 +; lhi %r2, 0 +; lochino %r2, 1 +; br %r14 + +function %vany_true_icmp_ule_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp ule v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vchlgs %v5, %v24, %v25 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vany_true_icmp_ult_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp ult v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vchlgs %v5, %v25, %v24 +; lhi %r2, 0 +; lochino %r2, 1 +; br %r14 + +function %vany_true_icmp_uge_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp uge v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vchlgs %v5, %v25, %v24 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vany_true_fcmp_eq_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp eq v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vfcedbs %v5, %v24, %v25 +; lhi %r2, 0 +; lochino %r2, 1 +; br %r14 + +function %vany_true_fcmp_ne_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp ne v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vfcedbs %v5, %v24, %v25 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vany_true_fcmp_gt_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp gt v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vfchdbs %v5, %v24, %v25 +; lhi %r2, 0 +; lochino %r2, 1 +; br %r14 + +function %vany_true_fcmp_ule_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp ule v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vfchdbs %v5, %v24, %v25 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vany_true_fcmp_ge_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp ge v0, v1 + v3 = 
vany_true v2 + return v3 +} + +; block0: +; vfchedbs %v5, %v24, %v25 +; lhi %r2, 0 +; lochino %r2, 1 +; br %r14 + +function %vany_true_fcmp_ult_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp ult v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vfchedbs %v5, %v24, %v25 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vany_true_fcmp_lt_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp lt v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vfchdbs %v5, %v25, %v24 +; lhi %r2, 0 +; lochino %r2, 1 +; br %r14 + +function %vany_true_fcmp_uge_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp uge v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vfchdbs %v5, %v25, %v24 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vany_true_fcmp_le_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp le v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vfchedbs %v5, %v25, %v24 +; lhi %r2, 0 +; lochino %r2, 1 +; br %r14 + +function %vany_true_fcmp_ugt_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp ugt v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vfchedbs %v5, %v25, %v24 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vall_true_icmp_eq_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp eq v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vceqgs %v5, %v24, %v25 +; lhi %r2, 0 +; lochie %r2, 1 +; br %r14 + +function %vall_true_icmp_ne_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp ne v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vceqgs %v5, %v24, %v25 +; lhi %r2, 0 +; lochio %r2, 1 +; br %r14 + +function %vall_true_icmp_sgt_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp sgt v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vchgs %v5, %v24, %v25 +; lhi %r2, 0 +; lochie %r2, 1 +; br %r14 + +function %vall_true_icmp_sle_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp sle v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vchgs %v5, %v24, %v25 +; lhi %r2, 0 +; lochio %r2, 1 +; br %r14 + +function %vall_true_icmp_slt_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp slt v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vchgs %v5, %v25, %v24 +; lhi %r2, 0 +; lochie %r2, 1 +; br %r14 + +function %vall_true_icmp_sge_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp sge v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vchgs %v5, %v25, %v24 +; lhi %r2, 0 +; lochio %r2, 1 +; br %r14 + +function %vall_true_icmp_ugt_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp ugt v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vchlgs %v5, %v24, %v25 +; lhi %r2, 0 +; lochie %r2, 1 +; br %r14 + +function %vall_true_icmp_ule_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp ule v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vchlgs %v5, %v24, %v25 +; lhi %r2, 0 +; lochio %r2, 1 +; br %r14 + +function %vall_true_icmp_ult_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp ult v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vchlgs %v5, %v25, %v24 +; lhi %r2, 0 +; lochie %r2, 1 +; br %r14 + +function %vall_true_icmp_uge_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp uge v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vchlgs %v5, %v25, %v24 +; lhi %r2, 0 +; lochio %r2, 1 +; 
br %r14 + +function %vall_true_fcmp_eq_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp eq v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vfcedbs %v5, %v24, %v25 +; lhi %r2, 0 +; lochie %r2, 1 +; br %r14 + +function %vall_true_fcmp_ne_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp ne v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vfcedbs %v5, %v24, %v25 +; lhi %r2, 0 +; lochio %r2, 1 +; br %r14 + +function %vall_true_fcmp_gt_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp gt v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vfchdbs %v5, %v24, %v25 +; lhi %r2, 0 +; lochie %r2, 1 +; br %r14 + +function %vall_true_fcmp_ule_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp ule v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vfchdbs %v5, %v24, %v25 +; lhi %r2, 0 +; lochio %r2, 1 +; br %r14 + +function %vall_true_fcmp_ge_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp ge v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vfchedbs %v5, %v24, %v25 +; lhi %r2, 0 +; lochie %r2, 1 +; br %r14 + +function %vall_true_fcmp_ult_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp ult v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vfchedbs %v5, %v24, %v25 +; lhi %r2, 0 +; lochio %r2, 1 +; br %r14 + +function %vall_true_fcmp_lt_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp lt v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vfchdbs %v5, %v25, %v24 +; lhi %r2, 0 +; lochie %r2, 1 +; br %r14 + +function %vall_true_fcmp_uge_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp uge v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vfchdbs %v5, %v25, %v24 +; lhi %r2, 0 +; lochio %r2, 1 +; br %r14 + +function %vall_true_fcmp_le_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp le v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vfchedbs %v5, %v25, %v24 +; lhi %r2, 0 +; lochie %r2, 1 +; br %r14 + +function %vall_true_fcmp_ugt_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp ugt v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vfchedbs %v5, %v25, %v24 +; lhi %r2, 0 +; lochio %r2, 1 +; br %r14 + +function %vhigh_bits(i64x2) -> i64 { +block0(v0: i64x2): + v1 = vhigh_bits.i64 v0 + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x80808080808080808080808080800040 ; vl %v3, 0(%r1) +; vbperm %v5, %v24, %v3 +; lgdr %r2, %f5 +; br %r14 + +function %vhigh_bits(i32x4) -> i64 { +block0(v0: i32x4): + v1 = vhigh_bits.i64 v0 + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x80808080808080808080808000204060 ; vl %v3, 0(%r1) +; vbperm %v5, %v24, %v3 +; lgdr %r2, %f5 +; br %r14 + +function %vhigh_bits(i16x8) -> i64 { +block0(v0: i16x8): + v1 = vhigh_bits.i64 v0 + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x80808080808080800010203040506070 ; vl %v3, 0(%r1) +; vbperm %v5, %v24, %v3 +; lgdr %r2, %f5 +; br %r14 + +function %vhigh_bits(i8x16) -> i64 { +block0(v0: i8x16): + v1 = vhigh_bits.i64 v0 + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x00081018202830384048505860687078 ; vl %v3, 0(%r1) +; vbperm %v5, %v24, %v3 +; lgdr %r2, %f5 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-permute.clif b/cranelift/filetests/filetests/isa/s390x/vec-permute.clif new file mode 100644 index 000000000000..4e5f7019c5a4 --- /dev/null +++ 
b/cranelift/filetests/filetests/isa/s390x/vec-permute.clif @@ -0,0 +1,493 @@ +test compile precise-output +target s390x + +function %swizzle(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = swizzle.i8x16 v0, v1 + return v2 +} + +; block0: +; vgbm %v5, 0 +; vrepib %v7, 239 +; vno %v17, %v25, %v25 +; vmxlb %v19, %v7, %v17 +; vperm %v24, %v5, %v24, %v19 +; br %r14 + +function %shuffle_0(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + return v2 +} + +; block0: +; vrepib %v5, 15 +; vperm %v24, %v24, %v25, %v5 +; br %r14 + +function %shuffle_1(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [3 0 31 26 4 6 12 11 23 13 24 4 2 15 17 5] + return v2 +} + +; block0: +; bras %r1, 20 ; data.u128 0x0a1e000d0b1702180403090b15100f0c ; vl %v5, 0(%r1) +; vperm %v24, %v24, %v25, %v5 +; br %r14 + +function %shuffle_2(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47] + return v2 +} + +; block0: +; vgbm %v5, 1 +; bras %r1, 20 ; data.u128 0x8080808080808080808080808080800f ; vl %v7, 0(%r1) +; vperm %v17, %v24, %v25, %v7 +; vn %v24, %v5, %v17 +; br %r14 + +function %shuffle_vmrhg_xy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [24 25 26 27 28 29 30 31 8 9 10 11 12 13 14 15] + return v2 +} + +; block0: +; vmrhg %v24, %v24, %v25 +; br %r14 + +function %shuffle_vmrhf_xy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [24 25 26 27 8 9 10 11 28 29 30 31 12 13 14 15] + return v2 +} + +; block0: +; vmrhf %v24, %v24, %v25 +; br %r14 + +function %shuffle_vmrhh_xy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [24 25 8 9 26 27 10 11 28 29 12 13 30 31 14 15] + return v2 +} + +; block0: +; vmrhh %v24, %v24, %v25 +; br %r14 + +function %shuffle_vmrhb_xy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [24 8 25 9 26 10 27 11 28 12 29 13 30 14 31 15] + return v2 +} + +; block0: +; vmrhb %v24, %v24, %v25 +; br %r14 + +function %shuffle_vmrhg_yx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31] + return v2 +} + +; block0: +; vmrhg %v24, %v25, %v24 +; br %r14 + +function %shuffle_vmrhf_yx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [8 9 10 11 24 25 26 27 12 13 14 15 28 29 30 31] + return v2 +} + +; block0: +; vmrhf %v24, %v25, %v24 +; br %r14 + +function %shuffle_vmrhh_yx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [8 9 24 25 10 11 26 27 12 13 28 29 14 15 30 31] + return v2 +} + +; block0: +; vmrhh %v24, %v25, %v24 +; br %r14 + +function %shuffle_vmrhb_yx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [8 24 9 25 10 26 11 27 12 28 13 29 14 30 15 31] + return v2 +} + +; block0: +; vmrhb %v24, %v25, %v24 +; br %r14 + +function %shuffle_vmrhg_xx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15] + return v2 +} + +; block0: +; vmrhg %v24, %v24, %v24 +; br %r14 + +function %shuffle_vmrhf_xx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [8 9 10 11 8 9 10 11 12 13 14 15 12 13 14 15] + return v2 +} + +; block0: +; vmrhf %v24, %v24, %v24 +; br %r14 + +function 
%shuffle_vmrhh_xx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [8 9 8 9 10 11 10 11 12 13 12 13 14 15 14 15] + return v2 +} + +; block0: +; vmrhh %v24, %v24, %v24 +; br %r14 + +function %shuffle_vmrhb_xx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15] + return v2 +} + +; block0: +; vmrhb %v24, %v24, %v24 +; br %r14 + +function %shuffle_vmrhg_yy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [24 25 26 27 28 29 30 31 24 25 26 27 28 29 30 31] + return v2 +} + +; block0: +; vmrhg %v24, %v25, %v25 +; br %r14 + +function %shuffle_vmrhf_yy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [24 25 26 27 24 25 26 27 28 29 30 31 28 29 30 31] + return v2 +} + +; block0: +; vmrhf %v24, %v25, %v25 +; br %r14 + +function %shuffle_vmrhh_yy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [24 25 24 25 26 27 26 27 28 29 28 29 30 31 30 31] + return v2 +} + +; block0: +; vmrhh %v24, %v25, %v25 +; br %r14 + +function %shuffle_vmrhb_yy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [24 24 25 25 26 26 27 27 28 28 29 29 30 30 31 31] + return v2 +} + +; block0: +; vmrhb %v24, %v25, %v25 +; br %r14 + +function %shuffle_vmrlg_xy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 18 19 20 21 22 23 0 1 2 3 4 5 6 7] + return v2 +} + +; block0: +; vmrlg %v24, %v24, %v25 +; br %r14 + +function %shuffle_vmrlf_xy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 18 19 0 1 2 3 20 21 22 23 4 5 6 7] + return v2 +} + +; block0: +; vmrlf %v24, %v24, %v25 +; br %r14 + +function %shuffle_vmrlh_xy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 0 1 18 19 2 3 20 21 4 5 22 23 6 7] + return v2 +} + +; block0: +; vmrlh %v24, %v24, %v25 +; br %r14 + +function %shuffle_vmrlb_xy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 0 17 1 18 2 19 3 20 4 21 5 22 6 23 7] + return v2 +} + +; block0: +; vmrlb %v24, %v24, %v25 +; br %r14 + +function %shuffle_vmrlg_yx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23] + return v2 +} + +; block0: +; vmrlg %v24, %v25, %v24 +; br %r14 + +function %shuffle_vmrlf_yx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 2 3 16 17 18 19 4 5 6 7 20 21 22 23] + return v2 +} + +; block0: +; vmrlf %v24, %v25, %v24 +; br %r14 + +function %shuffle_vmrlh_yx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 16 17 2 3 18 19 4 5 20 21 6 7 22 23] + return v2 +} + +; block0: +; vmrlh %v24, %v25, %v24 +; br %r14 + +function %shuffle_vmrlb_yx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23] + return v2 +} + +; block0: +; vmrlb %v24, %v25, %v24 +; br %r14 + +function %shuffle_vmrlg_xx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7] + return v2 +} + +; block0: +; vmrlg %v24, %v24, %v24 +; br %r14 + +function %shuffle_vmrlf_xx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 2 3 0 1 2 3 4 5 6 7 4 5 6 7] + return v2 +} + +; block0: +; vmrlf %v24, %v24, %v24 
+; br %r14 + +function %shuffle_vmrlh_xx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 0 1 2 3 2 3 4 5 4 5 6 7 6 7] + return v2 +} + +; block0: +; vmrlh %v24, %v24, %v24 +; br %r14 + +function %shuffle_vmrlb_xx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7] + return v2 +} + +; block0: +; vmrlb %v24, %v24, %v24 +; br %r14 + +function %shuffle_vmrlg_yy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 18 19 20 21 22 23 16 17 18 19 20 21 22 23] + return v2 +} + +; block0: +; vmrlg %v24, %v25, %v25 +; br %r14 + +function %shuffle_vmrlf_yy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 18 19 16 17 18 19 20 21 22 23 20 21 22 23] + return v2 +} + +; block0: +; vmrlf %v24, %v25, %v25 +; br %r14 + +function %shuffle_vmrlh_yy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 16 17 18 19 18 19 20 21 20 21 22 23 22 23] + return v2 +} + +; block0: +; vmrlh %v24, %v25, %v25 +; br %r14 + +function %shuffle_vmrlb_yy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 16 17 17 18 18 19 19 20 20 21 21 22 22 23 23] + return v2 +} + +; block0: +; vmrlb %v24, %v25, %v25 +; br %r14 + +;; Special patterns that can be implemented via PACK. +function %shuffle_vpkg_xy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 18 19 24 25 26 27 0 1 2 3 8 9 10 11] + return v2 +} + +; block0: +; vpkg %v24, %v24, %v25 +; br %r14 + +function %shuffle_vpkf_xy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 20 21 24 25 28 29 0 1 4 5 8 9 12 13] + return v2 +} + +; block0: +; vpkf %v24, %v24, %v25 +; br %r14 + +function %shuffle_vpkh_xy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 18 20 22 24 26 28 30 0 2 4 6 8 10 12 14] + return v2 +} + +; block0: +; vpkh %v24, %v24, %v25 +; br %r14 + +function %shuffle_vpkg_yx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27] + return v2 +} + +; block0: +; vpkg %v24, %v25, %v24 +; br %r14 + +function %shuffle_vpkf_yx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 4 5 8 9 12 13 16 17 20 21 24 25 28 29] + return v2 +} + +; block0: +; vpkf %v24, %v25, %v24 +; br %r14 + +function %shuffle_vpkh_yx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30] + return v2 +} + +; block0: +; vpkh %v24, %v25, %v24 +; br %r14 + +function %shuffle_vpkg_xx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11] + return v2 +} + +; block0: +; vpkg %v24, %v24, %v24 +; br %r14 + +function %shuffle_vpkf_xx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 4 5 8 9 12 13 0 1 4 5 8 9 12 13] + return v2 +} + +; block0: +; vpkf %v24, %v24, %v24 +; br %r14 + +function %shuffle_vpkh_xx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 2 4 6 8 10 12 14 0 2 4 6 8 10 12 14] + return v2 +} + +; block0: +; vpkh %v24, %v24, %v24 +; br %r14 + +function %shuffle_vpkg_yy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 18 19 24 25 26 27 16 17 18 19 24 25 
26 27] + return v2 +} + +; block0: +; vpkg %v24, %v25, %v25 +; br %r14 + +function %shuffle_vpkf_yy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 20 21 24 25 28 29 16 17 20 21 24 25 28 29] + return v2 +} + +; block0: +; vpkf %v24, %v25, %v25 +; br %r14 + +function %shuffle_vpkh_yy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 18 20 22 24 26 28 30 16 18 20 22 24 26 28 30] + return v2 +} + +; block0: +; vpkh %v24, %v25, %v25 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-shift-rotate.clif b/cranelift/filetests/filetests/isa/s390x/vec-shift-rotate.clif new file mode 100644 index 000000000000..7713bd0f3340 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-shift-rotate.clif @@ -0,0 +1,427 @@ +test compile precise-output +target s390x + +function %rotr_i64x2_reg(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = rotr.i64x2 v0, v1 + return v2 +} + +; block0: +; lcr %r3, %r2 +; verllg %v24, %v24, 0(%r3) +; br %r14 + +function %rotr_i64x2_imm(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i32 17 + v2 = rotr.i64x2 v0, v1 + return v2 +} + +; block0: +; verllg %v24, %v24, 47 +; br %r14 + +function %rotr_i32x4_reg(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = rotr.i32x4 v0, v1 + return v2 +} + +; block0: +; lcr %r3, %r2 +; verllf %v24, %v24, 0(%r3) +; br %r14 + +function %rotr_i32x4_imm(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 17 + v2 = rotr.i32x4 v0, v1 + return v2 +} + +; block0: +; verllf %v24, %v24, 15 +; br %r14 + +function %rotr_i16x8_reg(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = rotr.i16x8 v0, v1 + return v2 +} + +; block0: +; lcr %r3, %r2 +; verllh %v24, %v24, 0(%r3) +; br %r14 + +function %rotr_i16x8_imm(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i32 10 + v2 = rotr.i16x8 v0, v1 + return v2 +} + +; block0: +; verllh %v24, %v24, 6 +; br %r14 + +function %rotr_i8x16_reg(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = rotr.i8x16 v0, v1 + return v2 +} + +; block0: +; lcr %r3, %r2 +; verllb %v24, %v24, 0(%r3) +; br %r14 + +function %rotr_i8x16_imm(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i32 3 + v2 = rotr.i8x16 v0, v1 + return v2 +} + +; block0: +; verllb %v24, %v24, 5 +; br %r14 + +function %rotl_i64x2_reg(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = rotl.i64x2 v0, v1 + return v2 +} + +; block0: +; verllg %v24, %v24, 0(%r2) +; br %r14 + +function %rotl_i64x2_imm(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i32 17 + v2 = rotl.i64x2 v0, v1 + return v2 +} + +; block0: +; verllg %v24, %v24, 17 +; br %r14 + +function %rotl_i32x4_reg(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = rotl.i32x4 v0, v1 + return v2 +} + +; block0: +; verllf %v24, %v24, 0(%r2) +; br %r14 + +function %rotl_i32x4_imm(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 17 + v2 = rotl.i32x4 v0, v1 + return v2 +} + +; block0: +; verllf %v24, %v24, 17 +; br %r14 + +function %rotl_i16x8_reg(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = rotl.i16x8 v0, v1 + return v2 +} + +; block0: +; verllh %v24, %v24, 0(%r2) +; br %r14 + +function %rotl_i16x8_imm(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i32 10 + v2 = rotl.i16x8 v0, v1 + return v2 +} + +; block0: +; verllh %v24, %v24, 10 +; br %r14 + +function %rotl_i8x16_reg(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = rotl.i8x16 v0, v1 + return v2 +} + +; block0: +; verllb %v24, %v24, 0(%r2) +; br %r14 + +function 
%rotl_i8x16_imm(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i32 3 + v2 = rotl.i8x16 v0, v1 + return v2 +} + +; block0: +; verllb %v24, %v24, 3 +; br %r14 + +function %ushr_i64x2_reg(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = ushr.i64x2 v0, v1 + return v2 +} + +; block0: +; vesrlg %v24, %v24, 0(%r2) +; br %r14 + +function %ushr_i64x2_imm(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i32 17 + v2 = ushr.i64x2 v0, v1 + return v2 +} + +; block0: +; vesrlg %v24, %v24, 17 +; br %r14 + +function %ushr_i32x4_reg(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = ushr.i32x4 v0, v1 + return v2 +} + +; block0: +; vesrlf %v24, %v24, 0(%r2) +; br %r14 + +function %ushr_i32x4_imm(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 17 + v2 = ushr.i32x4 v0, v1 + return v2 +} + +; block0: +; vesrlf %v24, %v24, 17 +; br %r14 + +function %ushr_i16x8_reg(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = ushr.i16x8 v0, v1 + return v2 +} + +; block0: +; vesrlh %v24, %v24, 0(%r2) +; br %r14 + +function %ushr_i16x8_imm(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i32 10 + v2 = ushr.i16x8 v0, v1 + return v2 +} + +; block0: +; vesrlh %v24, %v24, 10 +; br %r14 + +function %ushr_i8x16_reg(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = ushr.i8x16 v0, v1 + return v2 +} + +; block0: +; vesrlb %v24, %v24, 0(%r2) +; br %r14 + +function %ushr_i8x16_imm(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i32 3 + v2 = ushr.i8x16 v0, v1 + return v2 +} + +; block0: +; vesrlb %v24, %v24, 3 +; br %r14 + +function %ishl_i64x2_reg(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = ishl.i64x2 v0, v1 + return v2 +} + +; block0: +; veslg %v24, %v24, 0(%r2) +; br %r14 + +function %ishl_i64x2_imm(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i32 17 + v2 = ishl.i64x2 v0, v1 + return v2 +} + +; block0: +; veslg %v24, %v24, 17 +; br %r14 + +function %ishl_i32x4_reg(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = ishl.i32x4 v0, v1 + return v2 +} + +; block0: +; veslf %v24, %v24, 0(%r2) +; br %r14 + +function %ishl_i32x4_imm(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 17 + v2 = ishl.i32x4 v0, v1 + return v2 +} + +; block0: +; veslf %v24, %v24, 17 +; br %r14 + +function %ishl_i16x8_reg(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = ishl.i16x8 v0, v1 + return v2 +} + +; block0: +; veslh %v24, %v24, 0(%r2) +; br %r14 + +function %ishl_i16x8_imm(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i32 10 + v2 = ishl.i16x8 v0, v1 + return v2 +} + +; block0: +; veslh %v24, %v24, 10 +; br %r14 + +function %ishl_i8x16_reg(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = ishl.i8x16 v0, v1 + return v2 +} + +; block0: +; veslb %v24, %v24, 0(%r2) +; br %r14 + +function %ishl_i8x16_imm(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i32 3 + v2 = ishl.i8x16 v0, v1 + return v2 +} + +; block0: +; veslb %v24, %v24, 3 +; br %r14 + +function %sshr_i64x2_reg(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = sshr.i64x2 v0, v1 + return v2 +} + +; block0: +; vesrag %v24, %v24, 0(%r2) +; br %r14 + +function %sshr_i64x2_imm(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i32 17 + v2 = sshr.i64x2 v0, v1 + return v2 +} + +; block0: +; vesrag %v24, %v24, 17 +; br %r14 + +function %sshr_i32x4_reg(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = sshr.i32x4 v0, v1 + return v2 +} + +; block0: +; vesraf %v24, %v24, 0(%r2) +; br %r14 + +function %sshr_i32x4_imm(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 17 + v2 = 
sshr.i32x4 v0, v1 + return v2 +} + +; block0: +; vesraf %v24, %v24, 17 +; br %r14 + +function %sshr_i16x8_reg(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = sshr.i16x8 v0, v1 + return v2 +} + +; block0: +; vesrah %v24, %v24, 0(%r2) +; br %r14 + +function %sshr_i16x8_imm(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i32 10 + v2 = sshr.i16x8 v0, v1 + return v2 +} + +; block0: +; vesrah %v24, %v24, 10 +; br %r14 + +function %sshr_i8x16_reg(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = sshr.i8x16 v0, v1 + return v2 +} + +; block0: +; vesrab %v24, %v24, 0(%r2) +; br %r14 + +function %sshr_i8x16_imm(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i32 3 + v2 = sshr.i8x16 v0, v1 + return v2 +} + +; block0: +; vesrab %v24, %v24, 3 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vecmem-arch13.clif b/cranelift/filetests/filetests/isa/s390x/vecmem-arch13.clif new file mode 100644 index 000000000000..a60e7b619476 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vecmem-arch13.clif @@ -0,0 +1,375 @@ +test compile precise-output +target s390x arch13 + +function %uload8x8_big(i64) -> i16x8 { +block0(v0: i64): + v1 = uload8x8 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; vuplhb %v24, %v3 +; br %r14 + +function %uload16x4_big(i64) -> i32x4 { +block0(v0: i64): + v1 = uload16x4 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; vuplhh %v24, %v3 +; br %r14 + +function %uload32x2_big(i64) -> i64x2 { +block0(v0: i64): + v1 = uload32x2 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; vuplhf %v24, %v3 +; br %r14 + +function %sload8x8_big(i64) -> i16x8 { +block0(v0: i64): + v1 = sload8x8 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; vuphb %v24, %v3 +; br %r14 + +function %sload16x4_big(i64) -> i32x4 { +block0(v0: i64): + v1 = sload16x4 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; vuphh %v24, %v3 +; br %r14 + +function %sload32x2_big(i64) -> i64x2 { +block0(v0: i64): + v1 = sload32x2 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; vuphf %v24, %v3 +; br %r14 + +function %load_i8x16_big(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8x16 big v0 + return v1 +} + +; block0: +; vl %v24, 0(%r2) +; br %r14 + +function %load_i16x8_big(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16x8 big v0 + return v1 +} + +; block0: +; vl %v24, 0(%r2) +; br %r14 + +function %load_i32x4_big(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32x4 big v0 + return v1 +} + +; block0: +; vl %v24, 0(%r2) +; br %r14 + +function %load_i64x2_big(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64x2 big v0 + return v1 +} + +; block0: +; vl %v24, 0(%r2) +; br %r14 + +function %load_f32x4_big(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32x4 big v0 + return v1 +} + +; block0: +; vl %v24, 0(%r2) +; br %r14 + +function %load_f64x2_big(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64x2 big v0 + return v1 +} + +; block0: +; vl %v24, 0(%r2) +; br %r14 + +function %store_i8x16_big(i8x16, i64) { +block0(v0: i8x16, v1: i64): + store.i8x16 big v0, v1 + return +} + +; block0: +; vst %v24, 0(%r2) +; br %r14 + +function %store_i16x8_big(i16x8, i64) { +block0(v0: i16x8, v1: i64): + store.i16x8 big v0, v1 + return +} + +; block0: +; vst %v24, 0(%r2) +; br %r14 + +function %store_i32x4_big(i32x4, i64) { +block0(v0: i32x4, v1: i64): + store.i32x4 big v0, v1 + return +} + +; block0: +; vst %v24, 0(%r2) +; br %r14 + +function %store_i64x2_big(i64x2, i64) { +block0(v0: i64x2, v1: i64): + store.i64x2 big v0, v1 + return +} + +; block0: +; vst %v24, 0(%r2) +; br 
%r14 + +function %store_f32x4_big(f32x4, i64) { +block0(v0: f32x4, v1: i64): + store.f32x4 big v0, v1 + return +} + +; block0: +; vst %v24, 0(%r2) +; br %r14 + +function %store_f64x2_big(f64x2, i64) { +block0(v0: f64x2, v1: i64): + store.f64x2 big v0, v1 + return +} + +; block0: +; vst %v24, 0(%r2) +; br %r14 + +function %uload8x8_little(i64) -> i16x8 { +block0(v0: i64): + v1 = uload8x8 little v0 + return v1 +} + +; block0: +; vlebrg %v3, 0(%r2), 0 +; vuplhb %v24, %v3 +; br %r14 + +function %uload16x4_little(i64) -> i32x4 { +block0(v0: i64): + v1 = uload16x4 little v0 + return v1 +} + +; block0: +; vlebrg %v3, 0(%r2), 0 +; vuplhh %v24, %v3 +; br %r14 + +function %uload32x2_little(i64) -> i64x2 { +block0(v0: i64): + v1 = uload32x2 little v0 + return v1 +} + +; block0: +; vlebrg %v3, 0(%r2), 0 +; vuplhf %v24, %v3 +; br %r14 + +function %sload8x8_little(i64) -> i16x8 { +block0(v0: i64): + v1 = sload8x8 little v0 + return v1 +} + +; block0: +; vlebrg %v3, 0(%r2), 0 +; vuphb %v24, %v3 +; br %r14 + +function %sload16x4_little(i64) -> i32x4 { +block0(v0: i64): + v1 = sload16x4 little v0 + return v1 +} + +; block0: +; vlebrg %v3, 0(%r2), 0 +; vuphh %v24, %v3 +; br %r14 + +function %sload32x2_little(i64) -> i64x2 { +block0(v0: i64): + v1 = sload32x2 little v0 + return v1 +} + +; block0: +; vlebrg %v3, 0(%r2), 0 +; vuphf %v24, %v3 +; br %r14 + +function %load_i8x16_little(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8x16 little v0 + return v1 +} + +; block0: +; vlbrq %v24, 0(%r2) +; br %r14 + +function %load_i16x8_little(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16x8 little v0 + return v1 +} + +; block0: +; vlbrq %v24, 0(%r2) +; br %r14 + +function %load_i32x4_little(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32x4 little v0 + return v1 +} + +; block0: +; vlbrq %v24, 0(%r2) +; br %r14 + +function %load_i64x2_little(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64x2 little v0 + return v1 +} + +; block0: +; vlbrq %v24, 0(%r2) +; br %r14 + +function %load_f32x4_little(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32x4 little v0 + return v1 +} + +; block0: +; vlbrq %v24, 0(%r2) +; br %r14 + +function %load_f64x2_little(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64x2 little v0 + return v1 +} + +; block0: +; vlbrq %v24, 0(%r2) +; br %r14 + +function %store_i8x16_little(i8x16, i64) { +block0(v0: i8x16, v1: i64): + store.i8x16 little v0, v1 + return +} + +; block0: +; vstbrq %v24, 0(%r2) +; br %r14 + +function %store_i16x8_little(i16x8, i64) { +block0(v0: i16x8, v1: i64): + store.i16x8 little v0, v1 + return +} + +; block0: +; vstbrq %v24, 0(%r2) +; br %r14 + +function %store_i32x4_little(i32x4, i64) { +block0(v0: i32x4, v1: i64): + store.i32x4 little v0, v1 + return +} + +; block0: +; vstbrq %v24, 0(%r2) +; br %r14 + +function %store_i64x2_little(i64x2, i64) { +block0(v0: i64x2, v1: i64): + store.i64x2 little v0, v1 + return +} + +; block0: +; vstbrq %v24, 0(%r2) +; br %r14 + +function %store_f32x4_little(f32x4, i64) { +block0(v0: f32x4, v1: i64): + store.f32x4 little v0, v1 + return +} + +; block0: +; vstbrq %v24, 0(%r2) +; br %r14 + +function %store_f64x2_little(f64x2, i64) { +block0(v0: f64x2, v1: i64): + store.f64x2 little v0, v1 + return +} + +; block0: +; vstbrq %v24, 0(%r2) +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vecmem.clif b/cranelift/filetests/filetests/isa/s390x/vecmem.clif new file mode 100644 index 000000000000..5cb297dde83b --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vecmem.clif @@ -0,0 +1,463 @@ +test compile precise-output +target 
s390x + +function %uload8x8_big(i64) -> i16x8 { +block0(v0: i64): + v1 = uload8x8 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; vuplhb %v24, %v3 +; br %r14 + +function %uload16x4_big(i64) -> i32x4 { +block0(v0: i64): + v1 = uload16x4 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; vuplhh %v24, %v3 +; br %r14 + +function %uload32x2_big(i64) -> i64x2 { +block0(v0: i64): + v1 = uload32x2 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; vuplhf %v24, %v3 +; br %r14 + +function %sload8x8_big(i64) -> i16x8 { +block0(v0: i64): + v1 = sload8x8 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; vuphb %v24, %v3 +; br %r14 + +function %sload16x4_big(i64) -> i32x4 { +block0(v0: i64): + v1 = sload16x4 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; vuphh %v24, %v3 +; br %r14 + +function %sload32x2_big(i64) -> i64x2 { +block0(v0: i64): + v1 = sload32x2 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; vuphf %v24, %v3 +; br %r14 + +function %load_i8x16_big(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8x16 big v0 + return v1 +} + +; block0: +; vl %v24, 0(%r2) +; br %r14 + +function %load_i16x8_big(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16x8 big v0 + return v1 +} + +; block0: +; vl %v24, 0(%r2) +; br %r14 + +function %load_i32x4_big(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32x4 big v0 + return v1 +} + +; block0: +; vl %v24, 0(%r2) +; br %r14 + +function %load_i64x2_big(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64x2 big v0 + return v1 +} + +; block0: +; vl %v24, 0(%r2) +; br %r14 + +function %load_f32x4_big(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32x4 big v0 + return v1 +} + +; block0: +; vl %v24, 0(%r2) +; br %r14 + +function %load_f64x2_big(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64x2 big v0 + return v1 +} + +; block0: +; vl %v24, 0(%r2) +; br %r14 + +function %store_i8x16_big(i8x16, i64) { +block0(v0: i8x16, v1: i64): + store.i8x16 big v0, v1 + return +} + +; block0: +; vst %v24, 0(%r2) +; br %r14 + +function %store_i16x8_big(i16x8, i64) { +block0(v0: i16x8, v1: i64): + store.i16x8 big v0, v1 + return +} + +; block0: +; vst %v24, 0(%r2) +; br %r14 + +function %store_i32x4_big(i32x4, i64) { +block0(v0: i32x4, v1: i64): + store.i32x4 big v0, v1 + return +} + +; block0: +; vst %v24, 0(%r2) +; br %r14 + +function %store_i64x2_big(i64x2, i64) { +block0(v0: i64x2, v1: i64): + store.i64x2 big v0, v1 + return +} + +; block0: +; vst %v24, 0(%r2) +; br %r14 + +function %store_f32x4_big(f32x4, i64) { +block0(v0: f32x4, v1: i64): + store.f32x4 big v0, v1 + return +} + +; block0: +; vst %v24, 0(%r2) +; br %r14 + +function %store_f64x2_big(f64x2, i64) { +block0(v0: f64x2, v1: i64): + store.f64x2 big v0, v1 + return +} + +; block0: +; vst %v24, 0(%r2) +; br %r14 + +function %uload8x8_little(i64) -> i16x8 { +block0(v0: i64): + v1 = uload8x8 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; vuplhb %v24, %v5 +; br %r14 + +function %uload16x4_little(i64) -> i32x4 { +block0(v0: i64): + v1 = uload16x4 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; vuplhh %v24, %v5 +; br %r14 + +function %uload32x2_little(i64) -> i64x2 { +block0(v0: i64): + v1 = uload32x2 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; vuplhf %v24, %v5 +; br %r14 + +function %sload8x8_little(i64) -> i16x8 { +block0(v0: i64): + v1 = sload8x8 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; vuphb %v24, %v5 +; br %r14 + +function %sload16x4_little(i64) -> i32x4 { 
+block0(v0: i64): + v1 = sload16x4 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; vuphh %v24, %v5 +; br %r14 + +function %sload32x2_little(i64) -> i64x2 { +block0(v0: i64): + v1 = sload32x2 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; vuphf %v24, %v5 +; br %r14 + +function %load_i8x16_little(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8x16 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; lrvg %r3, 8(%r2) +; vlvgp %v24, %r3, %r5 +; br %r14 + +function %load_i16x8_little(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16x8 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; lrvg %r3, 8(%r2) +; vlvgp %v24, %r3, %r5 +; br %r14 + +function %load_i32x4_little(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32x4 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; lrvg %r3, 8(%r2) +; vlvgp %v24, %r3, %r5 +; br %r14 + +function %load_i64x2_little(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64x2 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; lrvg %r3, 8(%r2) +; vlvgp %v24, %r3, %r5 +; br %r14 + +function %load_f32x4_little(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32x4 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; lrvg %r3, 8(%r2) +; vlvgp %v24, %r3, %r5 +; br %r14 + +function %load_f64x2_little(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64x2 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; lrvg %r3, 8(%r2) +; vlvgp %v24, %r3, %r5 +; br %r14 + +function %load_f64x2_sum_little(i64, i64) -> f64x2 { +block0(v0: i64, v1: i64): + v2 = iadd.i64 v0, v1 + v3 = load.f64x2 little v2 + return v3 +} + +; block0: +; lrvg %r4, 0(%r3,%r2) +; lrvg %r5, 8(%r3,%r2) +; vlvgp %v24, %r5, %r4 +; br %r14 + +function %load_f64x2_off_little(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64x2 little v0+128 + return v1 +} + +; block0: +; lrvg %r5, 128(%r2) +; lrvg %r3, 136(%r2) +; vlvgp %v24, %r3, %r5 +; br %r14 + +function %store_i8x16_little(i8x16, i64) { +block0(v0: i8x16, v1: i64): + store.i8x16 little v0, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; vlgvg %r4, %v24, 0 +; strvg %r3, 0(%r2) +; strvg %r4, 8(%r2) +; br %r14 + +function %store_i16x8_little(i16x8, i64) { +block0(v0: i16x8, v1: i64): + store.i16x8 little v0, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; vlgvg %r4, %v24, 0 +; strvg %r3, 0(%r2) +; strvg %r4, 8(%r2) +; br %r14 + +function %store_i32x4_little(i32x4, i64) { +block0(v0: i32x4, v1: i64): + store.i32x4 little v0, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; vlgvg %r4, %v24, 0 +; strvg %r3, 0(%r2) +; strvg %r4, 8(%r2) +; br %r14 + +function %store_i64x2_little(i64x2, i64) { +block0(v0: i64x2, v1: i64): + store.i64x2 little v0, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; vlgvg %r4, %v24, 0 +; strvg %r3, 0(%r2) +; strvg %r4, 8(%r2) +; br %r14 + +function %store_f32x4_little(f32x4, i64) { +block0(v0: f32x4, v1: i64): + store.f32x4 little v0, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; vlgvg %r4, %v24, 0 +; strvg %r3, 0(%r2) +; strvg %r4, 8(%r2) +; br %r14 + +function %store_f64x2_little(f64x2, i64) { +block0(v0: f64x2, v1: i64): + store.f64x2 little v0, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; vlgvg %r4, %v24, 0 +; strvg %r3, 0(%r2) +; strvg %r4, 8(%r2) +; br %r14 + +function %store_f64x2_sum_little(f64x2, i64, i64) { +block0(v0: f64x2, v1: i64, v2: i64): + v3 = iadd.i64 v1, v2 + store.f64x2 little v0, v3 + return +} + +; block0: +; vlgvg %r5, %v24, 1 +; vlgvg %r4, %v24, 0 +; strvg %r5, 0(%r3,%r2) +; strvg 
%r4, 8(%r3,%r2) +; br %r14 + +function %store_f64x2_off_little(f64x2, i64) { +block0(v0: f64x2, v1: i64): + store.f64x2 little v0, v1+128 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; vlgvg %r4, %v24, 0 +; strvg %r3, 128(%r2) +; strvg %r4, 136(%r2) +; br %r14 + diff --git a/cranelift/filetests/filetests/runtests/fmax-pseudo.clif b/cranelift/filetests/filetests/runtests/fmax-pseudo.clif index acd7b290dd8c..f5bf2a002ad1 100644 --- a/cranelift/filetests/filetests/runtests/fmax-pseudo.clif +++ b/cranelift/filetests/filetests/runtests/fmax-pseudo.clif @@ -2,6 +2,7 @@ test interpret test run target x86_64 target aarch64 +; target s390x FIXME: This currently fails under qemu due to a qemu bug function %fmax_p_f32(f32, f32) -> f32 { block0(v0: f32, v1: f32): diff --git a/cranelift/filetests/filetests/runtests/fmin-max-pseudo-vector.clif b/cranelift/filetests/filetests/runtests/fmin-max-pseudo-vector.clif index 9bbba57559ab..520f3aaff85b 100644 --- a/cranelift/filetests/filetests/runtests/fmin-max-pseudo-vector.clif +++ b/cranelift/filetests/filetests/runtests/fmin-max-pseudo-vector.clif @@ -1,7 +1,7 @@ test run -; target s390x TODO: Not yet implemented on s390x set enable_simd target aarch64 +; target s390x FIXME: This currently fails under qemu due to a qemu bug target x86_64 skylake function %fmin_pseudo_f32x4(f32x4, f32x4) -> f32x4 { diff --git a/cranelift/filetests/filetests/runtests/fmin-pseudo.clif b/cranelift/filetests/filetests/runtests/fmin-pseudo.clif index fc88e34611f1..cb4857d8daba 100644 --- a/cranelift/filetests/filetests/runtests/fmin-pseudo.clif +++ b/cranelift/filetests/filetests/runtests/fmin-pseudo.clif @@ -2,6 +2,7 @@ test interpret test run target x86_64 target aarch64 +; target s390x FIXME: This currently fails under qemu due to a qemu bug function %fmin_p_f32(f32, f32) -> f32 { block0(v0: f32, v1: f32): diff --git a/cranelift/filetests/filetests/runtests/shifts-small-types.clif b/cranelift/filetests/filetests/runtests/shifts-small-types.clif index eae20a8ef01e..9b2207a3933d 100644 --- a/cranelift/filetests/filetests/runtests/shifts-small-types.clif +++ b/cranelift/filetests/filetests/runtests/shifts-small-types.clif @@ -1,7 +1,8 @@ test run target aarch64 +target s390x -; TODO: Merge this with the main shifts file when x86_64 & s390x passes these. +; TODO: Merge this with the main shifts file when x86_64 passes these. function %ishl_i16_i64(i16, i64) -> i16 { block0(v0: i16, v1: i64): diff --git a/cranelift/filetests/filetests/runtests/simd-arithmetic-nondeterministic-aarch64.clif b/cranelift/filetests/filetests/runtests/simd-arithmetic-nondeterministic-aarch64.clif index f657d1e5336e..92ffddeef20f 100644 --- a/cranelift/filetests/filetests/runtests/simd-arithmetic-nondeterministic-aarch64.clif +++ b/cranelift/filetests/filetests/runtests/simd-arithmetic-nondeterministic-aarch64.clif @@ -3,6 +3,7 @@ ; simd-arithmetic-nondeterministic*.clif as well. 
test run target aarch64 +target s390x function %fmax_f64x2(f64x2, f64x2) -> f64x2 { block0(v0: f64x2, v1: f64x2): diff --git a/cranelift/filetests/filetests/runtests/simd-arithmetic.clif b/cranelift/filetests/filetests/runtests/simd-arithmetic.clif index 1ca8e8fcfa3b..58a0dc1c21f6 100644 --- a/cranelift/filetests/filetests/runtests/simd-arithmetic.clif +++ b/cranelift/filetests/filetests/runtests/simd-arithmetic.clif @@ -1,6 +1,6 @@ test run target aarch64 -; target s390x TODO: Not yet implemented on s390x +target s390x set enable_simd target x86_64 skylake diff --git a/cranelift/filetests/filetests/runtests/simd-bitselect-to-vselect.clif b/cranelift/filetests/filetests/runtests/simd-bitselect-to-vselect.clif index a2086b0426b7..4021e89fee42 100644 --- a/cranelift/filetests/filetests/runtests/simd-bitselect-to-vselect.clif +++ b/cranelift/filetests/filetests/runtests/simd-bitselect-to-vselect.clif @@ -1,6 +1,6 @@ test run target aarch64 -; target s390x TODO: Not yet implemented on s390x +target s390x set opt_level=speed_and_size set enable_simd target x86_64 skylake diff --git a/cranelift/filetests/filetests/runtests/simd-bitselect.clif b/cranelift/filetests/filetests/runtests/simd-bitselect.clif index 3d67ff290504..18027373f8c8 100644 --- a/cranelift/filetests/filetests/runtests/simd-bitselect.clif +++ b/cranelift/filetests/filetests/runtests/simd-bitselect.clif @@ -1,6 +1,7 @@ test run set enable_simd target aarch64 +target s390x target x86_64 has_sse3 has_ssse3 has_sse41 function %bitselect_i32x4(i32x4, i32x4, i32x4) -> i32x4 { diff --git a/cranelift/filetests/filetests/runtests/simd-bitwise-run.clif b/cranelift/filetests/filetests/runtests/simd-bitwise-run.clif index af7b24d5e64a..ce3ffa5321df 100644 --- a/cranelift/filetests/filetests/runtests/simd-bitwise-run.clif +++ b/cranelift/filetests/filetests/runtests/simd-bitwise-run.clif @@ -1,6 +1,7 @@ test run set enable_simd target aarch64 +; target s390x FIXME: s390x implements modulo semantics for shift counts target x86_64 skylake ; TODO: once available, replace all lane extraction with `icmp + all_ones` diff --git a/cranelift/filetests/filetests/runtests/simd-bitwise.clif b/cranelift/filetests/filetests/runtests/simd-bitwise.clif index 670844db22bf..251f9516c15d 100644 --- a/cranelift/filetests/filetests/runtests/simd-bitwise.clif +++ b/cranelift/filetests/filetests/runtests/simd-bitwise.clif @@ -1,6 +1,6 @@ test run target aarch64 -; target s390x TODO: Not yet implemented on s390x +; target s390x FIXME: s390x implements modulo semantics for shift counts set enable_simd target x86_64 skylake diff --git a/cranelift/filetests/filetests/runtests/simd-comparison.clif b/cranelift/filetests/filetests/runtests/simd-comparison.clif index cd8341127d3f..dd8c6a80b2ef 100644 --- a/cranelift/filetests/filetests/runtests/simd-comparison.clif +++ b/cranelift/filetests/filetests/runtests/simd-comparison.clif @@ -1,6 +1,6 @@ test run target aarch64 -; target s390x TODO: Not yet implemented on s390x +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-conversion.clif b/cranelift/filetests/filetests/runtests/simd-conversion.clif index 2903a34e1680..da8bced238af 100644 --- a/cranelift/filetests/filetests/runtests/simd-conversion.clif +++ b/cranelift/filetests/filetests/runtests/simd-conversion.clif @@ -1,6 +1,6 @@ test run target aarch64 -; target s390x TODO: Not yet implemented on s390x +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git 
a/cranelift/filetests/filetests/runtests/simd-extractlane.clif b/cranelift/filetests/filetests/runtests/simd-extractlane.clif index 6aba1c67ba61..471130f8c252 100644 --- a/cranelift/filetests/filetests/runtests/simd-extractlane.clif +++ b/cranelift/filetests/filetests/runtests/simd-extractlane.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-iabs.clif b/cranelift/filetests/filetests/runtests/simd-iabs.clif index 95b53844c624..b9d6468f9c51 100644 --- a/cranelift/filetests/filetests/runtests/simd-iabs.clif +++ b/cranelift/filetests/filetests/runtests/simd-iabs.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-iaddpairwise.clif b/cranelift/filetests/filetests/runtests/simd-iaddpairwise.clif index 92f5d776fe6d..3deeb6cddd20 100644 --- a/cranelift/filetests/filetests/runtests/simd-iaddpairwise.clif +++ b/cranelift/filetests/filetests/runtests/simd-iaddpairwise.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x function %iaddp_i8x16(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): diff --git a/cranelift/filetests/filetests/runtests/simd-insertlane.clif b/cranelift/filetests/filetests/runtests/simd-insertlane.clif index f3624f1bf19f..e0965d8324e3 100644 --- a/cranelift/filetests/filetests/runtests/simd-insertlane.clif +++ b/cranelift/filetests/filetests/runtests/simd-insertlane.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 @@ -30,4 +31,4 @@ block0(v0: i64x2, v1: i64): v2 = insertlane v0, v1, 0 return v2 } -; run: %insertlane_0([1 1], 5000000000) == [5000000000 1] \ No newline at end of file +; run: %insertlane_0([1 1], 5000000000) == [5000000000 1] diff --git a/cranelift/filetests/filetests/runtests/simd-lane-access.clif b/cranelift/filetests/filetests/runtests/simd-lane-access.clif index 0818bdd85b89..d43a0e20cf63 100644 --- a/cranelift/filetests/filetests/runtests/simd-lane-access.clif +++ b/cranelift/filetests/filetests/runtests/simd-lane-access.clif @@ -1,6 +1,6 @@ test run target aarch64 -; target s390x TODO: Not yet implemented on s390x +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-logical.clif b/cranelift/filetests/filetests/runtests/simd-logical.clif index 081d4892de81..406ea9698ddd 100644 --- a/cranelift/filetests/filetests/runtests/simd-logical.clif +++ b/cranelift/filetests/filetests/runtests/simd-logical.clif @@ -1,6 +1,6 @@ test run target aarch64 -; target s390x TODO: Not yet implemented on s390x +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-min-max.clif b/cranelift/filetests/filetests/runtests/simd-min-max.clif index 54653616ffbf..7a4cc0a0784d 100644 --- a/cranelift/filetests/filetests/runtests/simd-min-max.clif +++ b/cranelift/filetests/filetests/runtests/simd-min-max.clif @@ -1,6 +1,7 @@ test run target aarch64 target x86_64 +target s390x function %imin_i8x16(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): diff --git a/cranelift/filetests/filetests/runtests/simd-saddsat.clif b/cranelift/filetests/filetests/runtests/simd-saddsat.clif index 515cc83a4a2e..104041204932 100644 --- 
a/cranelift/filetests/filetests/runtests/simd-saddsat.clif +++ b/cranelift/filetests/filetests/runtests/simd-saddsat.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-shuffle.clif b/cranelift/filetests/filetests/runtests/simd-shuffle.clif index b7850a578165..eaabb23768cf 100644 --- a/cranelift/filetests/filetests/runtests/simd-shuffle.clif +++ b/cranelift/filetests/filetests/runtests/simd-shuffle.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-snarrow.clif b/cranelift/filetests/filetests/runtests/simd-snarrow.clif index 082e86c179f2..86d3ee2100bc 100644 --- a/cranelift/filetests/filetests/runtests/simd-snarrow.clif +++ b/cranelift/filetests/filetests/runtests/simd-snarrow.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-splat.clif b/cranelift/filetests/filetests/runtests/simd-splat.clif index 19892cb29b7c..1cfef52c78b4 100644 --- a/cranelift/filetests/filetests/runtests/simd-splat.clif +++ b/cranelift/filetests/filetests/runtests/simd-splat.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-sqmulroundsat-aarch64.clif b/cranelift/filetests/filetests/runtests/simd-sqmulroundsat-aarch64.clif index f6809ddc5c80..91554360b664 100644 --- a/cranelift/filetests/filetests/runtests/simd-sqmulroundsat-aarch64.clif +++ b/cranelift/filetests/filetests/runtests/simd-sqmulroundsat-aarch64.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x ;; x86_64 hasn't implemented this for `i32x4` function %sqmulrs_i32x4(i32x4, i32x4) -> i32x4 { diff --git a/cranelift/filetests/filetests/runtests/simd-sqmulroundsat.clif b/cranelift/filetests/filetests/runtests/simd-sqmulroundsat.clif index 723696d25a8f..d7d3ffec7b28 100644 --- a/cranelift/filetests/filetests/runtests/simd-sqmulroundsat.clif +++ b/cranelift/filetests/filetests/runtests/simd-sqmulroundsat.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-ssubsat.clif b/cranelift/filetests/filetests/runtests/simd-ssubsat.clif index 8841f2275f66..1a517b483a8e 100644 --- a/cranelift/filetests/filetests/runtests/simd-ssubsat.clif +++ b/cranelift/filetests/filetests/runtests/simd-ssubsat.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif b/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif index 1d9c15581b76..169c9122e376 100644 --- a/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif +++ b/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-swidenlow.clif b/cranelift/filetests/filetests/runtests/simd-swidenlow.clif index bee577072d13..6c014ad4a4d0 100644 --- 
a/cranelift/filetests/filetests/runtests/simd-swidenlow.clif +++ b/cranelift/filetests/filetests/runtests/simd-swidenlow.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-swizzle.clif b/cranelift/filetests/filetests/runtests/simd-swizzle.clif index 390780879c77..e1c7fba879da 100644 --- a/cranelift/filetests/filetests/runtests/simd-swizzle.clif +++ b/cranelift/filetests/filetests/runtests/simd-swizzle.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-uaddsat.clif b/cranelift/filetests/filetests/runtests/simd-uaddsat.clif index d0af940abdee..5610d4d36c3a 100644 --- a/cranelift/filetests/filetests/runtests/simd-uaddsat.clif +++ b/cranelift/filetests/filetests/runtests/simd-uaddsat.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-unarrow.clif b/cranelift/filetests/filetests/runtests/simd-unarrow.clif index e535df5e0778..f15a3217570c 100644 --- a/cranelift/filetests/filetests/runtests/simd-unarrow.clif +++ b/cranelift/filetests/filetests/runtests/simd-unarrow.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-usubsat.clif b/cranelift/filetests/filetests/runtests/simd-usubsat.clif index ca8747c3e900..55a85c8c895b 100644 --- a/cranelift/filetests/filetests/runtests/simd-usubsat.clif +++ b/cranelift/filetests/filetests/runtests/simd-usubsat.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-uunarrow.clif b/cranelift/filetests/filetests/runtests/simd-uunarrow.clif index b2a68c44802f..11ff104db149 100644 --- a/cranelift/filetests/filetests/runtests/simd-uunarrow.clif +++ b/cranelift/filetests/filetests/runtests/simd-uunarrow.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x ; x86_64 panics: `Did not match fcvt input! 
; thread 'worker #0' panicked at 'register allocation: Analysis(EntryLiveinValues([v2V]))', cranelift/codegen/src/machinst/compile.rs:96:10` diff --git a/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif b/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif index 959b6acd7336..aaf8d4102f99 100644 --- a/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif +++ b/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif b/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif index fab64406b946..90f14bb1d331 100644 --- a/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif +++ b/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-valltrue.clif b/cranelift/filetests/filetests/runtests/simd-valltrue.clif index c39a2702e6a4..c799893ac8e2 100644 --- a/cranelift/filetests/filetests/runtests/simd-valltrue.clif +++ b/cranelift/filetests/filetests/runtests/simd-valltrue.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x target x86_64 function %vall_true_b8x16(b8x16) -> b1 { diff --git a/cranelift/filetests/filetests/runtests/simd-vanytrue.clif b/cranelift/filetests/filetests/runtests/simd-vanytrue.clif index 74b99d785e4c..28e1c60a7d50 100644 --- a/cranelift/filetests/filetests/runtests/simd-vanytrue.clif +++ b/cranelift/filetests/filetests/runtests/simd-vanytrue.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x target x86_64 function %vany_true_b8x16(b8x16) -> b1 { diff --git a/cranelift/filetests/filetests/runtests/simd-vconst.clif b/cranelift/filetests/filetests/runtests/simd-vconst.clif index b2398b6ec01a..5aa5386484f4 100644 --- a/cranelift/filetests/filetests/runtests/simd-vconst.clif +++ b/cranelift/filetests/filetests/runtests/simd-vconst.clif @@ -1,5 +1,5 @@ test run -; target s390x TODO: Not yet implemented on s390x +target s390x target aarch64 set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-vhighbits.clif b/cranelift/filetests/filetests/runtests/simd-vhighbits.clif index e4ed0e42cf7e..d22abb702414 100644 --- a/cranelift/filetests/filetests/runtests/simd-vhighbits.clif +++ b/cranelift/filetests/filetests/runtests/simd-vhighbits.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-vselect.clif b/cranelift/filetests/filetests/runtests/simd-vselect.clif index 53ef6f6353d7..db5f9180433e 100644 --- a/cranelift/filetests/filetests/runtests/simd-vselect.clif +++ b/cranelift/filetests/filetests/runtests/simd-vselect.clif @@ -1,6 +1,6 @@ test interpret test run -; target s390x TODO: Not yet implemented on s390x +target s390x target aarch64 set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-wideningpairwisedotproducts.clif b/cranelift/filetests/filetests/runtests/simd-wideningpairwisedotproducts.clif index dcfaba0294a9..c38099c429f0 100644 --- a/cranelift/filetests/filetests/runtests/simd-wideningpairwisedotproducts.clif +++ 
b/cranelift/filetests/filetests/runtests/simd-wideningpairwisedotproducts.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd_compare_zero.clif b/cranelift/filetests/filetests/runtests/simd_compare_zero.clif index d3a8c655d171..445ccbcc148b 100644 --- a/cranelift/filetests/filetests/runtests/simd_compare_zero.clif +++ b/cranelift/filetests/filetests/runtests/simd_compare_zero.clif @@ -1,5 +1,6 @@ test run target aarch64 +target s390x ; raw_bitcast is needed to get around issue with "bint" on aarch64