diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index 8abee1ddb0fb..54886b010ed2 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -287,6 +287,30 @@ fn enc_vec_rrr(top11: u32, rm: Reg, bit15_10: u32, rn: Reg, rd: Writable<Reg>) -
         | machreg_to_vec(rd.to_reg())
 }
 
+fn enc_vec_rrr_long(
+    q: u32,
+    u: u32,
+    size: u32,
+    bit14: u32,
+    rm: Reg,
+    rn: Reg,
+    rd: Writable<Reg>,
+) -> u32 {
+    debug_assert_eq!(q & 0b1, q);
+    debug_assert_eq!(u & 0b1, u);
+    debug_assert_eq!(size & 0b11, size);
+    debug_assert_eq!(bit14 & 0b1, bit14);
+
+    0b0_0_0_01110_00_1_00000_100000_00000_00000
+        | q << 30
+        | u << 29
+        | size << 22
+        | bit14 << 14
+        | (machreg_to_vec(rm) << 16)
+        | (machreg_to_vec(rn) << 5)
+        | machreg_to_vec(rd.to_reg())
+}
+
 fn enc_bit_rr(size: u32, opcode2: u32, opcode1: u32, rn: Reg, rd: Writable<Reg>) -> u32 {
     (0b01011010110 << 21)
         | size << 31
@@ -2173,6 +2197,34 @@ impl MachInstEmit for Inst {
 
                 sink.put4(enc_vec_rr_pair(bits_12_16, rd, rn));
             }
+            &Inst::VecRRRLong {
+                rd,
+                rn,
+                rm,
+                alu_op,
+                high_half,
+            } => {
+                let (u, size, bit14) = match alu_op {
+                    VecRRRLongOp::Smull8 => (0b0, 0b00, 0b1),
+                    VecRRRLongOp::Smull16 => (0b0, 0b01, 0b1),
+                    VecRRRLongOp::Smull32 => (0b0, 0b10, 0b1),
+                    VecRRRLongOp::Umull8 => (0b1, 0b00, 0b1),
+                    VecRRRLongOp::Umull16 => (0b1, 0b01, 0b1),
+                    VecRRRLongOp::Umull32 => (0b1, 0b10, 0b1),
+                    VecRRRLongOp::Umlal8 => (0b1, 0b00, 0b0),
+                    VecRRRLongOp::Umlal16 => (0b1, 0b01, 0b0),
+                    VecRRRLongOp::Umlal32 => (0b1, 0b10, 0b0),
+                };
+                sink.put4(enc_vec_rrr_long(
+                    high_half as u32,
+                    u,
+                    size,
+                    bit14,
+                    rm,
+                    rn,
+                    rd,
+                ));
+            }
             &Inst::VecRRR {
                 rd,
                 rn,
@@ -2242,13 +2294,7 @@
                     VecALUOp::Fmin => (0b000_01110_10_1, 0b111101),
                     VecALUOp::Fmul => (0b001_01110_00_1, 0b110111),
                     VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111),
-                    VecALUOp::Umlal => {
-                        debug_assert!(!size.is_128bits());
-                        (0b001_01110_00_1 | enc_size << 1, 0b100000)
-                    }
                     VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110),
-                    VecALUOp::Smull => (0b000_01110_00_1 | enc_size << 1, 0b110000),
-                    VecALUOp::Smull2 => (0b010_01110_00_1 | enc_size << 1, 0b110000),
                     VecALUOp::Sqrdmulh => {
                         debug_assert!(
                             size.lane_size() == ScalarSize::Size16
@@ -2258,12 +2304,12 @@
                         (0b001_01110_00_1 | enc_size << 1, 0b101101)
                     }
                 };
-                let top11 = match alu_op {
-                    VecALUOp::Smull | VecALUOp::Smull2 => top11,
-                    _ if is_float => top11 | (q << 9) | enc_float_size << 1,
-                    _ => top11 | (q << 9),
+                let top11 = if is_float {
+                    top11 | enc_float_size << 1
+                } else {
+                    top11
                 };
-                sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
+                sink.put4(enc_vec_rrr(top11 | q << 9, rm, bit15_10, rn, rd));
             }
             &Inst::VecLoadReplicate { rd, rn, size } => {
                 let (q, size) = size.enc_size();
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index 53f68a994a0b..d3afca2a777e 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -3651,18 +3651,6 @@ fn test_aarch64_binemit() {
         "addp v8.4s, v12.4s, v14.4s",
     ));
 
-    insns.push((
-        Inst::VecRRR {
-            alu_op: VecALUOp::Umlal,
-            rd: writable_vreg(9),
-            rn: vreg(20),
-            rm: vreg(17),
-            size: VectorSize::Size32x2,
-        },
-        "8982B12E",
-        "umlal v9.2d, v20.2s, v17.2s",
-    ));
-
     insns.push((
         Inst::VecRRR {
             alu_op: VecALUOp::Zip1,
@@ -3712,77 +3700,221 @@ fn test_aarch64_binemit() {
     ));
 
     insns.push((
-
Inst::VecRRR { - alu_op: VecALUOp::Smull, + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Smull8, rd: writable_vreg(16), rn: vreg(12), rm: vreg(1), - size: VectorSize::Size8x16, + high_half: false, }, "90C1210E", "smull v16.8h, v12.8b, v1.8b", )); insns.push(( - Inst::VecRRR { - alu_op: VecALUOp::Smull, + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Umull8, + rd: writable_vreg(15), + rn: vreg(11), + rm: vreg(2), + high_half: false, + }, + "6FC1222E", + "umull v15.8h, v11.8b, v2.8b", + )); + + insns.push(( + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Umlal8, + rd: writable_vreg(4), + rn: vreg(8), + rm: vreg(16), + high_half: false, + }, + "0481302E", + "umlal v4.8h, v8.8b, v16.8b", + )); + + insns.push(( + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Smull16, rd: writable_vreg(2), rn: vreg(13), rm: vreg(6), - size: VectorSize::Size16x8, + high_half: false, }, "A2C1660E", "smull v2.4s, v13.4h, v6.4h", )); insns.push(( - Inst::VecRRR { - alu_op: VecALUOp::Smull, + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Umull16, + rd: writable_vreg(3), + rn: vreg(14), + rm: vreg(7), + high_half: false, + }, + "C3C1672E", + "umull v3.4s, v14.4h, v7.4h", + )); + + insns.push(( + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Umlal16, + rd: writable_vreg(7), + rn: vreg(14), + rm: vreg(21), + high_half: false, + }, + "C781752E", + "umlal v7.4s, v14.4h, v21.4h", + )); + + insns.push(( + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Smull32, rd: writable_vreg(8), rn: vreg(12), rm: vreg(14), - size: VectorSize::Size32x4, + high_half: false, }, "88C1AE0E", "smull v8.2d, v12.2s, v14.2s", )); insns.push(( - Inst::VecRRR { - alu_op: VecALUOp::Smull2, + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Umull32, + rd: writable_vreg(9), + rn: vreg(5), + rm: vreg(6), + high_half: false, + }, + "A9C0A62E", + "umull v9.2d, v5.2s, v6.2s", + )); + + insns.push(( + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Umlal32, + rd: writable_vreg(9), + rn: vreg(20), + rm: vreg(17), + high_half: false, + }, + "8982B12E", + "umlal v9.2d, v20.2s, v17.2s", + )); + + insns.push(( + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Smull8, rd: writable_vreg(16), rn: vreg(12), rm: vreg(1), - size: VectorSize::Size8x16, + high_half: true, }, "90C1214E", "smull2 v16.8h, v12.16b, v1.16b", )); insns.push(( - Inst::VecRRR { - alu_op: VecALUOp::Smull2, + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Umull8, + rd: writable_vreg(29), + rn: vreg(22), + rm: vreg(10), + high_half: true, + }, + "DDC22A6E", + "umull2 v29.8h, v22.16b, v10.16b", + )); + + insns.push(( + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Umlal8, + rd: writable_vreg(1), + rn: vreg(5), + rm: vreg(15), + high_half: true, + }, + "A1802F6E", + "umlal2 v1.8h, v5.16b, v15.16b", + )); + + insns.push(( + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Smull16, rd: writable_vreg(2), rn: vreg(13), rm: vreg(6), - size: VectorSize::Size16x8, + high_half: true, }, "A2C1664E", "smull2 v2.4s, v13.8h, v6.8h", )); insns.push(( - Inst::VecRRR { - alu_op: VecALUOp::Smull2, + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Umull16, + rd: writable_vreg(19), + rn: vreg(18), + rm: vreg(17), + high_half: true, + }, + "53C2716E", + "umull2 v19.4s, v18.8h, v17.8h", + )); + + insns.push(( + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Umlal16, + rd: writable_vreg(11), + rn: vreg(10), + rm: vreg(12), + high_half: true, + }, + "4B816C6E", + "umlal2 v11.4s, v10.8h, v12.8h", + )); + + insns.push(( + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Smull32, rd: writable_vreg(8), rn: vreg(12), rm: vreg(14), - size: VectorSize::Size32x4, + high_half: true, }, 
"88C1AE4E", "smull2 v8.2d, v12.4s, v14.4s", )); + insns.push(( + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Umull32, + rd: writable_vreg(4), + rn: vreg(12), + rm: vreg(16), + high_half: true, + }, + "84C1B06E", + "umull2 v4.2d, v12.4s, v16.4s", + )); + + insns.push(( + Inst::VecRRRLong { + alu_op: VecRRRLongOp::Umlal32, + rd: writable_vreg(10), + rn: vreg(29), + rm: vreg(2), + high_half: true, + }, + "AA83A26E", + "umlal2 v10.2d, v29.4s, v2.4s", + )); + insns.push(( Inst::VecRRR { alu_op: VecALUOp::Sqrdmulh, diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index 0312a7626ece..8c993492bd2e 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -303,14 +303,8 @@ pub enum VecALUOp { Fmul, /// Add pairwise Addp, - /// Unsigned multiply add long - Umlal, /// Zip vectors (primary) [meaning, high halves] Zip1, - /// Signed multiply long (low halves) - Smull, - /// Signed multiply long (high halves) - Smull2, /// Signed saturating rounding doubling multiply returning high half Sqrdmulh, } @@ -402,6 +396,22 @@ pub enum VecRRNarrowOp { Fcvtn64, } +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum VecRRRLongOp { + /// Signed multiply long. + Smull8, + Smull16, + Smull32, + /// Unsigned multiply long. + Umull8, + Umull16, + Umull32, + /// Unsigned multiply add long + Umlal8, + Umlal16, + Umlal32, +} + /// A vector operation on a pair of elements with one register. #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub enum VecPairOp { @@ -1087,6 +1097,16 @@ pub enum Inst { rn: Reg, }, + /// 2-operand vector instruction that produces a result with twice the + /// lane width and half the number of lanes. + VecRRRLong { + alu_op: VecRRRLongOp, + rd: Writable, + rn: Reg, + rm: Reg, + high_half: bool, + }, + /// A vector ALU op. VecRRR { alu_op: VecALUOp, @@ -2134,10 +2154,22 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { collector.add_def(rd); collector.add_use(rn); } + &Inst::VecRRRLong { + alu_op, rd, rn, rm, .. + } => { + match alu_op { + VecRRRLongOp::Umlal8 | VecRRRLongOp::Umlal16 | VecRRRLongOp::Umlal32 => { + collector.add_mod(rd) + } + _ => collector.add_def(rd), + }; + collector.add_use(rn); + collector.add_use(rm); + } &Inst::VecRRR { alu_op, rd, rn, rm, .. } => { - if alu_op == VecALUOp::Bsl || alu_op == VecALUOp::Umlal { + if alu_op == VecALUOp::Bsl { collector.add_mod(rd); } else { collector.add_def(rd); @@ -2944,6 +2976,22 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RUM) { map_def(mapper, rd); map_use(mapper, rn); } + &mut Inst::VecRRRLong { + alu_op, + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + match alu_op { + VecRRRLongOp::Umlal8 | VecRRRLongOp::Umlal16 | VecRRRLongOp::Umlal32 => { + map_mod(mapper, rd) + } + _ => map_def(mapper, rd), + }; + map_use(mapper, rn); + map_use(mapper, rm); + } &mut Inst::VecRRR { alu_op, ref mut rd, @@ -2951,7 +2999,7 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RUM) { ref mut rm, .. 
} => { - if alu_op == VecALUOp::Bsl || alu_op == VecALUOp::Umlal { + if alu_op == VecALUOp::Bsl { map_mod(mapper, rd); } else { map_def(mapper, rd); @@ -4147,24 +4195,80 @@ impl Inst { VecALUOp::Fmin => ("fmin", size), VecALUOp::Fmul => ("fmul", size), VecALUOp::Addp => ("addp", size), - VecALUOp::Umlal => ("umlal", size), VecALUOp::Zip1 => ("zip1", size), - VecALUOp::Smull => ("smull", size), - VecALUOp::Smull2 => ("smull2", size), VecALUOp::Sqrdmulh => ("sqrdmulh", size), }; - let rd_size = match alu_op { - VecALUOp::Umlal | VecALUOp::Smull | VecALUOp::Smull2 => size.widen(), - _ => size, - }; - let rn_size = match alu_op { - VecALUOp::Smull => size.halve(), - _ => size, + let rd = show_vreg_vector(rd.to_reg(), mb_rru, size); + let rn = show_vreg_vector(rn, mb_rru, size); + let rm = show_vreg_vector(rm, mb_rru, size); + format!("{} {}, {}, {}", op, rd, rn, rm) + } + &Inst::VecRRRLong { + rd, + rn, + rm, + alu_op, + high_half, + } => { + let (op, dest_size, src_size) = match (alu_op, high_half) { + (VecRRRLongOp::Smull8, false) => { + ("smull", VectorSize::Size16x8, VectorSize::Size8x8) + } + (VecRRRLongOp::Smull8, true) => { + ("smull2", VectorSize::Size16x8, VectorSize::Size8x16) + } + (VecRRRLongOp::Smull16, false) => { + ("smull", VectorSize::Size32x4, VectorSize::Size16x4) + } + (VecRRRLongOp::Smull16, true) => { + ("smull2", VectorSize::Size32x4, VectorSize::Size16x8) + } + (VecRRRLongOp::Smull32, false) => { + ("smull", VectorSize::Size64x2, VectorSize::Size32x2) + } + (VecRRRLongOp::Smull32, true) => { + ("smull2", VectorSize::Size64x2, VectorSize::Size32x4) + } + (VecRRRLongOp::Umull8, false) => { + ("umull", VectorSize::Size16x8, VectorSize::Size8x8) + } + (VecRRRLongOp::Umull8, true) => { + ("umull2", VectorSize::Size16x8, VectorSize::Size8x16) + } + (VecRRRLongOp::Umull16, false) => { + ("umull", VectorSize::Size32x4, VectorSize::Size16x4) + } + (VecRRRLongOp::Umull16, true) => { + ("umull2", VectorSize::Size32x4, VectorSize::Size16x8) + } + (VecRRRLongOp::Umull32, false) => { + ("umull", VectorSize::Size64x2, VectorSize::Size32x2) + } + (VecRRRLongOp::Umull32, true) => { + ("umull2", VectorSize::Size64x2, VectorSize::Size32x4) + } + (VecRRRLongOp::Umlal8, false) => { + ("umlal", VectorSize::Size16x8, VectorSize::Size8x8) + } + (VecRRRLongOp::Umlal8, true) => { + ("umlal2", VectorSize::Size16x8, VectorSize::Size8x16) + } + (VecRRRLongOp::Umlal16, false) => { + ("umlal", VectorSize::Size32x4, VectorSize::Size16x4) + } + (VecRRRLongOp::Umlal16, true) => { + ("umlal2", VectorSize::Size32x4, VectorSize::Size16x8) + } + (VecRRRLongOp::Umlal32, false) => { + ("umlal", VectorSize::Size64x2, VectorSize::Size32x2) + } + (VecRRRLongOp::Umlal32, true) => { + ("umlal2", VectorSize::Size64x2, VectorSize::Size32x4) + } }; - let rm_size = rn_size; - let rd = show_vreg_vector(rd.to_reg(), mb_rru, rd_size); - let rn = show_vreg_vector(rn, mb_rru, rn_size); - let rm = show_vreg_vector(rm, mb_rru, rm_size); + let rd = show_vreg_vector(rd.to_reg(), mb_rru, dest_size); + let rn = show_vreg_vector(rn, mb_rru, src_size); + let rm = show_vreg_vector(rm, mb_rru, src_size); format!("{} {}, {}, {}", op, rd, rn, rm) } &Inst::VecMisc { op, rd, rn, size } => { diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs index ededece15cd1..12535cf3826c 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.rs +++ b/cranelift/codegen/src/isa/aarch64/lower.rs @@ -1253,6 +1253,153 @@ pub(crate) fn maybe_input_insn_via_conv>( None } +/// Pattern match an extending vector 
multiplication.
+/// Returns a tuple of the opcode to use, the two input registers and whether
+/// it's the 'high half' version of the instruction.
+pub(crate) fn match_vec_long_mul<C: LowerCtx<I = Inst>>(
+    c: &mut C,
+    insn: IRInst,
+    ext_op: Opcode,
+) -> Option<(VecRRRLongOp, regalloc::Reg, regalloc::Reg, bool)> {
+    let inputs = insn_inputs(c, insn);
+    if let Some(lhs) = maybe_input_insn(c, inputs[0], ext_op) {
+        if let Some(rhs) = maybe_input_insn(c, inputs[1], ext_op) {
+            let lhs_input = insn_inputs(c, lhs)[0];
+            let rhs_input = insn_inputs(c, rhs)[0];
+            let rn = put_input_in_reg(c, lhs_input, NarrowValueMode::None);
+            let rm = put_input_in_reg(c, rhs_input, NarrowValueMode::None);
+            let lane_type = c.output_ty(insn, 0).lane_type();
+            match (lane_type, ext_op) {
+                (I16, Opcode::SwidenLow) => return Some((VecRRRLongOp::Smull8, rn, rm, false)),
+                (I16, Opcode::SwidenHigh) => return Some((VecRRRLongOp::Smull8, rn, rm, true)),
+                (I16, Opcode::UwidenLow) => return Some((VecRRRLongOp::Umull8, rn, rm, false)),
+                (I16, Opcode::UwidenHigh) => return Some((VecRRRLongOp::Umull8, rn, rm, true)),
+                (I32, Opcode::SwidenLow) => return Some((VecRRRLongOp::Smull16, rn, rm, false)),
+                (I32, Opcode::SwidenHigh) => return Some((VecRRRLongOp::Smull16, rn, rm, true)),
+                (I32, Opcode::UwidenLow) => return Some((VecRRRLongOp::Umull16, rn, rm, false)),
+                (I32, Opcode::UwidenHigh) => return Some((VecRRRLongOp::Umull16, rn, rm, true)),
+                (I64, Opcode::SwidenLow) => return Some((VecRRRLongOp::Smull32, rn, rm, false)),
+                (I64, Opcode::SwidenHigh) => return Some((VecRRRLongOp::Smull32, rn, rm, true)),
+                (I64, Opcode::UwidenLow) => return Some((VecRRRLongOp::Umull32, rn, rm, false)),
+                (I64, Opcode::UwidenHigh) => return Some((VecRRRLongOp::Umull32, rn, rm, true)),
+                _ => {}
+            };
+        }
+    }
+    None
+}
+
+pub(crate) fn lower_i64x2_mul<C: LowerCtx<I = Inst>>(c: &mut C, insn: IRInst) {
+    let inputs = insn_inputs(c, insn);
+    let outputs = insn_outputs(c, insn);
+    let rd = get_output_reg(c, outputs[0]).regs()[0];
+    let rn = put_input_in_regs(c, inputs[0]).regs()[0];
+    let rm = put_input_in_regs(c, inputs[1]).regs()[0];
+
+    let tmp1 = c.alloc_tmp(I64X2).only_reg().unwrap();
+    let tmp2 = c.alloc_tmp(I64X2).only_reg().unwrap();
+
+    // This I64X2 multiplication is performed with several 32-bit
+    // operations.
+
+    // 64-bit numbers x and y, can be represented as:
+    // x = a + 2^32(b)
+    // y = c + 2^32(d)
+
+    // A 64-bit multiplication is:
+    // x * y = ac + 2^32(ad + bc) + 2^64(bd)
+    // note: `2^64(bd)` can be ignored, the value is too large to fit in
+    // 64 bits.
+
+    // This sequence implements a I64X2 multiply, where the registers
+    // `rn` and `rm` are split up into 32-bit components:
+    // rn = |d|c|b|a|
+    // rm = |h|g|f|e|
+    //
+    // rn * rm = |cg + 2^32(ch + dg)|ae + 2^32(af + be)|
+    //
+    // The sequence is:
+    // rev64 rd.4s, rm.4s
+    // mul rd.4s, rd.4s, rn.4s
+    // xtn tmp1.2s, rn.2d
+    // addp rd.4s, rd.4s, rd.4s
+    // xtn tmp2.2s, rm.2d
+    // shll rd.2d, rd.2s, #32
+    // umlal rd.2d, tmp2.2s, tmp1.2s
+
+    // Reverse the 32-bit elements in the 64-bit words.
+    // rd = |g|h|e|f|
+    c.emit(Inst::VecMisc {
+        op: VecMisc2::Rev64,
+        rd,
+        rn: rm,
+        size: VectorSize::Size32x4,
+    });
+
+    // Calculate the high half components.
+    // rd = |dg|ch|be|af|
+    //
+    // Note that this 32-bit multiply of the high half
+    // discards the bits that would overflow, same as
+    // if 64-bit operations were used. Also the Shll
+    // below would shift out the overflow bits anyway.
+ c.emit(Inst::VecRRR { + alu_op: VecALUOp::Mul, + rd, + rn: rd.to_reg(), + rm: rn, + size: VectorSize::Size32x4, + }); + + // Extract the low half components of rn. + // tmp1 = |c|a| + c.emit(Inst::VecRRNarrow { + op: VecRRNarrowOp::Xtn64, + rd: tmp1, + rn, + high_half: false, + }); + + // Sum the respective high half components. + // rd = |dg+ch|be+af||dg+ch|be+af| + c.emit(Inst::VecRRR { + alu_op: VecALUOp::Addp, + rd: rd, + rn: rd.to_reg(), + rm: rd.to_reg(), + size: VectorSize::Size32x4, + }); + + // Extract the low half components of rm. + // tmp2 = |g|e| + c.emit(Inst::VecRRNarrow { + op: VecRRNarrowOp::Xtn64, + rd: tmp2, + rn: rm, + high_half: false, + }); + + // Shift the high half components, into the high half. + // rd = |dg+ch << 32|be+af << 32| + c.emit(Inst::VecRRLong { + op: VecRRLongOp::Shll32, + rd, + rn: rd.to_reg(), + high_half: false, + }); + + // Multiply the low components together, and accumulate with the high + // half. + // rd = |rd[1] + cg|rd[0] + ae| + c.emit(Inst::VecRRRLong { + alu_op: VecRRRLongOp::Umlal32, + rd, + rn: tmp2.to_reg(), + rm: tmp1.to_reg(), + high_half: false, + }); +} + /// Specifies what [lower_icmp] should do when lowering #[derive(Debug, Clone, PartialEq)] pub(crate) enum IcmpOutput { diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 2ea314b726f5..754e2f7b9501 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -244,174 +244,79 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::Imul => { - let lhs = put_input_in_regs(ctx, inputs[0]); - let rhs = put_input_in_regs(ctx, inputs[1]); - let dst = get_output_reg(ctx, outputs[0]); - - let rd = dst.regs()[0]; - let rn = lhs.regs()[0]; - let rm = rhs.regs()[0]; - let ty = ty.unwrap(); - match ty { - I128 => { - assert_eq!(lhs.len(), 2); - assert_eq!(rhs.len(), 2); - assert_eq!(dst.len(), 2); - - // 128bit mul formula: - // dst_lo = lhs_lo * rhs_lo - // dst_hi = umulhi(lhs_lo, rhs_lo) + (lhs_lo * rhs_hi) + (lhs_hi * rhs_lo) - // - // We can convert the above formula into the following - // umulh dst_hi, lhs_lo, rhs_lo - // madd dst_hi, lhs_lo, rhs_hi, dst_hi - // madd dst_hi, lhs_hi, rhs_lo, dst_hi - // mul dst_lo, lhs_lo, rhs_lo - - ctx.emit(Inst::AluRRR { - alu_op: ALUOp::UMulH, - rd: dst.regs()[1], - rn: lhs.regs()[0], - rm: rhs.regs()[0], - }); - ctx.emit(Inst::AluRRRR { - alu_op: ALUOp3::MAdd64, - rd: dst.regs()[1], - rn: lhs.regs()[0], - rm: rhs.regs()[1], - ra: dst.regs()[1].to_reg(), - }); - ctx.emit(Inst::AluRRRR { - alu_op: ALUOp3::MAdd64, - rd: dst.regs()[1], - rn: lhs.regs()[1], - rm: rhs.regs()[0], - ra: dst.regs()[1].to_reg(), - }); - ctx.emit(Inst::AluRRRR { - alu_op: ALUOp3::MAdd64, - rd: dst.regs()[0], - rn: lhs.regs()[0], - rm: rhs.regs()[0], - ra: zero_reg(), - }); - } - ty if !ty.is_vector() => { - let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64); - ctx.emit(Inst::AluRRRR { - alu_op, - rd, - rn, - rm, - ra: zero_reg(), - }); - } - I64X2 => { - let tmp1 = ctx.alloc_tmp(I64X2).only_reg().unwrap(); - let tmp2 = ctx.alloc_tmp(I64X2).only_reg().unwrap(); - - // This I64X2 multiplication is performed with several 32-bit - // operations. - - // 64-bit numbers x and y, can be represented as: - // x = a + 2^32(b) - // y = c + 2^32(d) - - // A 64-bit multiplication is: - // x * y = ac + 2^32(ad + bc) + 2^64(bd) - // note: `2^64(bd)` can be ignored, the value is too large to fit in - // 64 bits. 
- - // This sequence implements a I64X2 multiply, where the registers - // `rn` and `rm` are split up into 32-bit components: - // rn = |d|c|b|a| - // rm = |h|g|f|e| - // - // rn * rm = |cg + 2^32(ch + dg)|ae + 2^32(af + be)| - // - // The sequence is: - // rev64 rd.4s, rm.4s - // mul rd.4s, rd.4s, rn.4s - // xtn tmp1.2s, rn.2d - // addp rd.4s, rd.4s, rd.4s - // xtn tmp2.2s, rm.2d - // shll rd.2d, rd.2s, #32 - // umlal rd.2d, tmp2.2s, tmp1.2s - - // Reverse the 32-bit elements in the 64-bit words. - // rd = |g|h|e|f| - ctx.emit(Inst::VecMisc { - op: VecMisc2::Rev64, - rd, - rn: rm, - size: VectorSize::Size32x4, - }); - - // Calculate the high half components. - // rd = |dg|ch|be|af| - // - // Note that this 32-bit multiply of the high half - // discards the bits that would overflow, same as - // if 64-bit operations were used. Also the Shll - // below would shift out the overflow bits anyway. - ctx.emit(Inst::VecRRR { - alu_op: VecALUOp::Mul, - rd, - rn: rd.to_reg(), - rm: rn, - size: VectorSize::Size32x4, - }); - - // Extract the low half components of rn. - // tmp1 = |c|a| - ctx.emit(Inst::VecRRNarrow { - op: VecRRNarrowOp::Xtn64, - rd: tmp1, - rn, - high_half: false, - }); - - // Sum the respective high half components. - // rd = |dg+ch|be+af||dg+ch|be+af| - ctx.emit(Inst::VecRRR { - alu_op: VecALUOp::Addp, - rd: rd, - rn: rd.to_reg(), - rm: rd.to_reg(), - size: VectorSize::Size32x4, - }); - - // Extract the low half components of rm. - // tmp2 = |g|e| - ctx.emit(Inst::VecRRNarrow { - op: VecRRNarrowOp::Xtn64, - rd: tmp2, - rn: rm, - high_half: false, - }); + if ty == I128 { + let lhs = put_input_in_regs(ctx, inputs[0]); + let rhs = put_input_in_regs(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + assert_eq!(lhs.len(), 2); + assert_eq!(rhs.len(), 2); + assert_eq!(dst.len(), 2); - // Shift the high half components, into the high half. - // rd = |dg+ch << 32|be+af << 32| - ctx.emit(Inst::VecRRLong { - op: VecRRLongOp::Shll32, - rd, - rn: rd.to_reg(), - high_half: false, - }); + // 128bit mul formula: + // dst_lo = lhs_lo * rhs_lo + // dst_hi = umulhi(lhs_lo, rhs_lo) + (lhs_lo * rhs_hi) + (lhs_hi * rhs_lo) + // + // We can convert the above formula into the following + // umulh dst_hi, lhs_lo, rhs_lo + // madd dst_hi, lhs_lo, rhs_hi, dst_hi + // madd dst_hi, lhs_hi, rhs_lo, dst_hi + // mul dst_lo, lhs_lo, rhs_lo - // Multiply the low components together, and accumulate with the high - // half. 
- // rd = |rd[1] + cg|rd[0] + ae| - ctx.emit(Inst::VecRRR { - alu_op: VecALUOp::Umlal, - rd, - rn: tmp2.to_reg(), - rm: tmp1.to_reg(), - size: VectorSize::Size32x2, - }); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::UMulH, + rd: dst.regs()[1], + rn: lhs.regs()[0], + rm: rhs.regs()[0], + }); + ctx.emit(Inst::AluRRRR { + alu_op: ALUOp3::MAdd64, + rd: dst.regs()[1], + rn: lhs.regs()[0], + rm: rhs.regs()[1], + ra: dst.regs()[1].to_reg(), + }); + ctx.emit(Inst::AluRRRR { + alu_op: ALUOp3::MAdd64, + rd: dst.regs()[1], + rn: lhs.regs()[1], + rm: rhs.regs()[0], + ra: dst.regs()[1].to_reg(), + }); + ctx.emit(Inst::AluRRRR { + alu_op: ALUOp3::MAdd64, + rd: dst.regs()[0], + rn: lhs.regs()[0], + rm: rhs.regs()[0], + ra: zero_reg(), + }); + } else if ty.is_vector() { + for ext_op in &[ + Opcode::SwidenLow, + Opcode::SwidenHigh, + Opcode::UwidenLow, + Opcode::UwidenHigh, + ] { + if let Some((alu_op, rn, rm, high_half)) = + match_vec_long_mul(ctx, insn, *ext_op) + { + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + ctx.emit(Inst::VecRRRLong { + alu_op, + rd, + rn, + rm, + high_half, + }); + return Ok(()); + } } - ty if ty.is_vector() => { + if ty == I64X2 { + lower_i64x2_mul(ctx, insn); + } else { + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::VecRRR { alu_op: VecALUOp::Mul, rd, @@ -420,7 +325,18 @@ pub(crate) fn lower_insn_to_regs>( size: VectorSize::from_ty(ty), }); } - _ => panic!("Unable to emit mul for {}", ty), + } else { + let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64); + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + ctx.emit(Inst::AluRRRR { + alu_op, + rd, + rn, + rm, + ra: zero_reg(), + }); } } @@ -2740,19 +2656,19 @@ pub(crate) fn lower_insn_to_regs>( // => smull tmp, a, b // smull2 y, a, b // addp y, tmp, y - ctx.emit(Inst::VecRRR { - alu_op: VecALUOp::Smull, + ctx.emit(Inst::VecRRRLong { + alu_op: VecRRRLongOp::Smull16, rd: tmp, rn: r_a, rm: r_b, - size: VectorSize::Size16x8, + high_half: false, }); - ctx.emit(Inst::VecRRR { - alu_op: VecALUOp::Smull2, + ctx.emit(Inst::VecRRRLong { + alu_op: VecRRRLongOp::Smull16, rd: r_y, rn: r_a, rm: r_b, - size: VectorSize::Size16x8, + high_half: true, }); ctx.emit(Inst::VecRRR { alu_op: VecALUOp::Addp, diff --git a/cranelift/filetests/filetests/isa/aarch64/simd-extmul.clif b/cranelift/filetests/filetests/isa/aarch64/simd-extmul.clif new file mode 100644 index 000000000000..ca9b3e2fae21 --- /dev/null +++ b/cranelift/filetests/filetests/isa/aarch64/simd-extmul.clif @@ -0,0 +1,159 @@ +test compile +set unwind_info=false +target aarch64 + +function %fn1(i8x16, i8x16) -> i16x8 { +block0(v0: i8x16, v1: i8x16): + v2 = swiden_low v0 + v3 = swiden_low v1 + v4 = imul v2, v3 + return v4 +} + +; check-not: sxtl +; check: smull v0.8h, v0.8b, v1.8b +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %fn2(i8x16, i8x16) -> i16x8 { +block0(v0: i8x16, v1: i8x16): + v2 = swiden_high v0 + v3 = swiden_high v1 + v4 = imul v2, v3 + return v4 +} + +; check-not: sxtl +; check: smull2 v0.8h, v0.16b, v1.16b +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %fn3(i16x8, i16x8) -> i32x4 { +block0(v0: i16x8, v1: i16x8): + v2 = swiden_low v0 + v3 = swiden_low v1 + v4 = imul v2, v3 + return v4 +} + +; check-not: sxtl +; 
check: smull v0.4s, v0.4h, v1.4h
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %fn4(i16x8, i16x8) -> i32x4 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = swiden_high v0
+    v3 = swiden_high v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; check-not: sxtl
+; check: smull2 v0.4s, v0.8h, v1.8h
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %fn5(i32x4, i32x4) -> i64x2 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = swiden_low v0
+    v3 = swiden_low v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; check-not: sxtl
+; check: smull v0.2d, v0.2s, v1.2s
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %fn6(i32x4, i32x4) -> i64x2 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = swiden_high v0
+    v3 = swiden_high v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; check-not: sxtl
+; check: smull2 v0.2d, v0.4s, v1.4s
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %fn7(i8x16, i8x16) -> i16x8 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = uwiden_low v0
+    v3 = uwiden_low v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; check-not: uxtl
+; check: umull v0.8h, v0.8b, v1.8b
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %fn8(i8x16, i8x16) -> i16x8 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = uwiden_high v0
+    v3 = uwiden_high v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; check-not: uxtl
+; check: umull2 v0.8h, v0.16b, v1.16b
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %fn9(i16x8, i16x8) -> i32x4 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = uwiden_low v0
+    v3 = uwiden_low v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; check-not: uxtl
+; check: umull v0.4s, v0.4h, v1.4h
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %fn10(i16x8, i16x8) -> i32x4 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = uwiden_high v0
+    v3 = uwiden_high v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; check-not: uxtl
+; check: umull2 v0.4s, v0.8h, v1.8h
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %fn11(i32x4, i32x4) -> i64x2 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = uwiden_low v0
+    v3 = uwiden_low v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; check-not: uxtl
+; check: umull v0.2d, v0.2s, v1.2s
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %fn12(i32x4, i32x4) -> i64x2 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = uwiden_high v0
+    v3 = uwiden_high v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; check-not: uxtl2
+; check: umull2 v0.2d, v0.4s, v1.4s
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs
index 8be5b24d8ec9..864cb10f9d66 100644
--- a/cranelift/wasm/src/code_translator.rs
+++ b/cranelift/wasm/src/code_translator.rs
@@ -1908,7 +1908,6 @@ pub fn translate_operator(
         }
         Operator::I16x8Q15MulrSatS => {
             let (a, b) = pop2_with_bitcast(state, I16X8, builder);
-            state.push1(builder.ins().sqmul_round_sat(a, b));
             state.push1(builder.ins().sqmul_round_sat(a, b))
         }
         Operator::I16x8ExtMulLowI8x16S => {