From 8f941331dc294d28f95f65119f850f627a8e6a5f Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Fri, 13 Sep 2024 16:59:50 -0500 Subject: [PATCH] x64: Add support for the BMI2 `mulx` instruction (#9248) This commit adds backend support for the `mulx` instruction in the BMI2 instruction set which supports arbitrary destination registers. This instruction also doesn't read/clobber flags so it can help pipeline instructions in niche situations. --- cranelift/codegen/src/isa/x64/inst.isle | 35 ++++++++ cranelift/codegen/src/isa/x64/inst/emit.rs | 36 ++++++++ cranelift/codegen/src/isa/x64/inst/mod.rs | 36 ++++++++ cranelift/codegen/src/isa/x64/lower.isle | 8 ++ cranelift/codegen/src/isa/x64/lower/isle.rs | 5 ++ cranelift/codegen/src/isa/x64/pcc.rs | 7 ++ .../filetests/filetests/isa/x64/bmi2.clif | 84 +++++++++++++++++++ .../filetests/runtests/i128-arithmetic.clif | 1 + .../filetests/runtests/smul_overflow.clif | 3 +- .../filetests/filetests/runtests/smulhi.clif | 2 + .../filetests/runtests/umul_overflow.clif | 3 +- .../filetests/filetests/runtests/umulhi.clif | 2 + 12 files changed, 220 insertions(+), 2 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index 385afe68d481..9b32cc4d64e0 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -99,6 +99,21 @@ (dst_lo WritableGpr) (dst_hi WritableGpr)) + ;; Same as `Mul`, but for the BMI2 `mulx` instruction. This differs in + ;; that the two `dst_*` registers can be arbitrary registers and the + ;; multiplication is always unsigned. Note that this instruction does + ;; not modify or read flags. + ;; + ;; Note that `dst_hi` here is always a valid register but `dst_lo` + ;; is allowed to be `invalid_reg` to indicate that only the high + ;; bits are desired. If `dst_lo` is invalid then the instruction itself + ;; will only define `dst_hi`. 
+ (MulX (size OperandSize) + (src1 Gpr) + (src2 GprMem) + (dst_lo WritableGpr) + (dst_hi WritableGpr)) + ;; Same as `Mul` but the 16-bit multiplication result is stored in `AX`. (Mul8 (signed bool) (src1 Gpr) @@ -2636,6 +2651,26 @@ (_ Unit (emit (MInst.Mul size signed src1 src2 dst_lo dst_hi)))) (value_gprs dst_lo dst_hi))) +;; Special case the `mulx` pattern with the BMI2 instruction set. +(rule 1 (x64_mul (ty_32_or_64 ty) $false src1 src2) + (if-let $true (use_bmi2)) + (let ((dst_lo WritableGpr (temp_writable_gpr)) + (dst_hi WritableGpr (temp_writable_gpr)) + (size OperandSize (raw_operand_size_of_type ty)) + (_ Unit (emit (MInst.MulX size src1 src2 dst_lo dst_hi)))) + (value_gprs dst_lo dst_hi))) + +(decl x64_mulx_hi (Type Gpr GprMem) Gpr) +(rule (x64_mulx_hi (ty_32_or_64 ty) src1 src2) + (let ((dst WritableGpr (temp_writable_gpr)) + (size OperandSize (raw_operand_size_of_type ty)) + (_ Unit (emit (MInst.MulX size src1 src2 (writable_invalid_gpr) dst)))) + dst)) + +;; Get the invalid register as writable +(decl writable_invalid_gpr () WritableGpr) +(extern constructor writable_invalid_gpr writable_invalid_gpr) + ;; Helper for creating `mul` instructions or `imul` instructions (depending ;; on `signed`) for 8-bit operands. 
(decl x64_mul8 (bool Gpr GprMem) Gpr) diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 77e0ec45c6bd..68a3b39b77e6 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -682,6 +682,42 @@ pub(crate) fn emit( emit_simm(sink, imm_size, *src2 as u32); } + Inst::MulX { + size, + src1, + src2, + dst_lo, + dst_hi, + } => { + let src1 = src1.to_reg(); + let dst_lo = dst_lo.to_reg().to_reg(); + let dst_hi = dst_hi.to_reg().to_reg(); + debug_assert_eq!(src1, regs::rdx()); + let src2 = match src2.clone().to_reg_mem().clone() { + RegMem::Reg { reg } => { + RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into()) + } + RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)), + }; + + let dst_hi = dst_hi.to_real_reg().unwrap().hw_enc(); + let dst_lo = if dst_lo.is_invalid_sentinel() { + dst_hi + } else { + dst_lo.to_real_reg().unwrap().hw_enc() + }; + + VexInstruction::new() + .prefix(LegacyPrefixes::_F2) + .map(OpcodeMap::_0F38) + .w(*size == OperandSize::Size64) + .opcode(0xf6) + .reg(dst_hi) + .vvvv(dst_lo) + .rm(src2) + .encode(sink); + } + Inst::SignExtendData { size, src, dst } => { let src = src.to_reg(); let dst = dst.to_reg().to_reg(); diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 208cf98ace4d..dc3f15a057e1 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -179,6 +179,8 @@ impl Inst { | Inst::GprToXmmVex { op, .. } | Inst::CvtIntToFloatVex { op, .. } | Inst::XmmCmpRmRVex { op, .. } => op.available_from(), + + Inst::MulX { .. 
} => smallvec![InstructionSet::BMI2], } } } @@ -823,6 +825,26 @@ impl PrettyPrint for Inst { format!("{op} {src1}, {src2}, {dst_lo}, {dst_hi}") } + Inst::MulX { + size, + src1, + src2, + dst_lo, + dst_hi, + } => { + let src1 = pretty_print_reg(src1.to_reg(), size.to_bytes()); + let dst_hi = pretty_print_reg(dst_hi.to_reg().to_reg(), size.to_bytes()); + let dst_lo = if dst_lo.to_reg().is_invalid_sentinel() { + dst_hi.clone() + } else { + pretty_print_reg(dst_lo.to_reg().to_reg(), size.to_bytes()) + }; + let src2 = src2.pretty_print(size.to_bytes()); + let suffix = suffix_bwlq(*size); + let op = ljustify(format!("mulx{suffix}")); + format!("{op} {src1}, {src2}, {dst_lo}, {dst_hi}") + } + Inst::Mul8 { signed, src1, @@ -1969,6 +1991,20 @@ fn x64_get_operands(inst: &mut Inst, collector: &mut impl OperandVisitor) { collector.reg_def(dst); src1.get_operands(collector); } + Inst::MulX { + src1, + src2, + dst_lo, + dst_hi, + .. + } => { + if !dst_lo.to_reg().is_invalid_sentinel() { + collector.reg_def(dst_lo); + } + collector.reg_def(dst_hi); + collector.reg_fixed_use(src1, regs::rdx()); + src2.get_operands(collector); + } Inst::SignExtendData { size, src, dst } => { match size { OperandSize::Size8 => { diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 55f89f45390a..c42cb37d5f66 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -4354,6 +4354,14 @@ (rule 1 (lower (umulhi a @ (value_type (ty_int_ref_16_to_64 ty)) b)) (value_regs_get_gpr (x64_mul ty $false a b) 1)) +;; The BMI2 instruction set introduced `mulx` which defines two registers but +;; if the two registers are the same then it only defines the upper bits. This +;; helps slightly reduce register pressure by ensuring only one register here is +;; clobbered. 
+(rule 2 (lower (umulhi a @ (value_type (ty_32_or_64 ty)) b)) + (if-let $true (use_bmi2)) + (x64_mulx_hi ty a b)) + ;; Rules for `smulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule 0 (lower (smulhi a @ (value_type $I8) b)) diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs index 2c49b73acae6..3ed4e4cb4ccc 100644 --- a/cranelift/codegen/src/isa/x64/lower/isle.rs +++ b/cranelift/codegen/src/isa/x64/lower/isle.rs @@ -990,6 +990,11 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { let mask = -1i128 as u128; self.emit_u128_le_const(mask ^ (0xff << (hole_idx * 8))) } + + fn writable_invalid_gpr(&mut self) -> WritableGpr { + let reg = Gpr::new(self.invalid_reg()).unwrap(); + WritableGpr::from_reg(reg) + } } impl IsleContext<'_, '_, MInst, X64Backend> { diff --git a/cranelift/codegen/src/isa/x64/pcc.rs b/cranelift/codegen/src/isa/x64/pcc.rs index 813016165948..326708bf0269 100644 --- a/cranelift/codegen/src/isa/x64/pcc.rs +++ b/cranelift/codegen/src/isa/x64/pcc.rs @@ -252,6 +252,13 @@ pub(crate) fn check( dst_hi, ref src2, .. + } + | Inst::MulX { + size, + dst_lo, + dst_hi, + ref src2, + .. 
} => { match <&RegMem>::from(src2) { RegMem::Mem { ref addr } => { diff --git a/cranelift/filetests/filetests/isa/x64/bmi2.clif b/cranelift/filetests/filetests/isa/x64/bmi2.clif index 134853ed393e..2d145fe531a8 100644 --- a/cranelift/filetests/filetests/isa/x64/bmi2.clif +++ b/cranelift/filetests/filetests/isa/x64/bmi2.clif @@ -1,4 +1,5 @@ test compile precise-output +set enable_llvm_abi_extensions=true target x86_64 has_bmi2 function %sarx_i32(i32, i32) -> i32 { @@ -346,3 +347,86 @@ block0(v0: i64, v1: i32): ; popq %rbp ; retq +function %extend_to_i128_mul_mulxq(i64, i64) -> i128 { +block0(v0: i64, v1: i64): + v2 = uextend.i128 v0 + v3 = uextend.i128 v1 + v4 = imul v2, v3 + return v4 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rdx +; mulxq %rdx, %rsi, %rax, %rdx +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %rdx +; mulxq %rsi, %rax, %rdx +; movq %rbp, %rsp +; popq %rbp +; retq + +function %umulhi_mulxl(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = umulhi v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rdx +; mulxl %edx, %esi, %eax, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %rdx +; mulxl %esi, %eax, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %umulhi_mulxq(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = umulhi v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rdx +; mulxq %rdx, %rsi, %rax, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %rdx +; mulxq %rsi, %rax, %rax +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/runtests/i128-arithmetic.clif 
b/cranelift/filetests/filetests/runtests/i128-arithmetic.clif index e0f0a246faa9..5a2703deccac 100644 --- a/cranelift/filetests/filetests/runtests/i128-arithmetic.clif +++ b/cranelift/filetests/filetests/runtests/i128-arithmetic.clif @@ -4,6 +4,7 @@ set enable_llvm_abi_extensions=true target aarch64 target s390x target x86_64 +target x86_64 has_bmi2 target riscv64 target riscv64 has_c has_zcb diff --git a/cranelift/filetests/filetests/runtests/smul_overflow.clif b/cranelift/filetests/filetests/runtests/smul_overflow.clif index d61cae79b409..b70b8ca65abd 100644 --- a/cranelift/filetests/filetests/runtests/smul_overflow.clif +++ b/cranelift/filetests/filetests/runtests/smul_overflow.clif @@ -1,6 +1,7 @@ test interpret test run target x86_64 +target x86_64 has_bmi2 target aarch64 function %smulof_i64(i64, i64) -> i64, i8 { @@ -73,4 +74,4 @@ block0(v0: i8, v1: i8): ; run: %smulof_i8(0x7F, 0x7F) == [1, 1] ; run: %smulof_i8(0x80, 0x7F) == [0x80, 1] ; run: %smulof_i8(0x01, 0xFE) == [0xFE, 0] -; run: %smulof_i8(0xC0, 0xDE) == [0x80, 1] \ No newline at end of file +; run: %smulof_i8(0xC0, 0xDE) == [0x80, 1] diff --git a/cranelift/filetests/filetests/runtests/smulhi.clif b/cranelift/filetests/filetests/runtests/smulhi.clif index df9f13adf377..7cc05a1c0f18 100644 --- a/cranelift/filetests/filetests/runtests/smulhi.clif +++ b/cranelift/filetests/filetests/runtests/smulhi.clif @@ -2,6 +2,8 @@ test interpret test run target aarch64 target s390x +target x86_64 +target x86_64 has_bmi2 target x86_64 has_sse3 has_ssse3 has_sse41 target x86_64 has_sse3 has_ssse3 has_sse41 has_avx target riscv64 diff --git a/cranelift/filetests/filetests/runtests/umul_overflow.clif b/cranelift/filetests/filetests/runtests/umul_overflow.clif index a76feb267d07..6f765ab48868 100644 --- a/cranelift/filetests/filetests/runtests/umul_overflow.clif +++ b/cranelift/filetests/filetests/runtests/umul_overflow.clif @@ -1,6 +1,7 @@ test interpret test run target x86_64 +target x86_64 has_bmi2 target aarch64 
function %umulof_i64(i64, i64) -> i64, i8 { @@ -65,4 +66,4 @@ block0(v0: i8, v1: i8): ; run: %umulof_i8(0x7F, 0x7F) == [1, 1] ; run: %umulof_i8(0x80, 0x7F) == [0x80, 1] ; run: %umulof_i8(0x01, 0xFE) == [0xFE, 0] -; run: %umulof_i8(0xC0, 0xDE) == [0x80, 1] \ No newline at end of file +; run: %umulof_i8(0xC0, 0xDE) == [0x80, 1] diff --git a/cranelift/filetests/filetests/runtests/umulhi.clif b/cranelift/filetests/filetests/runtests/umulhi.clif index 51d4eee45d92..2d6a1d5c3774 100644 --- a/cranelift/filetests/filetests/runtests/umulhi.clif +++ b/cranelift/filetests/filetests/runtests/umulhi.clif @@ -1,7 +1,9 @@ test interpret test run target aarch64 +target x86_64 target x86_64 has_sse3 has_ssse3 has_sse41 +target x86_64 has_bmi2 target s390x target riscv64 target riscv64 has_c has_zcb