Skip to content

Commit

Permalink
x64: Add support for the BMI2 mulx instruction (#9248)
Browse files Browse the repository at this point in the history
This commit adds backend support for the `mulx` instruction in the BMI2
instruction set which supports arbitrary destination registers. This
instruction also doesn't read/clobber flags so it can help pipeline
instructions in niche situations.
  • Loading branch information
alexcrichton authored Sep 13, 2024
1 parent b402cea commit 8f94133
Show file tree
Hide file tree
Showing 12 changed files with 220 additions and 2 deletions.
35 changes: 35 additions & 0 deletions cranelift/codegen/src/isa/x64/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,21 @@
(dst_lo WritableGpr)
(dst_hi WritableGpr))

;; Same as `Mul`, but for the BMI2 `mulx` instruction. It differs from
;; `Mul` in that the two `dst_*` registers can be arbitrary registers and
;; the multiplication is always unsigned. Note that this instruction
;; neither reads nor modifies flags.
;;
;; `src1` is constrained to `%rdx` when operands are collected, and the
;; emitter asserts that it was allocated there.
;;
;; Note that `dst_hi` here is always a valid register, but `dst_lo`
;; is allowed to be `invalid_reg` to indicate that only the high
;; bits are desired. If `dst_lo` is invalid then the instruction itself
;; will only define `dst_hi`.
(MulX (size OperandSize)
(src1 Gpr)
(src2 GprMem)
(dst_lo WritableGpr)
(dst_hi WritableGpr))

;; Same as `Mul` but the 16-bit multiplication result is stored in `AX`.
(Mul8 (signed bool)
(src1 Gpr)
Expand Down Expand Up @@ -2636,6 +2651,26 @@
(_ Unit (emit (MInst.Mul size signed src1 src2 dst_lo dst_hi))))
(value_gprs dst_lo dst_hi)))

;; Special case the `mulx` pattern with the BMI2 instruction set.
;;
;; This rule only matches 32- and 64-bit types with `$false` as the second
;; argument of `x64_mul`, and only when `use_bmi2` holds. Both halves of the
;; product are requested, so two fresh temporaries are allocated and returned
;; as a (lo, hi) pair via `value_gprs`.
(rule 1 (x64_mul (ty_32_or_64 ty) $false src1 src2)
(if-let $true (use_bmi2))
(let ((dst_lo WritableGpr (temp_writable_gpr))
(dst_hi WritableGpr (temp_writable_gpr))
(size OperandSize (raw_operand_size_of_type ty))
(_ Unit (emit (MInst.MulX size src1 src2 dst_lo dst_hi))))
(value_gprs dst_lo dst_hi)))

;; Helper for emitting a `mulx` that produces only the high half of the
;; product. The low destination is passed as the invalid register, so the
;; emitted `MInst.MulX` defines just the single temporary `dst`, which is
;; returned.
(decl x64_mulx_hi (Type Gpr GprMem) Gpr)
(rule (x64_mulx_hi (ty_32_or_64 ty) src1 src2)
(let ((dst WritableGpr (temp_writable_gpr))
(size OperandSize (raw_operand_size_of_type ty))
(_ Unit (emit (MInst.MulX size src1 src2 (writable_invalid_gpr) dst))))
dst))

;; Returns the invalid register wrapped as a `WritableGpr`. It is used as a
;; placeholder for a destination operand that should not be defined.
;; Implemented externally (outside of ISLE).
(decl writable_invalid_gpr () WritableGpr)
(extern constructor writable_invalid_gpr writable_invalid_gpr)

;; Helper for creating `mul` instructions or `imul` instructions (depending
;; on `signed`) for 8-bit operands.
(decl x64_mul8 (bool Gpr GprMem) Gpr)
Expand Down
36 changes: 36 additions & 0 deletions cranelift/codegen/src/isa/x64/inst/emit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -682,6 +682,42 @@ pub(crate) fn emit(
emit_simm(sink, imm_size, *src2 as u32);
}

// Emit the BMI2 `mulx` instruction using a VEX encoding
// (prefix F2, opcode map 0F38, opcode 0xF6).
Inst::MulX {
size,
src1,
src2,
dst_lo,
dst_hi,
} => {
let src1 = src1.to_reg();
let dst_lo = dst_lo.to_reg().to_reg();
let dst_hi = dst_hi.to_reg().to_reg();
// `src1` is not encoded in the instruction below; it must already
// have been placed in `%rdx`.
debug_assert_eq!(src1, regs::rdx());
// Resolve `src2` to either a hardware register encoding or a
// finalized addressing mode for the r/m operand.
let src2 = match src2.clone().to_reg_mem().clone() {
RegMem::Reg { reg } => {
RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
}
RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
};

let dst_hi = dst_hi.to_real_reg().unwrap().hw_enc();
// When only the high half was requested (`dst_lo` is the invalid
// sentinel), encode `dst_hi` in both destination slots.
let dst_lo = if dst_lo.is_invalid_sentinel() {
dst_hi
} else {
dst_lo.to_real_reg().unwrap().hw_enc()
};

// The high destination goes in the `reg` field, the low destination
// in `vvvv`; `W` is set only for the 64-bit operand size.
VexInstruction::new()
.prefix(LegacyPrefixes::_F2)
.map(OpcodeMap::_0F38)
.w(*size == OperandSize::Size64)
.opcode(0xf6)
.reg(dst_hi)
.vvvv(dst_lo)
.rm(src2)
.encode(sink);
}

Inst::SignExtendData { size, src, dst } => {
let src = src.to_reg();
let dst = dst.to_reg().to_reg();
Expand Down
36 changes: 36 additions & 0 deletions cranelift/codegen/src/isa/x64/inst/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,8 @@ impl Inst {
| Inst::GprToXmmVex { op, .. }
| Inst::CvtIntToFloatVex { op, .. }
| Inst::XmmCmpRmRVex { op, .. } => op.available_from(),

// `mulx` is only available when the BMI2 instruction set is present.
Inst::MulX { .. } => smallvec![InstructionSet::BMI2],
}
}
}
Expand Down Expand Up @@ -823,6 +825,26 @@ impl PrettyPrint for Inst {
format!("{op} {src1}, {src2}, {dst_lo}, {dst_hi}")
}

// Printed as `mulx{suffix} src1, src2, dst_lo, dst_hi`, with all
// registers shown at the operand size.
Inst::MulX {
size,
src1,
src2,
dst_lo,
dst_hi,
} => {
let src1 = pretty_print_reg(src1.to_reg(), size.to_bytes());
let dst_hi = pretty_print_reg(dst_hi.to_reg().to_reg(), size.to_bytes());
// If `dst_lo` is the invalid sentinel, print `dst_hi` in its place
// so the same register name appears for both destinations.
let dst_lo = if dst_lo.to_reg().is_invalid_sentinel() {
dst_hi.clone()
} else {
pretty_print_reg(dst_lo.to_reg().to_reg(), size.to_bytes())
};
let src2 = src2.pretty_print(size.to_bytes());
let suffix = suffix_bwlq(*size);
let op = ljustify(format!("mulx{suffix}"));
format!("{op} {src1}, {src2}, {dst_lo}, {dst_hi}")
}

Inst::Mul8 {
signed,
src1,
Expand Down Expand Up @@ -1969,6 +1991,20 @@ fn x64_get_operands(inst: &mut Inst, collector: &mut impl OperandVisitor) {
collector.reg_def(dst);
src1.get_operands(collector);
}
// Operands for `mulx`: up to two defs, one use fixed to `%rdx`, and
// whatever `src2` contributes.
Inst::MulX {
src1,
src2,
dst_lo,
dst_hi,
..
} => {
// `dst_lo` may be the invalid sentinel when only the high half is
// wanted; in that case it is not reported as a def.
if !dst_lo.to_reg().is_invalid_sentinel() {
collector.reg_def(dst_lo);
}
collector.reg_def(dst_hi);
// `src1` is pinned to `%rdx`.
collector.reg_fixed_use(src1, regs::rdx());
src2.get_operands(collector);
}
Inst::SignExtendData { size, src, dst } => {
match size {
OperandSize::Size8 => {
Expand Down
8 changes: 8 additions & 0 deletions cranelift/codegen/src/isa/x64/lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -4354,6 +4354,14 @@
(rule 1 (lower (umulhi a @ (value_type (ty_int_ref_16_to_64 ty)) b))
(value_regs_get_gpr (x64_mul ty $false a b) 1))

;; The BMI2 instruction set introduced `mulx`, which defines two registers;
;; if the two registers are the same then it only defines the upper bits. This
;; helps slightly reduce register pressure by ensuring only one register here is
;; clobbered.
;;
;; This rule applies only to 32- and 64-bit `umulhi` when `use_bmi2` holds.
(rule 2 (lower (umulhi a @ (value_type (ty_32_or_64 ty)) b))
(if-let $true (use_bmi2))
(x64_mulx_hi ty a b))

;; Rules for `smulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 0 (lower (smulhi a @ (value_type $I8) b))
Expand Down
5 changes: 5 additions & 0 deletions cranelift/codegen/src/isa/x64/lower/isle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -990,6 +990,11 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
let mask = -1i128 as u128;
self.emit_u128_le_const(mask ^ (0xff << (hole_idx * 8)))
}

/// Wraps this context's invalid-register sentinel as a `WritableGpr`.
///
/// # Panics
///
/// Panics if `Gpr::new` does not accept the sentinel register (the
/// `unwrap` below).
fn writable_invalid_gpr(&mut self) -> WritableGpr {
    WritableGpr::from_reg(Gpr::new(self.invalid_reg()).unwrap())
}
}

impl IsleContext<'_, '_, MInst, X64Backend> {
Expand Down
7 changes: 7 additions & 0 deletions cranelift/codegen/src/isa/x64/pcc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,13 @@ pub(crate) fn check(
dst_hi,
ref src2,
..
}
| Inst::MulX {
size,
dst_lo,
dst_hi,
ref src2,
..
} => {
match <&RegMem>::from(src2) {
RegMem::Mem { ref addr } => {
Expand Down
84 changes: 84 additions & 0 deletions cranelift/filetests/filetests/isa/x64/bmi2.clif
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
test compile precise-output
set enable_llvm_abi_extensions=true
target x86_64 has_bmi2

function %sarx_i32(i32, i32) -> i32 {
Expand Down Expand Up @@ -346,3 +347,86 @@ block0(v0: i64, v1: i32):
; popq %rbp
; retq

function %extend_to_i128_mul_mulxq(i64, i64) -> i128 {
block0(v0: i64, v1: i64):
v2 = uextend.i128 v0
v3 = uextend.i128 v1
v4 = imul v2, v3
return v4
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rdx
; mulxq %rdx, %rsi, %rax, %rdx
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %rdx
; mulxq %rsi, %rax, %rdx
; movq %rbp, %rsp
; popq %rbp
; retq

function %umulhi_mulxl(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = umulhi v0, v1
return v2
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rdx
; mulxl %edx, %esi, %eax, %eax
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %rdx
; mulxl %esi, %eax, %eax
; movq %rbp, %rsp
; popq %rbp
; retq

function %umulhi_mulxq(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = umulhi v0, v1
return v2
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rdx
; mulxq %rdx, %rsi, %rax, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %rdx
; mulxq %rsi, %rax, %rax
; movq %rbp, %rsp
; popq %rbp
; retq

Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ set enable_llvm_abi_extensions=true
target aarch64
target s390x
target x86_64
target x86_64 has_bmi2
target riscv64
target riscv64 has_c has_zcb

Expand Down
3 changes: 2 additions & 1 deletion cranelift/filetests/filetests/runtests/smul_overflow.clif
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
test interpret
test run
target x86_64
target x86_64 has_bmi2
target aarch64

function %smulof_i64(i64, i64) -> i64, i8 {
Expand Down Expand Up @@ -73,4 +74,4 @@ block0(v0: i8, v1: i8):
; run: %smulof_i8(0x7F, 0x7F) == [1, 1]
; run: %smulof_i8(0x80, 0x7F) == [0x80, 1]
; run: %smulof_i8(0x01, 0xFE) == [0xFE, 0]
; run: %smulof_i8(0xC0, 0xDE) == [0x80, 1]
; run: %smulof_i8(0xC0, 0xDE) == [0x80, 1]
2 changes: 2 additions & 0 deletions cranelift/filetests/filetests/runtests/smulhi.clif
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ test interpret
test run
target aarch64
target s390x
target x86_64
target x86_64 has_bmi2
target x86_64 has_sse3 has_ssse3 has_sse41
target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
target riscv64
Expand Down
3 changes: 2 additions & 1 deletion cranelift/filetests/filetests/runtests/umul_overflow.clif
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
test interpret
test run
target x86_64
target x86_64 has_bmi2
target aarch64

function %umulof_i64(i64, i64) -> i64, i8 {
Expand Down Expand Up @@ -65,4 +66,4 @@ block0(v0: i8, v1: i8):
; run: %umulof_i8(0x7F, 0x7F) == [1, 1]
; run: %umulof_i8(0x80, 0x7F) == [0x80, 1]
; run: %umulof_i8(0x01, 0xFE) == [0xFE, 0]
; run: %umulof_i8(0xC0, 0xDE) == [0x80, 1]
; run: %umulof_i8(0xC0, 0xDE) == [0x80, 1]
2 changes: 2 additions & 0 deletions cranelift/filetests/filetests/runtests/umulhi.clif
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
test interpret
test run
target aarch64
target x86_64
target x86_64 has_sse3 has_ssse3 has_sse41
target x86_64 has_bmi2
target s390x
target riscv64
target riscv64 has_c has_zcb
Expand Down

0 comments on commit 8f94133

Please sign in to comment.