Skip to content

Commit f46d5a3

Browse files
authored
x64: Add support for some BMI2 instructions (#6976)
* x64: Add support for `sarx`, `shlx`, and `shrx`. These instructions are in the BMI2 instruction set and are unconditionally used by LLVM for shifts which don't have an immediate. They're equivalent to the `sar`, `shl`, and `shr` instructions except that they use a 3-operand form to lessen register allocation pressure. Currently the integration here doesn't add new lowering rules but instead takes an AVX-like approach by updating `x64_sar` and the related helpers to use `sarx` and friends whenever they apply (BMI2 is enabled, the type is 32 or 64 bits, and the shift amount is in a register). This means that the shift-a-value-stored-in-memory functionality of `sarx` and friends isn't exposed, so that's left for a future PR.
* x64: Add support for BMI2 `rorx` instruction. This is similar to `rol` and `ror` but requires an immediate argument and additionally has no constraints on registers.
* x64: Add support for BMI2 `bzhi` instruction. This commit adds support for the `bzhi` instruction which is part of BMI2. This instruction is used to zero out the upper bits of a register, starting at a bit index given by a register operand. Emission of this instruction is pattern-matched on CLIF that masks off the high bits of a value, i.e. code of the shape `x & ((1 << y) - 1)`. Equivalent code fed to LLVM will additionally generate the `bzhi` instruction. Relative to the alternative lowerings x64 provides, this gives a little bit more register freedom and additionally cuts down on a few instructions. Note that the raw functionality of `bzhi` can't be exposed, though, because its semantics when the index is out of bounds don't map easily onto a CLIF instruction, so usage of `bzhi` is always preceded by an `and` instruction. This matches LLVM as well, although LLVM probably elides the `and` when it can prove that the index is in range.
* Pattern match more `x - 1` patterns. It looks like LLVM generates this as `x + (-1)`, which is equivalent to `x - 1`, so create a custom partial constructor to pattern match the possible forms of a decremented value.
* Add tests for BMI{1,2} coming from wasm These are intended to serve as integration tests to ensure that even coming from wasm these instructions are all emitted.
1 parent 1a1fc9d commit f46d5a3

File tree

18 files changed

+1113
-46
lines changed

18 files changed

+1113
-46
lines changed

cranelift/codegen/meta/src/isa/x86.rs

+1
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ pub(crate) fn define() -> TargetIsa {
121121

122122
settings.add_predicate("use_popcnt", predicate!(has_popcnt && has_sse42));
123123
settings.add_predicate("use_bmi1", predicate!(has_bmi1));
124+
settings.add_predicate("use_bmi2", predicate!(has_bmi2));
124125
settings.add_predicate("use_lzcnt", predicate!(has_lzcnt));
125126

126127
let sse3 = settings.add_preset("sse3", "SSE3 and earlier.", preset!(has_sse3));

cranelift/codegen/src/isa/x64/inst.isle

+89-5
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
(AluRmRVex (size OperandSize)
3333
(op AluRmROpcode)
3434
(src1 Gpr)
35-
(src2 Gpr)
35+
(src2 GprMem)
3636
(dst WritableGpr))
3737

3838
;; Production of a zero value into a register of the specified size.
@@ -53,6 +53,13 @@
5353
(src GprMem)
5454
(dst WritableGpr))
5555

56+
;; Same as `UnaryRmRVex` but with an immediate
57+
(UnaryRmRImmVex (size OperandSize)
58+
(op UnaryRmRImmVexOpcode)
59+
(src GprMem)
60+
(dst WritableGpr)
61+
(imm u8))
62+
5663
;; Bitwise not.
5764
(Not (size OperandSize) ;; 1, 2, 4, or 8
5865
(src Gpr)
@@ -746,8 +753,12 @@
746753
Xor
747754
Mul))
748755

749-
(type AluRmROpcode extern
750-
(enum Andn))
756+
(type AluRmROpcode
757+
(enum Andn
758+
Sarx
759+
Shrx
760+
Shlx
761+
Bzhi))
751762

752763
(type UnaryRmROpcode extern
753764
(enum Bsr
@@ -761,6 +772,9 @@
761772
Blsmsk
762773
Blsr))
763774

775+
(type UnaryRmRImmVexOpcode
776+
(enum Rorx))
777+
764778
(type SseOpcode extern
765779
(enum Addps
766780
Addpd
@@ -1433,6 +1447,14 @@
14331447
(decl imm8_reg_to_imm8_gpr (Imm8Reg) Imm8Gpr)
14341448
(extern constructor imm8_reg_to_imm8_gpr imm8_reg_to_imm8_gpr)
14351449

1450+
;; Convert an `Imm8Gpr` into a `Gpr`.
1451+
(decl gpr_from_imm8_gpr (Gpr) Imm8Gpr)
1452+
(extern extractor gpr_from_imm8_gpr gpr_from_imm8_gpr)
1453+
1454+
;; Convert an `Imm8Gpr` into an `Imm8`.
1455+
(decl imm8_from_imm8_gpr (u8) Imm8Gpr)
1456+
(extern extractor imm8_from_imm8_gpr imm8_from_imm8_gpr)
1457+
14361458
;; Convert a `WritableGpr` to a `WritableReg`.
14371459
(decl writable_gpr_to_reg (WritableGpr) WritableReg)
14381460
(extern constructor writable_gpr_to_reg writable_gpr_to_reg)
@@ -1703,6 +1725,9 @@
17031725
(decl pure use_bmi1 () bool)
17041726
(extern constructor use_bmi1 use_bmi1)
17051727

1728+
(decl pure use_bmi2 () bool)
1729+
(extern constructor use_bmi2 use_bmi2)
1730+
17061731
(decl pure use_popcnt () bool)
17071732
(extern constructor use_popcnt use_popcnt)
17081733

@@ -1836,7 +1861,7 @@
18361861
dst))
18371862

18381863
;; Helper for emitting `MInst.AluRmRVex` instructions.
1839-
(decl alu_rm_r_vex (Type AluRmROpcode Gpr Gpr) Gpr)
1864+
(decl alu_rm_r_vex (Type AluRmROpcode Gpr GprMem) Gpr)
18401865
(rule (alu_rm_r_vex ty opcode src1 src2)
18411866
(let ((dst WritableGpr (temp_writable_gpr))
18421867
(size OperandSize (operand_size_of_type_32_64 ty))
@@ -2060,6 +2085,13 @@
20602085
(_ Unit (emit (MInst.UnaryRmRVex size op src dst))))
20612086
dst))
20622087

2088+
;; Helper for creating `MInst.UnaryRmRImmVex` instructions.
2089+
(decl unary_rm_r_imm_vex (UnaryRmRImmVexOpcode GprMem OperandSize u8) Gpr)
2090+
(rule (unary_rm_r_imm_vex op src size imm)
2091+
(let ((dst WritableGpr (temp_writable_gpr))
2092+
(_ Unit (emit (MInst.UnaryRmRImmVex size op src dst imm))))
2093+
dst))
2094+
20632095
(decl cvt_u64_to_float_seq (Type Gpr) Xmm)
20642096
(rule (cvt_u64_to_float_seq ty src)
20652097
(let ((size OperandSize (raw_operand_size_of_type ty))
@@ -2584,7 +2616,7 @@
25842616
src1
25852617
src2))
25862618

2587-
(decl x64_andn (Type Gpr Gpr) Gpr)
2619+
(decl x64_andn (Type Gpr GprMem) Gpr)
25882620
(rule (x64_andn ty src1 src2)
25892621
(alu_rm_r_vex ty (AluRmROpcode.Andn) src1 src2))
25902622

@@ -2669,26 +2701,55 @@
26692701
(decl x64_rotl (Type Gpr Imm8Gpr) Gpr)
26702702
(rule (x64_rotl ty src1 src2)
26712703
(shift_r ty (ShiftKind.RotateLeft) src1 src2))
2704+
(rule 1 (x64_rotl (ty_32_or_64 ty) src (imm8_from_imm8_gpr imm))
2705+
(if-let $true (use_bmi2))
2706+
(x64_rorx ty src (u8_sub (ty_bits ty) imm)))
26722707

26732708
;; Helper for creating `rotr` instructions.
26742709
(decl x64_rotr (Type Gpr Imm8Gpr) Gpr)
26752710
(rule (x64_rotr ty src1 src2)
26762711
(shift_r ty (ShiftKind.RotateRight) src1 src2))
2712+
(rule 1 (x64_rotr (ty_32_or_64 ty) src (imm8_from_imm8_gpr imm))
2713+
(if-let $true (use_bmi2))
2714+
(x64_rorx ty src imm))
26772715

26782716
;; Helper for creating `shl` instructions.
26792717
(decl x64_shl (Type Gpr Imm8Gpr) Gpr)
26802718
(rule (x64_shl ty src1 src2)
26812719
(shift_r ty (ShiftKind.ShiftLeft) src1 src2))
2720+
;; With BMI2 the `shlx` instruction is also available, and it's unconditionally
2721+
;; used for registers shifted by registers since it provides more freedom
2722+
;; in regalloc since nothing is constrained. Note that the `shlx` instruction
2723+
;; doesn't encode an immediate so any immediate-based shift still uses `shl`.
2724+
(rule 1 (x64_shl (ty_32_or_64 ty) src1 (gpr_from_imm8_gpr src2))
2725+
(if-let $true (use_bmi2))
2726+
(x64_shlx ty src1 src2))
26822727

26832728
;; Helper for creating logical shift-right instructions.
26842729
(decl x64_shr (Type Gpr Imm8Gpr) Gpr)
26852730
(rule (x64_shr ty src1 src2)
26862731
(shift_r ty (ShiftKind.ShiftRightLogical) src1 src2))
2732+
;; see `x64_shl` for more info about this rule
2733+
(rule 1 (x64_shr (ty_32_or_64 ty) src1 (gpr_from_imm8_gpr src2))
2734+
(if-let $true (use_bmi2))
2735+
(x64_shrx ty src1 src2))
26872736

26882737
;; Helper for creating arithmetic shift-right instructions.
26892738
(decl x64_sar (Type Gpr Imm8Gpr) Gpr)
26902739
(rule (x64_sar ty src1 src2)
26912740
(shift_r ty (ShiftKind.ShiftRightArithmetic) src1 src2))
2741+
;; see `x64_shl` for more info about this rule
2742+
(rule 1 (x64_sar (ty_32_or_64 ty) src1 (gpr_from_imm8_gpr src2))
2743+
(if-let $true (use_bmi2))
2744+
(x64_sarx ty src1 src2))
2745+
2746+
;; Helper for creating `bzhi` (zero the high bits starting at an index) instructions.
2747+
;;
2748+
;; Note that the `src` operands are swapped here. The index of the first bit to zero
2749+
;; is stored in `vvvv` which is `src1` in the `AluRmRVex` instruction shape.
2750+
(decl x64_bzhi (Type GprMem Gpr) Gpr)
2751+
(rule (x64_bzhi ty src1 src2)
2752+
(alu_rm_r_vex ty (AluRmROpcode.Bzhi) src2 src1))
26922753

26932754
;; Helper for creating byteswap instructions.
26942755
;; In x64, 32- and 64-bit registers use BSWAP instruction, and
@@ -4096,6 +4157,29 @@
40964157
(rule (x64_blsr ty src)
40974158
(unary_rm_r_vex (UnaryRmRVexOpcode.Blsr) src (operand_size_of_type_32_64 ty)))
40984159

4160+
;; Helper for creating `sarx` instructions.
4161+
(decl x64_sarx (Type GprMem Gpr) Gpr)
4162+
(rule (x64_sarx ty val amt)
4163+
(alu_rm_r_vex ty (AluRmROpcode.Sarx) amt val))
4164+
4165+
;; Helper for creating `shrx` instructions.
4166+
(decl x64_shrx (Type GprMem Gpr) Gpr)
4167+
(rule (x64_shrx ty val amt)
4168+
(alu_rm_r_vex ty (AluRmROpcode.Shrx) amt val))
4169+
4170+
;; Helper for creating `shlx` instructions.
4171+
(decl x64_shlx (Type GprMem Gpr) Gpr)
4172+
(rule (x64_shlx ty val amt)
4173+
(alu_rm_r_vex ty (AluRmROpcode.Shlx) amt val))
4174+
4175+
;; Helper for creating `rorx` instructions.
4176+
(decl x64_rorx (Type GprMem u8) Gpr)
4177+
(rule (x64_rorx ty src imm)
4178+
(unary_rm_r_imm_vex (UnaryRmRImmVexOpcode.Rorx)
4179+
src
4180+
(operand_size_of_type_32_64 ty)
4181+
imm))
4182+
40994183
;; Helper for creating `popcnt` instructions.
41004184
(decl x64_popcnt (Type Gpr) Gpr)
41014185
(rule (x64_popcnt ty src)

cranelift/codegen/src/isa/x64/inst/args.rs

+23-16
Original file line numberDiff line numberDiff line change
@@ -826,33 +826,22 @@ impl fmt::Display for AluRmiROpcode {
826826
}
827827
}
828828

829-
/// ALU operations that don't accept intermediates.
830-
#[derive(Copy, Clone, PartialEq)]
831-
pub enum AluRmROpcode {
832-
/// And with negated second operand.
833-
Andn,
834-
}
829+
pub use crate::isa::x64::lower::isle::generated_code::AluRmROpcode;
835830

836831
impl AluRmROpcode {
837832
pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> {
838833
match self {
839834
AluRmROpcode::Andn => smallvec![InstructionSet::BMI1],
835+
AluRmROpcode::Sarx | AluRmROpcode::Shrx | AluRmROpcode::Shlx | AluRmROpcode::Bzhi => {
836+
smallvec![InstructionSet::BMI2]
837+
}
840838
}
841839
}
842840
}
843841

844-
impl fmt::Debug for AluRmROpcode {
845-
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
846-
let name = match self {
847-
AluRmROpcode::Andn => "andn",
848-
};
849-
write!(fmt, "{}", name)
850-
}
851-
}
852-
853842
impl fmt::Display for AluRmROpcode {
854843
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
855-
fmt::Debug::fmt(self, f)
844+
f.write_str(&format!("{self:?}").to_lowercase())
856845
}
857846
}
858847

@@ -918,6 +907,24 @@ impl fmt::Display for UnaryRmRVexOpcode {
918907
}
919908
}
920909

910+
pub use crate::isa::x64::lower::isle::generated_code::UnaryRmRImmVexOpcode;
911+
912+
impl UnaryRmRImmVexOpcode {
913+
pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> {
914+
match self {
915+
UnaryRmRImmVexOpcode::Rorx => {
916+
smallvec![InstructionSet::BMI2]
917+
}
918+
}
919+
}
920+
}
921+
922+
impl fmt::Display for UnaryRmRImmVexOpcode {
923+
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
924+
f.write_str(&format!("{self:?}").to_lowercase())
925+
}
926+
}
927+
921928
#[derive(Clone, Copy, PartialEq)]
922929
/// Comparison operations.
923930
pub enum CmpOpcode {

cranelift/codegen/src/isa/x64/inst/emit.rs

+46-4
Original file line numberDiff line numberDiff line change
@@ -358,9 +358,16 @@ pub(crate) fn emit(
358358
src2,
359359
} => {
360360
use AluRmROpcode::*;
361+
use LegacyPrefixes as LP;
362+
361363
let dst = allocs.next(dst.to_reg().to_reg());
362364
let src1 = allocs.next(src1.to_reg());
363-
let src2 = allocs.next(src2.to_reg());
365+
let src2 = match src2.clone().to_reg_mem().with_allocs(allocs) {
366+
RegMem::Reg { reg } => {
367+
RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
368+
}
369+
RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
370+
};
364371

365372
let w = match size {
366373
OperandSize::Size32 => false,
@@ -370,16 +377,21 @@ pub(crate) fn emit(
370377
_ => unreachable!(),
371378
};
372379

373-
let opcode = match op {
374-
Andn => 0xf2,
380+
let (prefix, opcode) = match op {
381+
Andn => (LP::None, 0xf2),
382+
Sarx => (LP::_F3, 0xf7),
383+
Shrx => (LP::_F2, 0xf7),
384+
Shlx => (LP::_66, 0xf7),
385+
Bzhi => (LP::None, 0xf5),
375386
};
376387

377388
VexInstruction::new()
389+
.prefix(prefix)
378390
.map(OpcodeMap::_0F38)
379391
.w(w)
380392
.reg(dst.to_real_reg().unwrap().hw_enc())
381393
.vvvv(src1.to_real_reg().unwrap().hw_enc())
382-
.rm(src2.to_real_reg().unwrap().hw_enc())
394+
.rm(src2)
383395
.opcode(opcode)
384396
.encode(sink);
385397
}
@@ -445,6 +457,36 @@ pub(crate) fn emit(
445457
.encode(sink);
446458
}
447459

460+
Inst::UnaryRmRImmVex {
461+
size,
462+
op,
463+
src,
464+
dst,
465+
imm,
466+
} => {
467+
let dst = allocs.next(dst.to_reg().to_reg());
468+
let src = match src.clone().to_reg_mem().with_allocs(allocs) {
469+
RegMem::Reg { reg } => {
470+
RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
471+
}
472+
RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
473+
};
474+
475+
let opcode = match op {
476+
UnaryRmRImmVexOpcode::Rorx => 0xF0,
477+
};
478+
479+
VexInstruction::new()
480+
.prefix(LegacyPrefixes::_F2)
481+
.map(OpcodeMap::_0F3A)
482+
.w(*size == OperandSize::Size64)
483+
.opcode(opcode)
484+
.reg(dst.to_real_reg().unwrap().hw_enc())
485+
.rm(src)
486+
.imm(*imm)
487+
.encode(sink);
488+
}
489+
448490
Inst::Not { size, src, dst } => {
449491
let src = allocs.next(src.to_reg());
450492
let dst = allocs.next(dst.to_reg().to_reg());

0 commit comments

Comments
 (0)