Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

x64: Lower vany_true, vall_true, vhigh_bits, iconcat, and isplit in ISLE #4787

Merged
merged 4 commits into from
Aug 26, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions cranelift/codegen/src/isa/x64/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -1521,6 +1521,13 @@

;;;; Helpers for Working SSE tidbits ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Turn a vector type into its integer-typed vector equivalent.
;;
;; The rules match on the shape of the vector through `multi_lane` (lane width
;; in bits, then lane count) and return the integer vector type with that same
;; shape. Only the four 128-bit shapes listed below have a rule.
(decl vec_int_type (Type) Type)
(rule (vec_int_type (multi_lane 8 16)) $I8X16)
(rule (vec_int_type (multi_lane 16 8)) $I16X8)
(rule (vec_int_type (multi_lane 32 4)) $I32X4)
(rule (vec_int_type (multi_lane 64 2)) $I64X2)

;; Determine the appropriate operation for xor-ing vectors of the specified type
(decl sse_xor_op (Type) SseOpcode)
(rule (sse_xor_op $F32X4) (SseOpcode.Xorps))
Expand Down Expand Up @@ -2021,6 +2028,11 @@
(rule (x64_test size src1 src2)
(cmp_rmi_r size (CmpOpcode.Test) src1 src2))

;; Helper for creating `ptest` instructions.
;;
;; Built on `xmm_cmp_rm_r` with `SseOpcode.Ptest`. The result is a
;; `ProducesFlags`, so it is meant to be combined with a flags consumer (for
;; example `x64_setcc`) through `with_flags`.
(decl x64_ptest (XmmMem Xmm) ProducesFlags)
(rule (x64_ptest src1 src2)
(xmm_cmp_rm_r (SseOpcode.Ptest) src1 src2))

;; Helper for creating `cmove` instructions. Note that these instructions do not
;; always result in a single emitted x86 instruction; e.g., XmmCmove uses jumps
;; to conditionally move the selected value into an XMM register.
Expand Down Expand Up @@ -2889,6 +2901,21 @@
(_ Unit (emit (MInst.XmmToGpr op src dst size))))
dst))

;; Helper for creating `pmovmskb` instructions.
;;
;; Built on `xmm_to_gpr` with `SseOpcode.Pmovmskb`: reads the XMM register `src`
;; and returns the GPR that receives the result. `size` is the operand size
;; handed to the underlying instruction.
(decl x64_pmovmskb (OperandSize Xmm) Gpr)
(rule (x64_pmovmskb size src)
(xmm_to_gpr (SseOpcode.Pmovmskb) src size))

;; Helper for creating `movmskps` instructions.
;;
;; Built on `xmm_to_gpr` with `SseOpcode.Movmskps`: reads the XMM register `src`
;; and returns the GPR that receives the result. `size` is the operand size
;; handed to the underlying instruction.
(decl x64_movmskps (OperandSize Xmm) Gpr)
(rule (x64_movmskps size src)
(xmm_to_gpr (SseOpcode.Movmskps) src size))

;; Helper for creating `movmskpd` instructions.
;;
;; Built on `xmm_to_gpr` with `SseOpcode.Movmskpd`: reads the XMM register `src`
;; and returns the GPR that receives the result. `size` is the operand size
;; handed to the underlying instruction.
(decl x64_movmskpd (OperandSize Xmm) Gpr)
(rule (x64_movmskpd size src)
(xmm_to_gpr (SseOpcode.Movmskpd) src size))

;; Helper for creating `MInst.GprToXmm` instructions.
(decl gpr_to_xmm (SseOpcode GprMem OperandSize) Xmm)
(rule (gpr_to_xmm op src size)
Expand Down
6 changes: 6 additions & 0 deletions cranelift/codegen/src/isa/x64/inst/emit_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,12 @@ impl Inst {
dst: WritableXmm::from_writable_reg(dst).unwrap(),
}
}

/// Construct an `Inst::Setcc` for condition code `cc` that writes to `dst`.
///
/// `dst` is expected to be an integer-class register: the class is checked in
/// debug builds, and the conversion to `WritableGpr` is unwrapped.
fn setcc(cc: CC, dst: Writable<Reg>) -> Inst {
    debug_assert!(dst.to_reg().class() == RegClass::Int);
    Inst::Setcc {
        cc,
        dst: WritableGpr::from_writable_reg(dst).unwrap(),
    }
}
}

#[test]
Expand Down
6 changes: 0 additions & 6 deletions cranelift/codegen/src/isa/x64/inst/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -478,12 +478,6 @@ impl Inst {
Inst::Ud2 { trap_code }
}

/// Build the `Inst::Setcc` variant for condition code `cc` with destination `dst`.
///
/// The destination must belong to the integer register class; this is asserted
/// in debug builds before `dst` is converted to a `WritableGpr` (the conversion
/// result is unwrapped).
pub(crate) fn setcc(cc: CC, dst: Writable<Reg>) -> Inst {
    debug_assert!(dst.to_reg().class() == RegClass::Int);
    let gpr = WritableGpr::from_writable_reg(dst).unwrap();
    Inst::Setcc { cc, dst: gpr }
}

pub(crate) fn cmove(size: OperandSize, cc: CC, src: RegMem, dst: Writable<Reg>) -> Inst {
debug_assert!(size.is_one_of(&[
OperandSize::Size16,
Expand Down
58 changes: 58 additions & 0 deletions cranelift/codegen/src/isa/x64/lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -3643,3 +3643,61 @@
(src RegMem (RegMem.Reg src))
(vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0)))
(vec_insert_lane ty vec src 1)))

;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `ptest val, val` sets ZF when `val` is all zeroes, so `setcc NZ` places a 1
;; in the result exactly when `val` is not all zeroes.
(rule (lower (vany_true val))
(with_flags (x64_ptest val val) (x64_setcc (CC.NZ))))

;; Rules for `vall_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Every lane is "true" iff no lane of the input is zero:
;; - `zeros`: `src ^ src` gives an all-zero vector;
;; - `cmp`: integer equality of `src` against `zeros` at the input's lane width
;;   (`vec_int_type` picks the integer vector type of the same shape), so a lane
;;   of `cmp` is non-zero only where the matching `src` lane is zero;
;; - `ptest cmp, cmp` sets ZF when `cmp` is all zeroes, and `setcc Z` then
;;   places a 1 in the result.
(rule (lower (vall_true val @ (value_type ty)))
(let ((src Xmm val)
(zeros Xmm (x64_pxor src src))
(cmp Xmm (x64_pcmpeq (vec_int_type ty) src zeros)))
(with_flags (x64_ptest cmp cmp) (x64_setcc (CC.Z)))))

;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The Intel specification allows using both 32-bit and 64-bit GPRs as
;; destination for the "move mask" instructions. This is controlled by the REX.R
;; bit: "In 64-bit mode, the instruction can access additional registers when
;; used with a REX.R prefix. The default operand size is 64-bit in 64-bit mode"
;; (PMOVMSKB in IA Software Development Manual, vol. 2). This being the case, we
;; will always clear REX.W since its use is unnecessary (`OperandSize` is used
;; for setting/clearing REX.W) as we need at most 16 bits of output for
;; `vhigh_bits`.

;; 8-bit lanes (16 of them): `pmovmskb` gathers the high bit of each byte lane.
(rule (lower (vhigh_bits val @ (value_type (multi_lane 8 16))))
(x64_pmovmskb (OperandSize.Size32) val))

;; 32-bit lanes (4 of them): `movmskps` gathers the high bit of each lane.
(rule (lower (vhigh_bits val @ (value_type (multi_lane 32 4))))
(x64_movmskps (OperandSize.Size32) val))

;; 64-bit lanes (2 of them): `movmskpd` gathers the high bit of each lane.
(rule (lower (vhigh_bits val @ (value_type (multi_lane 64 2))))
(x64_movmskpd (OperandSize.Size32) val))

;; There is no x86 instruction for extracting the high bit of 16-bit lanes so
;; here we:
;; - duplicate the 16-bit lanes of `src` into 8-bit lanes:
;; PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...]
;; - use PMOVMSKB to gather the high bits; now we have duplicates, though
;; - shift away the bottom 8 high bits to remove the duplicates.
;;
;; Note that `tmp` is bound twice in the `let`: first to the packed vector
;; (an `Xmm`), then, shadowing it, to the gathered mask (a `Gpr`). The final
;; shift operates on the mask.
(rule (lower (vhigh_bits val @ (value_type (multi_lane 16 8))))
(let ((src Xmm val)
(tmp Xmm (x64_packsswb src src))
(tmp Gpr (x64_pmovmskb (OperandSize.Size32) tmp)))
(x64_shr $I64 tmp (Imm8Reg.Imm8 8))))

;; Rules for `iconcat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Only matches when `lo` is an `i64`. The result is the register pair built by
;; `value_regs`, with `lo` as the first register and `hi` as the second.
(rule (lower (iconcat lo @ (value_type $I64) hi))
(value_regs lo hi))

;; Rules for `isplit` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Only matches an `i128` input, which is held in a pair of registers:
;; register 0 is returned as the first output (`lo`) and register 1 as the
;; second output (`hi`).
(rule (lower (isplit val @ (value_type $I128)))
(let ((regs ValueRegs val)
(lo Reg (value_regs_get regs 0))
(hi Reg (value_regs_get regs 1)))
(output_pair lo hi)))
157 changes: 6 additions & 151 deletions cranelift/codegen/src/isa/x64/lower.rs
Original file line number Diff line number Diff line change
Expand Up @@ -129,32 +129,6 @@ fn is_mergeable_load(ctx: &mut Lower<Inst>, src_insn: IRInst) -> Option<(InsnInp
}
}

/// Lower the instruction input `spec` to a `RegMem` operand.
///
/// The candidates are tried in this order:
/// 1. a constant input is regenerated into a fresh register;
/// 2. a uniquely-used producer that `is_mergeable_load` accepts is sunk and
///    returned as a memory operand;
/// 3. otherwise the input is placed in a (single) register.
///
/// Effectful: may mark the given input as used, when returning the register form.
fn input_to_reg_mem(ctx: &mut Lower<Inst>, spec: InsnInput) -> RegMem {
    let source = ctx.get_input_as_source_or_const(spec.insn, spec.input);

    if let Some(constant) = source.constant {
        // Generate constants fresh at each use to minimize long-range register pressure.
        let ty = ctx.input_ty(spec.insn, spec.input);
        let regs = generate_constant(ctx, ty, constant);
        return RegMem::reg(regs.only_reg().unwrap());
    }

    if let InputSourceInst::UniqueUse(producer, 0) = source.inst {
        if let Some((addr_input, offset)) = is_mergeable_load(ctx, producer) {
            // The load is folded into the consumer, so sink it before building
            // the addressing mode.
            ctx.sink_inst(producer);
            return RegMem::mem(lower_to_amode(ctx, addr_input, offset));
        }
    }

    let regs = ctx.put_input_in_regs(spec.insn, spec.input);
    RegMem::reg(regs.only_reg().unwrap())
}

fn input_to_imm(ctx: &mut Lower<Inst>, spec: InsnInput) -> Option<u64> {
ctx.get_input_as_source_or_const(spec.insn, spec.input)
.constant
Expand Down Expand Up @@ -495,136 +469,17 @@ fn lower_insn_to_regs(
| Opcode::Swizzle
| Opcode::Extractlane
| Opcode::ScalarToVector
| Opcode::Splat => {
| Opcode::Splat
| Opcode::VanyTrue
| Opcode::VallTrue
| Opcode::VhighBits
| Opcode::Iconcat
| Opcode::Isplit => {
implemented_in_isle(ctx);
}

Opcode::DynamicStackAddr => unimplemented!("DynamicStackAddr"),

Opcode::VanyTrue => {
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let src_ty = ctx.input_ty(insn, 0);
assert_eq!(src_ty.bits(), 128);
let src = put_input_in_reg(ctx, inputs[0]);
// Set the ZF if the result is all zeroes.
ctx.emit(Inst::xmm_cmp_rm_r(SseOpcode::Ptest, RegMem::reg(src), src));
// If the ZF is not set, place a 1 in `dst`.
ctx.emit(Inst::setcc(CC::NZ, dst));
}

Opcode::VallTrue => {
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let src_ty = ctx.input_ty(insn, 0);
assert_eq!(src_ty.bits(), 128);
let src = input_to_reg_mem(ctx, inputs[0]);

let eq = |ty: Type| match ty.lane_bits() {
8 => SseOpcode::Pcmpeqb,
16 => SseOpcode::Pcmpeqw,
32 => SseOpcode::Pcmpeqd,
64 => SseOpcode::Pcmpeqq,
_ => panic!("Unable to find an instruction for {} for type: {}", op, ty),
};

// Initialize a register with all 0s.
let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap();
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
// Compare each lane of `src` against zero; lanes of `tmp` become all 1s where `src` is zero.
ctx.emit(Inst::xmm_rm_r(eq(src_ty), src, tmp));
// Set the ZF if the result is all zeroes.
ctx.emit(Inst::xmm_cmp_rm_r(
SseOpcode::Ptest,
RegMem::from(tmp),
tmp.to_reg(),
));
// If the ZF is set, place a 1 in `dst`.
ctx.emit(Inst::setcc(CC::Z, dst));
}

Opcode::VhighBits => {
let src = put_input_in_reg(ctx, inputs[0]);
let src_ty = ctx.input_ty(insn, 0);
debug_assert!(src_ty.is_vector() && src_ty.bits() == 128);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
debug_assert!(dst.to_reg().class() == RegClass::Int);

// The Intel specification allows using both 32-bit and 64-bit GPRs as destination for
// the "move mask" instructions. This is controlled by the REX.R bit: "In 64-bit mode,
// the instruction can access additional registers when used with a REX.R prefix. The
// default operand size is 64-bit in 64-bit mode" (PMOVMSKB in IA Software Development
// Manual, vol. 2). This being the case, we will always clear REX.W since its use is
// unnecessary (`OperandSize` is used for setting/clearing REX.W).
let size = OperandSize::Size32;

match src_ty {
types::I8X16 | types::B8X16 => {
ctx.emit(Inst::xmm_to_gpr(SseOpcode::Pmovmskb, src, dst, size))
}
types::I32X4 | types::B32X4 | types::F32X4 => {
ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskps, src, dst, size))
}
types::I64X2 | types::B64X2 | types::F64X2 => {
ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskpd, src, dst, size))
}
types::I16X8 | types::B16X8 => {
// There is no x86 instruction for extracting the high bit of 16-bit lanes so
// here we:
// - duplicate the 16-bit lanes of `src` into 8-bit lanes:
// PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...]
// - use PMOVMSKB to gather the high bits; now we have duplicates, though
// - shift away the bottom 8 high bits to remove the duplicates.
let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap();
ctx.emit(Inst::gen_move(tmp, src, src_ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(src), tmp));
ctx.emit(Inst::xmm_to_gpr(
SseOpcode::Pmovmskb,
tmp.to_reg(),
dst,
size,
));
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightLogical,
Some(8),
dst,
));
}
_ => unimplemented!("unknown input type {} for {}", src_ty, op),
}
}

Opcode::Iconcat => {
let ty = ctx.output_ty(insn, 0);
assert_eq!(
ty,
types::I128,
"Iconcat not expected to be used for non-128-bit type"
);
assert_eq!(ctx.input_ty(insn, 0), types::I64);
assert_eq!(ctx.input_ty(insn, 1), types::I64);
let lo = put_input_in_reg(ctx, inputs[0]);
let hi = put_input_in_reg(ctx, inputs[1]);
let dst = get_output_reg(ctx, outputs[0]);
ctx.emit(Inst::gen_move(dst.regs()[0], lo, types::I64));
ctx.emit(Inst::gen_move(dst.regs()[1], hi, types::I64));
}

Opcode::Isplit => {
let ty = ctx.input_ty(insn, 0);
assert_eq!(
ty,
types::I128,
"Isplit not expected to be used for non-128-bit type"
);
assert_eq!(ctx.output_ty(insn, 0), types::I64);
assert_eq!(ctx.output_ty(insn, 1), types::I64);
let src = put_input_in_regs(ctx, inputs[0]);
let dst_lo = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let dst_hi = get_output_reg(ctx, outputs[1]).only_reg().unwrap();
ctx.emit(Inst::gen_move(dst_lo, src.regs()[0], types::I64));
ctx.emit(Inst::gen_move(dst_hi, src.regs()[1], types::I64));
}

Opcode::TlsValue => {
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let (name, _, _) = ctx.symbol_value(insn).unwrap();
Expand Down
12 changes: 6 additions & 6 deletions cranelift/filetests/filetests/isa/x64/ishl.clif
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,20 @@ block0(v0: i128, v1: i8):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movzbq %dl, %rax
; movq %rax, %rcx
; movzbq %dl, %rcx
; movq %rdi, %rdx
; shlq %cl, %rdx, %rdx
; shlq %cl, %rsi, %rsi
; movq %rcx, %r8
; movq %rcx, %rax
; movl $64, %ecx
; subq %rcx, %r8, %rcx
; movq %rax, %r10
; subq %rcx, %r10, %rcx
; shrq %cl, %rdi, %rdi
; xorq %rax, %rax, %rax
; testq $127, %r8
; testq $127, %r10
; cmovzq %rax, %rdi, %rdi
; orq %rdi, %rsi, %rdi
; testq $64, %r8
; testq $64, %r10
; cmovzq %rdx, %rax, %rax
; cmovzq %rdi, %rdx, %rdx
; movq %rbp, %rsp
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@ block0(v0: i64x2):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pxor %xmm4, %xmm4, %xmm4
; pcmpeqq %xmm4, %xmm0, %xmm4
; ptest %xmm4, %xmm4
; pxor %xmm3, %xmm3, %xmm3
; pcmpeqq %xmm0, %xmm3, %xmm0
; ptest %xmm0, %xmm0
; setz %al
; movq %rbp, %rsp
; popq %rbp
Expand Down
29 changes: 15 additions & 14 deletions cranelift/filetests/filetests/isa/x64/sshr.clif
Original file line number Diff line number Diff line change
Expand Up @@ -16,24 +16,25 @@ block0(v0: i128, v1: i8):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movzbq %dl, %rdx
; movq %rdx, %rcx
; movzbq %dl, %rcx
; shrq %cl, %rdi, %rdi
; movq %rsi, %r9
; sarq %cl, %r9, %r9
; movq %rsi, %rdx
; sarq %cl, %rdx, %rdx
; movq %rcx, %rax
; movl $64, %ecx
; subq %rcx, %rdx, %rcx
; movq %rsi, %r8
; shlq %cl, %r8, %r8
; xorq %r10, %r10, %r10
; testq $127, %rdx
; cmovzq %r10, %r8, %r8
; orq %rdi, %r8, %rdi
; movq %rax, %r11
; subq %rcx, %r11, %rcx
; movq %rsi, %rax
; shlq %cl, %rax, %rax
; xorq %r8, %r8, %r8
; testq $127, %r11
; cmovzq %r8, %rax, %rax
; orq %rdi, %rax, %rdi
; sarq $63, %rsi, %rsi
; testq $64, %rdx
; movq %r9, %rax
; testq $64, %r11
; movq %rdx, %rax
; cmovzq %rdi, %rax, %rax
; cmovzq %r9, %rsi, %rsi
; cmovzq %rdx, %rsi, %rsi
; movq %rsi, %rdx
; movq %rbp, %rsp
; popq %rbp
Expand Down
Loading