bytecodealliance · elliottt · Aug 26, 2022 · Aug 25, 2022 · Aug 25, 2022 · Aug 25, 2022
@@ -1521,6 +1521,13 @@
 
 ;;;; Helpers for Working SSE tidbits ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
+;; Turn a vector type into its integer-typed vector equivalent.
+;;
+;; Each rule matches on lane shape through `multi_lane` (lane width in bits,
+;; then lane count) and yields the integer vector type with that same shape.
+;; Only the four shapes listed in this block are handled here.
+(decl vec_int_type (Type) Type)
+(rule (vec_int_type (multi_lane 8 16)) $I8X16)
+(rule (vec_int_type (multi_lane 16 8)) $I16X8)
+(rule (vec_int_type (multi_lane 32 4)) $I32X4)
+(rule (vec_int_type (multi_lane 64 2)) $I64X2)
+
 ;; Determine the appropriate operation for xor-ing vectors of the specified type
 (decl sse_xor_op (Type) SseOpcode)
 (rule (sse_xor_op $F32X4) (SseOpcode.Xorps))
@@ -2021,6 +2028,11 @@
 (rule (x64_test size src1 src2)
  (cmp_rmi_r size (CmpOpcode.Test) src1 src2))
 
+;; Emit a `ptest` of the register `reg` against the `XmmMem` operand `rm`. The
+;; outcome is returned as a `ProducesFlags`, so it has to be combined with a
+;; flags consumer (e.g. through `with_flags`).
+(decl x64_ptest (XmmMem Xmm) ProducesFlags)
+(rule (x64_ptest rm reg)
+      (xmm_cmp_rm_r (SseOpcode.Ptest) rm reg))
+
 ;; Helper for creating `cmove` instructions. Note that these instructions do not
 ;; always result in a single emitted x86 instruction; e.g., XmmCmove uses jumps
 ;; to conditionally move the selected value into an XMM register.
@@ -2889,6 +2901,21 @@
  (_ Unit (emit (MInst.XmmToGpr op src dst size))))
  dst))
 
+;; Emit a `pmovmskb` that reads the XMM register `vec` and returns the
+;; destination GPR; `width` is the operand size handed to `xmm_to_gpr`.
+(decl x64_pmovmskb (OperandSize Xmm) Gpr)
+(rule (x64_pmovmskb width vec)
+      (xmm_to_gpr (SseOpcode.Pmovmskb) vec width))
+
+;; Emit a `movmskps` that reads the XMM register `vec` and returns the
+;; destination GPR; `width` is the operand size handed to `xmm_to_gpr`.
+(decl x64_movmskps (OperandSize Xmm) Gpr)
+(rule (x64_movmskps width vec)
+      (xmm_to_gpr (SseOpcode.Movmskps) vec width))
+
+;; Emit a `movmskpd` that reads the XMM register `vec` and returns the
+;; destination GPR; `width` is the operand size handed to `xmm_to_gpr`.
+(decl x64_movmskpd (OperandSize Xmm) Gpr)
+(rule (x64_movmskpd width vec)
+      (xmm_to_gpr (SseOpcode.Movmskpd) vec width))
+
 ;; Helper for creating `MInst.GprToXmm` instructions.
 (decl gpr_to_xmm (SseOpcode GprMem OperandSize) Xmm)
 (rule (gpr_to_xmm op src size)

@@ -89,6 +89,12 @@ impl Inst {
  dst: WritableXmm::from_writable_reg(dst).unwrap(),
  }
  }
+
+ /// Build an `Inst::Setcc` for condition `cc` with `dst` as its destination.
+ ///
+ /// `dst` must be an integer-class register: this is asserted in debug builds,
+ /// and the conversion to `WritableGpr` is unwrapped.
+ fn setcc(cc: CC, dst: Writable<Reg>) -> Inst {
+ debug_assert!(dst.to_reg().class() == RegClass::Int);
+ Inst::Setcc {
+ cc,
+ dst: WritableGpr::from_writable_reg(dst).unwrap(),
+ }
+ }
 }
 
 #[test]

@@ -478,12 +478,6 @@ impl Inst {
  Inst::Ud2 { trap_code }
  }
 
- pub(crate) fn setcc(cc: CC, dst: Writable<Reg>) -> Inst {
- debug_assert!(dst.to_reg().class() == RegClass::Int);
- let dst = WritableGpr::from_writable_reg(dst).unwrap();
- Inst::Setcc { cc, dst }
- }
-
  pub(crate) fn cmove(size: OperandSize, cc: CC, src: RegMem, dst: Writable<Reg>) -> Inst {
  debug_assert!(size.is_one_of(&[
  OperandSize::Size16,

@@ -3643,3 +3643,61 @@
  (src RegMem (RegMem.Reg src))
  (vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0)))
  (vec_insert_lane ty vec src 1)))
+
+;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Put `val` in an XMM register once and use that register for both `ptest`
+;; operands. Passing the `Value` directly for each operand would convert it
+;; twice (once to `XmmMem`, once to `Xmm`); the `XmmMem` conversion may sink a
+;; load into the instruction while the `Xmm` conversion still needs the value in
+;; a register. Binding to `Xmm` first also matches how `vall_true` and the
+;; 16-bit-lane `vhigh_bits` rule treat their input.
+;;
+;; `ptest` sets ZF when the tested vector is all zeroes, so `setnz` produces 1
+;; exactly when some bit of `val` is set.
+(rule (lower (vany_true val))
+      (let ((src Xmm val))
+        (with_flags (x64_ptest src src) (x64_setcc (CC.NZ)))))
+
+;; Rules for `vall_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Compare every lane of the input with zero: `pcmpeq` fills the lanes that are
+;; zero with all 1s. `ptest` then sets ZF only when that comparison result is
+;; all zeroes, i.e. when no lane was zero, and `setz` turns ZF into the result.
+(rule (lower (vall_true val @ (value_type ty)))
+      (let ((vec Xmm val)
+            (zero Xmm (x64_pxor vec vec))
+            (zero_lanes Xmm (x64_pcmpeq (vec_int_type ty) vec zero)))
+        (with_flags (x64_ptest zero_lanes zero_lanes) (x64_setcc (CC.Z)))))
+
+;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The Intel specification allows using both 32-bit and 64-bit GPRs as
+;; destination for the "move mask" instructions. This is controlled by the REX.R
+;; bit: "In 64-bit mode, the instruction can access additional registers when
+;; used with a REX.R prefix. The default operand size is 64-bit in 64-bit mode"
+;; (PMOVMSKB in IA Software Development Manual, vol. 2). Since `vhigh_bits`
+;; needs at most 16 bits of output, using REX.W is unnecessary, so we always
+;; clear it by passing `OperandSize.Size32` (`OperandSize` is used for
+;; setting/clearing REX.W).
+
+;; 8-bit lanes map directly onto `pmovmskb`.
+(rule (lower (vhigh_bits vec @ (value_type (multi_lane 8 16))))
+      (x64_pmovmskb (OperandSize.Size32) vec))
+
+;; 32-bit lanes map directly onto `movmskps`.
+(rule (lower (vhigh_bits vec @ (value_type (multi_lane 32 4))))
+      (x64_movmskps (OperandSize.Size32) vec))
+
+;; 64-bit lanes map directly onto `movmskpd`.
+(rule (lower (vhigh_bits vec @ (value_type (multi_lane 64 2))))
+      (x64_movmskpd (OperandSize.Size32) vec))
+
+;; x86 has no instruction that extracts the high bit of each 16-bit lane, so:
+;; - narrow the 16-bit lanes to 8-bit lanes with PACKSSWB, using the input as
+;;   both operands so the narrowed lanes appear twice:
+;;     PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...]
+;; - gather the high bits with PMOVMSKB, which now yields every wanted bit
+;;   twice;
+;; - shift right by 8 to drop the bottom copy of the duplicated bits.
+(rule (lower (vhigh_bits val @ (value_type (multi_lane 16 8))))
+      (let ((vec Xmm val)
+            (packed Xmm (x64_packsswb vec vec))
+            (mask Gpr (x64_pmovmskb (OperandSize.Size32) packed)))
+        (x64_shr $I64 mask (Imm8Reg.Imm8 8))))
+
+;; Rules for `iconcat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The result is the register pair formed from the two inputs, with the low
+;; half first; the rule only matches when the low half has type `i64`.
+(rule (lower (iconcat low @ (value_type $I64) high))
+      (value_regs low high))
+
+;; Rules for `isplit` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Take the two registers that hold the `i128` input (index 0, then index 1)
+;; and return them as the instruction's two outputs, in that order.
+(rule (lower (isplit val @ (value_type $I128)))
+      (let ((halves ValueRegs val)
+            (low Reg (value_regs_get halves 0))
+            (high Reg (value_regs_get halves 1)))
+        (output_pair low high)))
@@ -129,32 +129,6 @@ fn is_mergeable_load(ctx: &mut Lower<Inst>, src_insn: IRInst) -> Option<(InsnInp
  }
 }
 
-/// Put the given input into a register or a memory operand.
-/// Effectful: may mark the given input as used, when returning the register form.
-fn input_to_reg_mem(ctx: &mut Lower<Inst>, spec: InsnInput) -> RegMem {
- let inputs = ctx.get_input_as_source_or_const(spec.insn, spec.input);
-
- if let Some(c) = inputs.constant {
- // Generate constants fresh at each use to minimize long-range register pressure.
- let ty = ctx.input_ty(spec.insn, spec.input);
- return RegMem::reg(generate_constant(ctx, ty, c).only_reg().unwrap());
- }
-
- if let InputSourceInst::UniqueUse(src_insn, 0) = inputs.inst {
- if let Some((addr_input, offset)) = is_mergeable_load(ctx, src_insn) {
- ctx.sink_inst(src_insn);
- let amode = lower_to_amode(ctx, addr_input, offset);
- return RegMem::mem(amode);
- }
- }
-
- RegMem::reg(
- ctx.put_input_in_regs(spec.insn, spec.input)
- .only_reg()
- .unwrap(),
- )
-}
-
 fn input_to_imm(ctx: &mut Lower<Inst>, spec: InsnInput) -> Option<u64> {
  ctx.get_input_as_source_or_const(spec.insn, spec.input)
  .constant
@@ -495,136 +469,17 @@ fn lower_insn_to_regs(
  | Opcode::Swizzle
  | Opcode::Extractlane
  | Opcode::ScalarToVector
- | Opcode::Splat => {
+ | Opcode::Splat
+ | Opcode::VanyTrue
+ | Opcode::VallTrue
+ | Opcode::VhighBits
+ | Opcode::Iconcat
+ | Opcode::Isplit => {
  implemented_in_isle(ctx);
  }
 
  Opcode::DynamicStackAddr => unimplemented!("DynamicStackAddr"),
 
- Opcode::VanyTrue => {
- let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
- let src_ty = ctx.input_ty(insn, 0);
- assert_eq!(src_ty.bits(), 128);
- let src = put_input_in_reg(ctx, inputs[0]);
- // Set the ZF if the result is all zeroes.
- ctx.emit(Inst::xmm_cmp_rm_r(SseOpcode::Ptest, RegMem::reg(src), src));
- // If the ZF is not set, place a 1 in `dst`.
- ctx.emit(Inst::setcc(CC::NZ, dst));
- }
-
- Opcode::VallTrue => {
- let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
- let src_ty = ctx.input_ty(insn, 0);
- assert_eq!(src_ty.bits(), 128);
- let src = input_to_reg_mem(ctx, inputs[0]);
-
- let eq = |ty: Type| match ty.lane_bits() {
- 8 => SseOpcode::Pcmpeqb,
- 16 => SseOpcode::Pcmpeqw,
- 32 => SseOpcode::Pcmpeqd,
- 64 => SseOpcode::Pcmpeqq,
- _ => panic!("Unable to find an instruction for {} for type: {}", op, ty),
- };
-
- // Initialize a register with all 0s.
- let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap();
- ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
- // Compare to see what lanes are filled with all 1s.
- ctx.emit(Inst::xmm_rm_r(eq(src_ty), src, tmp));
- // Set the ZF if the result is all zeroes.
- ctx.emit(Inst::xmm_cmp_rm_r(
- SseOpcode::Ptest,
- RegMem::from(tmp),
- tmp.to_reg(),
- ));
- // If the ZF is set, place a 1 in `dst`.
- ctx.emit(Inst::setcc(CC::Z, dst));
- }
-
- Opcode::VhighBits => {
- let src = put_input_in_reg(ctx, inputs[0]);
- let src_ty = ctx.input_ty(insn, 0);
- debug_assert!(src_ty.is_vector() && src_ty.bits() == 128);
- let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
- debug_assert!(dst.to_reg().class() == RegClass::Int);
-
- // The Intel specification allows using both 32-bit and 64-bit GPRs as destination for
- // the "move mask" instructions. This is controlled by the REX.R bit: "In 64-bit mode,
- // the instruction can access additional registers when used with a REX.R prefix. The
- // default operand size is 64-bit in 64-bit mode" (PMOVMSKB in IA Software Development
- // Manual, vol. 2). This being the case, we will always clear REX.W since its use is
- // unnecessary (`OperandSize` is used for setting/clearing REX.W).
- let size = OperandSize::Size32;
-
- match src_ty {
- types::I8X16 | types::B8X16 => {
- ctx.emit(Inst::xmm_to_gpr(SseOpcode::Pmovmskb, src, dst, size))
- }
- types::I32X4 | types::B32X4 | types::F32X4 => {
- ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskps, src, dst, size))
- }
- types::I64X2 | types::B64X2 | types::F64X2 => {
- ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskpd, src, dst, size))
- }
- types::I16X8 | types::B16X8 => {
- // There is no x86 instruction for extracting the high bit of 16-bit lanes so
- // here we:
- // - duplicate the 16-bit lanes of `src` into 8-bit lanes:
- // PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...]
- // - use PMOVMSKB to gather the high bits; now we have duplicates, though
- // - shift away the bottom 8 high bits to remove the duplicates.
- let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap();
- ctx.emit(Inst::gen_move(tmp, src, src_ty));
- ctx.emit(Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(src), tmp));
- ctx.emit(Inst::xmm_to_gpr(
- SseOpcode::Pmovmskb,
- tmp.to_reg(),
- dst,
- size,
- ));
- ctx.emit(Inst::shift_r(
- OperandSize::Size64,
- ShiftKind::ShiftRightLogical,
- Some(8),
- dst,
- ));
- }
- _ => unimplemented!("unknown input type {} for {}", src_ty, op),
- }
- }
-
- Opcode::Iconcat => {
- let ty = ctx.output_ty(insn, 0);
- assert_eq!(
- ty,
- types::I128,
- "Iconcat not expected to be used for non-128-bit type"
- );
- assert_eq!(ctx.input_ty(insn, 0), types::I64);
- assert_eq!(ctx.input_ty(insn, 1), types::I64);
- let lo = put_input_in_reg(ctx, inputs[0]);
- let hi = put_input_in_reg(ctx, inputs[1]);
- let dst = get_output_reg(ctx, outputs[0]);
- ctx.emit(Inst::gen_move(dst.regs()[0], lo, types::I64));
- ctx.emit(Inst::gen_move(dst.regs()[1], hi, types::I64));
- }
-
- Opcode::Isplit => {
- let ty = ctx.input_ty(insn, 0);
- assert_eq!(
- ty,
- types::I128,
- "Isplit not expected to be used for non-128-bit type"
- );
- assert_eq!(ctx.output_ty(insn, 0), types::I64);
- assert_eq!(ctx.output_ty(insn, 1), types::I64);
- let src = put_input_in_regs(ctx, inputs[0]);
- let dst_lo = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
- let dst_hi = get_output_reg(ctx, outputs[1]).only_reg().unwrap();
- ctx.emit(Inst::gen_move(dst_lo, src.regs()[0], types::I64));
- ctx.emit(Inst::gen_move(dst_hi, src.regs()[1], types::I64));
- }
-
  Opcode::TlsValue => {
  let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
  let (name, _, _) = ctx.symbol_value(insn).unwrap();

@@ -17,20 +17,20 @@ block0(v0: i128, v1: i8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movzbq %dl, %rax
-; movq %rax, %rcx
+; movzbq %dl, %rcx
 ; movq %rdi, %rdx
 ; shlq %cl, %rdx, %rdx
 ; shlq %cl, %rsi, %rsi
-; movq %rcx, %r8
+; movq %rcx, %rax
 ; movl $64, %ecx
-; subq %rcx, %r8, %rcx
+; movq %rax, %r10
+; subq %rcx, %r10, %rcx
 ; shrq %cl, %rdi, %rdi
 ; xorq %rax, %rax, %rax
-; testq $127, %r8
+; testq $127, %r10
 ; cmovzq %rax, %rdi, %rdi
 ; orq %rdi, %rsi, %rdi
-; testq $64, %r8
+; testq $64, %r10
 ; cmovzq %rdx, %rax, %rax
 ; cmovzq %rdi, %rdx, %rdx
 ; movq %rbp, %rsp

@@ -41,9 +41,9 @@ block0(v0: i64x2):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; pxor %xmm4, %xmm4, %xmm4
-; pcmpeqq %xmm4, %xmm0, %xmm4
-; ptest %xmm4, %xmm4
+; pxor %xmm3, %xmm3, %xmm3
+; pcmpeqq %xmm0, %xmm3, %xmm0
+; ptest %xmm0, %xmm0
 ; setz %al
 ; movq %rbp, %rsp
 ; popq %rbp

@@ -16,24 +16,25 @@ block0(v0: i128, v1: i8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movzbq %dl, %rdx
-; movq %rdx, %rcx
+; movzbq %dl, %rcx
 ; shrq %cl, %rdi, %rdi
-; movq %rsi, %r9
-; sarq %cl, %r9, %r9
+; movq %rsi, %rdx
+; sarq %cl, %rdx, %rdx
+; movq %rcx, %rax
 ; movl $64, %ecx
-; subq %rcx, %rdx, %rcx
-; movq %rsi, %r8
-; shlq %cl, %r8, %r8
-; xorq %r10, %r10, %r10
-; testq $127, %rdx
-; cmovzq %r10, %r8, %r8
-; orq %rdi, %r8, %rdi
+; movq %rax, %r11
+; subq %rcx, %r11, %rcx
+; movq %rsi, %rax
+; shlq %cl, %rax, %rax
+; xorq %r8, %r8, %r8
+; testq $127, %r11
+; cmovzq %r8, %rax, %rax
+; orq %rdi, %rax, %rdi
 ; sarq $63, %rsi, %rsi
-; testq $64, %rdx
-; movq %r9, %rax
+; testq $64, %r11
+; movq %rdx, %rax
 ; cmovzq %rdi, %rax, %rax
-; cmovzq %r9, %rsi, %rsi
+; cmovzq %rdx, %rsi, %rsi
 ; movq %rsi, %rdx
 ; movq %rbp, %rsp
 ; popq %rbp