Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

x64: Lower widening and narrowing operations in ISLE #4722

Merged
merged 6 commits into from
Aug 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions cranelift/codegen/src/isa/x64/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -2500,6 +2500,21 @@
(rule (x64_packsswb src1 src2)
(xmm_rm_r $I8X16 (SseOpcode.Packsswb) src1 src2))

;; Emit a `packssdw` instruction. The first operand must be in an XMM
;; register; the second may be an XMM register or a memory operand.
(decl x64_packssdw (Xmm XmmMem) Xmm)
(rule (x64_packssdw lhs rhs)
      (xmm_rm_r $I16X8
                (SseOpcode.Packssdw)
                lhs
                rhs))

;; Helper for creating `packuswb` instructions.
;;
;; `packuswb` narrows 16-bit lanes to 8-bit lanes, so the result is tagged as
;; `$I8X16`, matching the sibling `x64_packsswb` helper.
(decl x64_packuswb (Xmm XmmMem) Xmm)
(rule (x64_packuswb src1 src2)
      (xmm_rm_r $I8X16 (SseOpcode.Packuswb) src1 src2))

;; Emit a `packusdw` instruction. The first operand must be in an XMM
;; register; the second may be an XMM register or a memory operand.
(decl x64_packusdw (Xmm XmmMem) Xmm)
(rule (x64_packusdw lhs rhs)
      (xmm_rm_r $I16X8
                (SseOpcode.Packusdw)
                lhs
                rhs))

;; Helper for creating `MInst.XmmRmRImm` instructions.
(decl xmm_rm_r_imm (SseOpcode Reg RegMem u8 OperandSize) Xmm)
(rule (xmm_rm_r_imm op src1 src2 imm size)
Expand Down Expand Up @@ -3051,10 +3066,16 @@
(_ Unit (emit (MInst.GprToXmm (SseOpcode.Cvtsi2sd) x dst size))))
dst))

;; Emit a `cvttps2dq` instruction on `src`. The `ty` argument is accepted as
;; part of the signature but is not consulted by this rule.
(decl x64_cvttps2dq (Type XmmMem) Xmm)
(rule (x64_cvttps2dq ty src)
      (xmm_unary_rm_r (SseOpcode.Cvttps2dq)
                      src))

;; Emit a `cvttpd2dq` instruction on `src`, which may be an XMM register or a
;; memory operand.
(decl x64_cvttpd2dq (XmmMem) Xmm)
(rule (x64_cvttpd2dq src)
      (xmm_unary_rm_r (SseOpcode.Cvttpd2dq)
                      src))

(decl cvt_u64_to_float_seq (Type Gpr) Xmm)
(rule (cvt_u64_to_float_seq ty src)
(let ((size OperandSize (raw_operand_size_of_type ty))
Expand Down Expand Up @@ -3273,6 +3294,11 @@
;; Constructor for the `iadd_pairwise_addd_const_32` constant. It takes no
;; arguments, returns a `VCodeConstant`, and is implemented externally rather
;; than by ISLE rules.
(decl iadd_pairwise_addd_const_32 () VCodeConstant)
(extern constructor iadd_pairwise_addd_const_32 iadd_pairwise_addd_const_32)

;;;; snarrow constants ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Constructor for the constant consumed by the `snarrow` special case that
;; lowers `i32x4.trunc_sat_f64x2_s_zero`: the value 2147483647.0 in each of
;; the two `f64` lanes. It takes no arguments, returns a `VCodeConstant`, and
;; is implemented externally rather than by ISLE rules.
(decl snarrow_umax_mask () VCodeConstant)
(extern constructor snarrow_umax_mask snarrow_umax_mask)

;;;; Comparisons ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Enum with a single `Condition` variant that pairs a `ProducesFlags` value
;; (`producer`) with a `CC` value (`cc`).
(type IcmpCondResult (enum (Condition (producer ProducesFlags) (cc CC))))
Expand Down
86 changes: 86 additions & 0 deletions cranelift/codegen/src/isa/x64/lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -3226,3 +3226,89 @@
(addd_const Xmm (x64_xmm_load_const $I16X8 (iadd_pairwise_addd_const_32))))
(x64_paddd dst addd_const)))

;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Each supported lane width maps onto a single `pmovsx*` instruction applied
;; directly to the input vector.

;; i8x16 -> i16x8
(rule (lower (has_type $I16X8
                       (swiden_low src @ (value_type $I8X16))))
      (x64_pmovsxbw src))

;; i16x8 -> i32x4
(rule (lower (has_type $I32X4
                       (swiden_low src @ (value_type $I16X8))))
      (x64_pmovsxwd src))

;; i32x4 -> i64x2
(rule (lower (has_type $I64X2
                       (swiden_low src @ (value_type $I32X4))))
      (x64_pmovsxdq src))

;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The input is first rearranged with `palignr` (byte offset 8) or `pshufd`
;; (selector 0xEE), and the rearranged vector is then fed to the same
;; `pmovsx*` instruction used by `swiden_low`.

;; i8x16 -> i16x8
(rule (lower (has_type $I16X8
                       (swiden_high src @ (value_type $I8X16))))
      (let ((swapped Xmm (x64_palignr src src 8 (OperandSize.Size32))))
        (x64_pmovsxbw swapped)))

;; i16x8 -> i32x4
(rule (lower (has_type $I32X4
                       (swiden_high src @ (value_type $I16X8))))
      (let ((swapped Xmm (x64_palignr src src 8 (OperandSize.Size32))))
        (x64_pmovsxwd swapped)))

;; i32x4 -> i64x2
(rule (lower (has_type $I64X2
                       (swiden_high src @ (value_type $I32X4))))
      (let ((shuffled Xmm (x64_pshufd src 0xEE (OperandSize.Size32))))
        (x64_pmovsxdq shuffled)))

;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Each supported lane width maps onto a single `pmovzx*` instruction applied
;; directly to the input vector.

;; i8x16 -> i16x8
(rule (lower (has_type $I16X8
                       (uwiden_low src @ (value_type $I8X16))))
      (x64_pmovzxbw src))

;; i16x8 -> i32x4
(rule (lower (has_type $I32X4
                       (uwiden_low src @ (value_type $I16X8))))
      (x64_pmovzxwd src))

;; i32x4 -> i64x2
(rule (lower (has_type $I64X2
                       (uwiden_low src @ (value_type $I32X4))))
      (x64_pmovzxdq src))

;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The input is first rearranged with `palignr` (byte offset 8) or `pshufd`
;; (selector 0xEE), and the rearranged vector is then fed to the same
;; `pmovzx*` instruction used by `uwiden_low`.

;; i8x16 -> i16x8
(rule (lower (has_type $I16X8
                       (uwiden_high src @ (value_type $I8X16))))
      (let ((swapped Xmm (x64_palignr src src 8 (OperandSize.Size32))))
        (x64_pmovzxbw swapped)))

;; i16x8 -> i32x4
(rule (lower (has_type $I32X4
                       (uwiden_high src @ (value_type $I16X8))))
      (let ((swapped Xmm (x64_palignr src src 8 (OperandSize.Size32))))
        (x64_pmovzxwd swapped)))

;; i32x4 -> i64x2
(rule (lower (has_type $I64X2
                       (uwiden_high src @ (value_type $I32X4))))
      (let ((shuffled Xmm (x64_pshufd src 0xEE (OperandSize.Size32))))
        (x64_pmovzxdq shuffled)))

;; Rules for `snarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Both operands are handed, in order, to the matching `packss*` instruction.

;; i16x8 x 2 -> i8x16
(rule (lower (has_type $I8X16
                       (snarrow x @ (value_type $I16X8) y)))
      (x64_packsswb x y))

;; i32x4 x 2 -> i16x8
(rule (lower (has_type $I16X8
                       (snarrow x @ (value_type $I32X4) y)))
      (x64_packssdw x y))

;; We're missing a `snarrow` case for $I64X2
;; https://github.com/bytecodealliance/wasmtime/issues/4734

;; This rule is a special case for handling the translation of the wasm op
;; `i32x4.trunc_sat_f64x2_s_zero`. It can be removed once we have an
;; implementation of `snarrow` for `I64X2`.
;;
;; It only matches when the first operand is the result of
;; `fcvt_to_sint_sat` and the second operand is a `vconst` whose constant is
;; 0. The conversion and the narrowing are lowered together, working directly
;; on `a`, the input of the conversion.
(rule (lower (has_type $I32X4 (snarrow (has_type $I64X2 (fcvt_to_sint_sat a))
(vconst (u128_from_constant 0)))))
(let (;; y = i32x4.trunc_sat_f64x2_s_zero(x) is lowered to:
;; MOVE xmm_tmp, xmm_x
;; CMPEQPD xmm_tmp, xmm_x
;; MOVE xmm_y, xmm_x
;; ANDPS xmm_tmp, [wasm_f64x2_splat(2147483647.0)]
;; MINPD xmm_y, xmm_tmp
;; CVTTPD2DQ xmm_y, xmm_y

;; CMPEQPD: compare `a` with itself for equality.
;; NOTE(review): presumably this yields an all-zero lane wherever `a` is NaN
;; and an all-ones lane otherwise -- confirm against the CMPPD definition.
(tmp1 Xmm (x64_cmppd a a (FcmpImm.Equal)))
(umax_mask Xmm (x64_xmm_load_const $F64X2 (snarrow_umax_mask)))

;; ANDPS xmm_tmp, [wasm_f64x2_splat(2147483647.0)]
;; (rebinds `tmp1`, shadowing the comparison result above)
(tmp1 Xmm (x64_andps tmp1 umax_mask))
;; MINPD: clamp `a` from above by the masked bound.
(dst Xmm (x64_minpd a tmp1)))
;; CVTTPD2DQ: convert the clamped doubles to 32-bit integers.
(x64_cvttpd2dq dst)))

;; Rules for `unarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Both operands are handed, in order, to the matching `packus*` instruction.

;; i16x8 x 2 -> i8x16
(rule (lower (has_type $I8X16
                       (unarrow x @ (value_type $I16X8) y)))
      (x64_packuswb x y))

;; i32x4 x 2 -> i16x8
(rule (lower (has_type $I16X8
                       (unarrow x @ (value_type $I32X4) y)))
      (x64_packusdw x y))

;; We're missing a `unarrow` case for $I64X2
;; https://github.com/bytecodealliance/wasmtime/issues/4734
214 changes: 7 additions & 207 deletions cranelift/codegen/src/isa/x64/lower.rs
Original file line number Diff line number Diff line change
Expand Up @@ -562,216 +562,16 @@ fn lower_insn_to_regs(
| Opcode::FcvtToSint
| Opcode::FcvtToUintSat
| Opcode::FcvtToSintSat
| Opcode::IaddPairwise => {
| Opcode::IaddPairwise
| Opcode::UwidenHigh
| Opcode::UwidenLow
| Opcode::SwidenHigh
| Opcode::SwidenLow
| Opcode::Snarrow
| Opcode::Unarrow => {
implemented_in_isle(ctx);
}

// Legacy (non-ISLE) lowering of the vector widening ops. Dispatches first on
// the opcode and then on the (input, output) vector types. Any other type
// pairing hits `unreachable!()`, and a non-vector output type panics.
Opcode::UwidenHigh | Opcode::UwidenLow | Opcode::SwidenHigh | Opcode::SwidenLow => {
let input_ty = ctx.input_ty(insn, 0);
let output_ty = ctx.output_ty(insn, 0);
let src = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
if output_ty.is_vector() {
match op {
// Low half, signed: a single `pmovsx*` from `src` into `dst`.
Opcode::SwidenLow => match (input_ty, output_ty) {
(types::I8X16, types::I16X8) => {
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovsxbw, RegMem::reg(src), dst));
}
(types::I16X8, types::I32X4) => {
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovsxwd, RegMem::reg(src), dst));
}
(types::I32X4, types::I64X2) => {
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovsxdq, RegMem::reg(src), dst));
}
_ => unreachable!(),
},
// High half, signed: rearrange the input into `dst` first, then apply
// `pmovsx*` to `dst` in place.
Opcode::SwidenHigh => match (input_ty, output_ty) {
(types::I8X16, types::I16X8) => {
// Copy `src` into `dst`, then `palignr dst, src, 8`. With both
// operands holding the same value this moves the upper 8 bytes
// into the lower half (see the PALIGNR definition).
ctx.emit(Inst::gen_move(dst, src, output_ty));
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Palignr,
RegMem::reg(src),
dst,
8,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovsxbw, RegMem::from(dst), dst));
}
(types::I16X8, types::I32X4) => {
// Same `palignr`-by-8 sequence as the I8X16 case above.
ctx.emit(Inst::gen_move(dst, src, output_ty));
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Palignr,
RegMem::reg(src),
dst,
8,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovsxwd, RegMem::from(dst), dst));
}
(types::I32X4, types::I64X2) => {
// `pshufd` with selector 0xEE (0b11_10_11_10) picks dwords
// 2, 3, 2, 3, placing the upper two dwords in the lower half.
// No preliminary move is emitted for this case.
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Pshufd,
RegMem::reg(src),
dst,
0xEE,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovsxdq, RegMem::from(dst), dst));
}
_ => unreachable!(),
},
// Low half, unsigned: a single `pmovzx*` from `src` into `dst`.
Opcode::UwidenLow => match (input_ty, output_ty) {
(types::I8X16, types::I16X8) => {
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxbw, RegMem::reg(src), dst));
}
(types::I16X8, types::I32X4) => {
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxwd, RegMem::reg(src), dst));
}
(types::I32X4, types::I64X2) => {
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxdq, RegMem::reg(src), dst));
}
_ => unreachable!(),
},
// High half, unsigned: same rearrangement as `SwidenHigh`, followed by
// `pmovzx*` instead of `pmovsx*`.
Opcode::UwidenHigh => match (input_ty, output_ty) {
(types::I8X16, types::I16X8) => {
ctx.emit(Inst::gen_move(dst, src, output_ty));
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Palignr,
RegMem::reg(src),
dst,
8,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxbw, RegMem::from(dst), dst));
}
(types::I16X8, types::I32X4) => {
ctx.emit(Inst::gen_move(dst, src, output_ty));
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Palignr,
RegMem::reg(src),
dst,
8,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxwd, RegMem::from(dst), dst));
}
(types::I32X4, types::I64X2) => {
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Pshufd,
RegMem::reg(src),
dst,
0xEE,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxdq, RegMem::from(dst), dst));
}
_ => unreachable!(),
},
// The outer arm only admits the four widening opcodes handled above.
_ => unreachable!(),
}
} else {
// NOTE(review): `ty` is not defined inside this arm; it presumably comes
// from the enclosing function -- confirm it is the type intended here.
panic!("Unsupported non-vector type for widen instruction {:?}", ty);
}
}
// Legacy (non-ISLE) lowering of the vector narrowing ops. Dispatches first on
// the opcode and then on the (input, output) vector types. Any other type
// pairing hits `unreachable!()`, and a non-vector output type panics.
Opcode::Snarrow | Opcode::Unarrow => {
    let input_ty = ctx.input_ty(insn, 0);
    let output_ty = ctx.output_ty(insn, 0);
    let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
    if output_ty.is_vector() {
        match op {
            Opcode::Snarrow => match (input_ty, output_ty) {
                // The pack instructions read and write their destination, so
                // the first operand is copied into `dst` before packing with
                // the second operand.
                (types::I16X8, types::I8X16) => {
                    let src1 = put_input_in_reg(ctx, inputs[0]);
                    let src2 = put_input_in_reg(ctx, inputs[1]);
                    ctx.emit(Inst::gen_move(dst, src1, input_ty));
                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(src2), dst));
                }
                (types::I32X4, types::I16X8) => {
                    let src1 = put_input_in_reg(ctx, inputs[0]);
                    let src2 = put_input_in_reg(ctx, inputs[1]);
                    ctx.emit(Inst::gen_move(dst, src1, input_ty));
                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Packssdw, RegMem::reg(src2), dst));
                }
                // TODO: The type we are expecting as input is actually an F64X2 but the
                // instruction is only defined for integers so here we use I64X2. This is a
                // separate issue that needs to be fixed in instruction.rs.
                (types::I64X2, types::I32X4) => {
                    // Only the fused `fcvt_to_sint_sat` + `snarrow` pattern is
                    // supported for this type pairing.
                    if let Some(fcvt_inst) =
                        matches_input(ctx, inputs[0], Opcode::FcvtToSintSat)
                    {
                        // y = i32x4.trunc_sat_f64x2_s_zero(x) is lowered to:
                        // MOVE xmm_tmp, xmm_x
                        // CMPEQPD xmm_tmp, xmm_x
                        // MOVE xmm_y, xmm_x
                        // ANDPS xmm_tmp, [wasm_f64x2_splat(2147483647.0)]
                        // MINPD xmm_y, xmm_tmp
                        // CVTTPD2DQ xmm_y, xmm_y

                        // Operate on the input of the conversion rather than
                        // on the conversion's result.
                        let fcvt_input = InsnInput {
                            insn: fcvt_inst,
                            input: 0,
                        };
                        let src = put_input_in_reg(ctx, fcvt_input);
                        ctx.emit(Inst::gen_move(dst, src, input_ty));
                        let tmp1 = ctx.alloc_tmp(output_ty).only_reg().unwrap();
                        ctx.emit(Inst::gen_move(tmp1, src, input_ty));
                        let cond = FcmpImm::from(FloatCC::Equal);
                        ctx.emit(Inst::xmm_rm_r_imm(
                            SseOpcode::Cmppd,
                            RegMem::reg(src),
                            tmp1,
                            cond.encode(),
                            OperandSize::Size32,
                        ));

                        // 2147483647.0 is equivalent to 0x41DFFFFFFFC00000
                        static UMAX_MASK: [u8; 16] = [
                            0x00, 0x00, 0xC0, 0xFF, 0xFF, 0xFF, 0xDF, 0x41, 0x00, 0x00,
                            0xC0, 0xFF, 0xFF, 0xFF, 0xDF, 0x41,
                        ];
                        let umax_const =
                            ctx.use_constant(VCodeConstantData::WellKnown(&UMAX_MASK));
                        let umax_mask = ctx.alloc_tmp(types::F64X2).only_reg().unwrap();
                        ctx.emit(Inst::xmm_load_const(umax_const, umax_mask, types::F64X2));

                        // ANDPS xmm_tmp, [wasm_f64x2_splat(2147483647.0)]
                        ctx.emit(Inst::xmm_rm_r(
                            SseOpcode::Andps,
                            RegMem::from(umax_mask),
                            tmp1,
                        ));
                        ctx.emit(Inst::xmm_rm_r(SseOpcode::Minpd, RegMem::from(tmp1), dst));
                        ctx.emit(Inst::xmm_unary_rm_r(
                            SseOpcode::Cvttpd2dq,
                            RegMem::from(dst),
                            dst,
                        ));
                    } else {
                        unreachable!();
                    }
                }
                _ => unreachable!(),
            },
            Opcode::Unarrow => match (input_ty, output_ty) {
                (types::I16X8, types::I8X16) => {
                    let src1 = put_input_in_reg(ctx, inputs[0]);
                    let src2 = put_input_in_reg(ctx, inputs[1]);
                    ctx.emit(Inst::gen_move(dst, src1, input_ty));
                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Packuswb, RegMem::reg(src2), dst));
                }
                (types::I32X4, types::I16X8) => {
                    let src1 = put_input_in_reg(ctx, inputs[0]);
                    let src2 = put_input_in_reg(ctx, inputs[1]);
                    ctx.emit(Inst::gen_move(dst, src1, input_ty));
                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Packusdw, RegMem::reg(src2), dst));
                }
                _ => unreachable!(),
            },
            // The outer arm only admits `Snarrow` and `Unarrow`.
            _ => unreachable!(),
        }
    } else {
        // NOTE(review): `ty` is not defined inside this arm; it presumably
        // comes from the enclosing function -- confirm it is the type
        // intended here.
        panic!("Unsupported non-vector type for narrow instruction {:?}", ty);
    }
}
Opcode::Bitcast => {
let input_ty = ctx.input_ty(insn, 0);
let output_ty = ctx.output_ty(insn, 0);
Expand Down
11 changes: 11 additions & 0 deletions cranelift/codegen/src/isa/x64/lower/isle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -805,6 +805,17 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
self.lower_ctx
.use_constant(VCodeConstantData::WellKnown(&IADD_PAIRWISE_ADDD_CONST_32))
}

#[inline]
fn snarrow_umax_mask(&mut self) -> VCodeConstant {
    // Two f64 lanes, each the little-endian encoding of 2147483647.0
    // (bit pattern 0x41DFFFFFFFC00000).
    static I32_MAX_AS_F64X2: [u8; 16] = [
        0x00, 0x00, 0xC0, 0xFF, 0xFF, 0xFF, 0xDF, 0x41, // lane 0
        0x00, 0x00, 0xC0, 0xFF, 0xFF, 0xFF, 0xDF, 0x41, // lane 1
    ];
    // Register the bytes as a well-known constant with the lowering context
    // and hand back its handle.
    let data = VCodeConstantData::WellKnown(&I32_MAX_AS_F64X2);
    self.lower_ctx.use_constant(data)
}
}

impl IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
Expand Down
Loading