x64: Lower shuffle and swizzle in ISLE #4772

Merged (7 commits) on Aug 24, 2022. Showing changes from 5 commits.

53 changes: 53 additions & 0 deletions cranelift/codegen/src/isa/x64/inst.isle
@@ -1395,6 +1395,9 @@
(decl avx512bitalg_enabled () Type)
(extern extractor avx512bitalg_enabled avx512bitalg_enabled)

(decl avx512vbmi_enabled () Type)
(extern extractor avx512vbmi_enabled avx512vbmi_enabled)

(decl use_lzcnt () Type)
(extern extractor use_lzcnt use_lzcnt)

@@ -2735,6 +2738,19 @@
src1
src2))

;; Helper for creating `vpermi2b` instructions.
;;
;; Requires the AVX-512 VL and VBMI extensions.
(decl x64_vpermi2b (Xmm Xmm Xmm) Xmm)
(rule (x64_vpermi2b src1 src2 src3)
(let ((dst WritableXmm (temp_writable_xmm))
(_ Unit (emit (gen_move $I8X16 dst src3)))
(_ Unit (emit (MInst.XmmRmREvex (Avx512Opcode.Vpermi2b)
src1
src2
dst))))
Comment on lines +2747 to +2751

Member (Author):
I'm a little unhappy about this, but we don't have an encoding for xmm instructions that have three arguments currently.

Member:
Yeah, we can definitely add such a thing later, and should I think; we'll get to this as part of our "no more mod operands" cleanup on regalloc operands, if not before.

dst))

;; Helper for creating `MInst.MulHi` instructions.
;;
;; Returns the (lo, hi) register halves of the multiplication.
@@ -3621,6 +3637,43 @@
(let ((dst WritableGpr (pinned_writable_gpr)))
(SideEffectNoResult.Inst (gen_move $I64 dst val))))

;;;; Shuffle ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Produce a mask suitable for use with `pshufb` for permuting the argument to
;; shuffle, when the arguments are the same (i.e. `shuffle a a mask`). This will
;; map all indices in the range 0..31 to the range 0..15.
(decl shuffle_0_31_mask (VecMask) VCodeConstant)
(extern constructor shuffle_0_31_mask shuffle_0_31_mask)

;; Produce a mask suitable for use with `pshufb` for permuting the lhs of a
;; `shuffle` operation (lanes 0-15).
(decl shuffle_0_15_mask (VecMask) VCodeConstant)
(extern constructor shuffle_0_15_mask shuffle_0_15_mask)

;; Produce a mask suitable for use with `pshufb` for permuting the rhs of a
;; `shuffle` operation (lanes 16-31).
(decl shuffle_16_31_mask (VecMask) VCodeConstant)
(extern constructor shuffle_16_31_mask shuffle_16_31_mask)
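
As a rough illustration (not part of this PR), the three mask constructors correspond to byte-wise transforms like the ones below, reconstructed from the lower.rs logic being removed further down; the actual external implementations may differ. PSHUFB zeroes a lane whenever its mask byte has the top bit set.

    // Illustrative sketch only; the real constructors are external Rust
    // helpers referenced from ISLE.
    fn zero_oob(b: u8) -> u8 {
        // Any index still out of range gets its top bit set so that pshufb
        // writes a zero for that lane.
        if b > 15 { 0x80 } else { b }
    }

    // `shuffle a a mask`: indices 16..=31 refer to the same vector, so fold
    // them into 0..=15 first; anything beyond 31 ends up zeroed.
    fn shuffle_0_31_mask(mask: [u8; 16]) -> [u8; 16] {
        mask.map(|b| zero_oob(if b > 15 { b.wrapping_sub(16) } else { b }))
    }

    // Left-hand side of a two-operand shuffle: keep indices 0..=15, zero the rest.
    fn shuffle_0_15_mask(mask: [u8; 16]) -> [u8; 16] {
        mask.map(zero_oob)
    }

    // Right-hand side: rebase indices 16..=31 down to 0..=15, zero the rest.
    fn shuffle_16_31_mask(mask: [u8; 16]) -> [u8; 16] {
        mask.map(|b| zero_oob(b.wrapping_sub(16)))
    }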

;; Produce a permutation suitable for use with `vpermi2b`, for permuting two
;; I8X16 vectors simultaneously. NOTE: this will not avoid out-of-bounds values,
;; and the internal lane value masking of vpermi2b will come into play. If you
;; need the out-of-bounds behavior of shuffle, you'll need to also mask the

Member:
could we say what that behavior is and/or mention "CLIF-level shuffle" here? otherwise it's a bit unclear if one doesn't already have the context I think.

;; result of vpermi2b to get the expected out-of-bounds behavior.
(decl perm_from_mask (VecMask) VCodeConstant)
(extern constructor perm_from_mask perm_from_mask)

;; If the mask that would be given to `shuffle` contains any out-of-bounds
;; indices, extract both the permutation constant and a constant that can be
;; ANDed with the `vpermi2b` result to zero those lanes.
(decl perm_from_mask_with_zeros (VCodeConstant VCodeConstant) VecMask)
(extern extractor perm_from_mask_with_zeros perm_from_mask_with_zeros)

;;;; Swizzle ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Create a mask for zeroing out-of-bounds lanes of the swizzle mask.
(decl swizzle_zero_mask () VCodeConstant)
(extern constructor swizzle_zero_mask swizzle_zero_mask)

;;;; Automatic conversions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(convert Gpr InstOutput output_gpr)
12 changes: 12 additions & 0 deletions cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -66,6 +66,18 @@ impl Inst {
dst_hi: WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()),
}
}

fn xmm_rm_r_evex(op: Avx512Opcode, src1: RegMem, src2: Reg, dst: Writable<Reg>) -> Self {
src1.assert_regclass_is(RegClass::Float);
debug_assert!(src2.class() == RegClass::Float);
debug_assert!(dst.to_reg().class() == RegClass::Float);
Inst::XmmRmREvex {
op,
src1: XmmMem::new(src1).unwrap(),
src2: Xmm::new(src2).unwrap(),
dst: WritableXmm::from_writable_reg(dst).unwrap(),
}
}
}

#[test]
17 changes: 0 additions & 17 deletions cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -316,23 +316,6 @@ impl Inst {
}
}

pub(crate) fn xmm_rm_r_evex(
op: Avx512Opcode,
src1: RegMem,
src2: Reg,
dst: Writable<Reg>,
) -> Self {
src1.assert_regclass_is(RegClass::Float);
debug_assert!(src2.class() == RegClass::Float);
debug_assert!(dst.to_reg().class() == RegClass::Float);
Inst::XmmRmREvex {
op,
src1: XmmMem::new(src1).unwrap(),
src2: Xmm::new(src2).unwrap(),
dst: WritableXmm::from_writable_reg(dst).unwrap(),
}
}

pub(crate) fn xmm_uninit_value(dst: Writable<Reg>) -> Self {
debug_assert!(dst.to_reg().class() == RegClass::Float);
Inst::XmmUninitializedValue {
47 changes: 47 additions & 0 deletions cranelift/codegen/src/isa/x64/lower.isle
@@ -3510,3 +3510,50 @@
;; register allocator a definition for the output virtual register.
(rule (lower (raw_bitcast val))
(put_in_regs val))

;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM
;; register. We statically build the mask constant to zero out any out-of-bounds
;; lane indices (this may not be strictly necessary: the verifier could reject
;; incorrect mask values) and remap all indices to point into the single vector.
(rule (lower (shuffle a a (vec_mask_from_immediate mask)))
(x64_pshufb a (x64_xmm_load_const $I8X16 (shuffle_0_31_mask mask))))
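
A rough model of 128-bit PSHUFB lane selection (illustrative only, not Cranelift code): a set top bit in a mask byte zeroes the lane, otherwise the low four bits pick the source byte, which is why the masks above encode out-of-bounds indices as 0x80.

    fn pshufb_model(src: [u8; 16], mask: [u8; 16]) -> [u8; 16] {
        let mut out = [0u8; 16];
        for (i, &m) in mask.iter().enumerate() {
            // Top bit set -> zero the lane; otherwise select src[m & 0x0f].
            out[i] = if m & 0x80 != 0 { 0 } else { src[(m & 0x0f) as usize] };
        }
        out
    }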

;; For the case where the shuffle mask contains out-of-bounds indices (values
;; greater than 31), we must zero the corresponding lanes in the result of
;; `vpermi2b`.
(rule (lower (has_type (and (avx512vl_enabled) (avx512vbmi_enabled))
(shuffle a b (vec_mask_from_immediate
(perm_from_mask_with_zeros mask zeros)))))
(x64_andps
(x64_xmm_load_const $I8X16 zeros)
(x64_vpermi2b b a (x64_xmm_load_const $I8X16 mask))))

;; However, if the shuffle mask contains no out-of-bounds values, we can use
;; `vpermi2b` without any masking.
(rule (lower (has_type (and (avx512vl_enabled) (avx512vbmi_enabled))
(shuffle a b (vec_mask_from_immediate mask))))

Member:
Here we're relying on implicit firing-order heuristics (the above rule before this one, specifically perm_from_mask... extractor before mask variable binding); I think tests below should ensure this works properly but just wanted to call it out to be sure.

Member (Author):
Yep, we should have enough test coverage to catch problems with these two: there are precise-output tests for both rules.

(x64_vpermi2b b a (x64_xmm_load_const $I8X16 (perm_from_mask mask))))
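
An illustrative model of what the two `vpermi2b` rules above compute (not Cranelift code; it assumes 128-bit `vpermi2b` consults only the low five bits of each index byte, the "internal lane value masking" mentioned in the inst.isle comment, so the extra `andps` against the `zeros` constant is what supplies the zeroing CLIF requires for indices of 32 and above):

    fn shuffle_model(a: [u8; 16], b: [u8; 16], mask: [u8; 16]) -> [u8; 16] {
        let mut out = [0u8; 16];
        for (i, &m) in mask.iter().enumerate() {
            let idx = (m & 0x1f) as usize;            // vpermi2b's internal masking
            let picked = if idx < 16 { a[idx] } else { b[idx - 16] };
            out[i] = if m < 32 { picked } else { 0 }; // zeroing provided by the andps
        }
        out
    }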

;; If `lhs` and `rhs` are different, we must shuffle each separately and then OR
;; them together. This is necessary due to PSHUFB semantics. As in the cases
;; above, we build the mask constant for each input statically.
(rule (lower (shuffle a b (vec_mask_from_immediate mask)))
(x64_por
(x64_pshufb a (x64_xmm_load_const $I8X16 (shuffle_0_15_mask mask)))
(x64_pshufb b (x64_xmm_load_const $I8X16 (shuffle_16_31_mask mask)))))
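
Putting the sketches above together (again purely illustrative): the two PSHUFBs complement each other, because each mask zeroes exactly the lanes the other fills, so a plain OR merges them into the shuffled result.

    fn shuffle_two_operands(a: [u8; 16], b: [u8; 16], mask: [u8; 16]) -> [u8; 16] {
        // Reuses the hypothetical pshufb_model / shuffle_*_mask sketches above.
        let lo = pshufb_model(a, shuffle_0_15_mask(mask));
        let hi = pshufb_model(b, shuffle_16_31_mask(mask));
        let mut out = [0u8; 16];
        for i in 0..16 {
            out[i] = lo[i] | hi[i]; // the por
        }
        out
    }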

;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

Member:
s/shuffle/swizzle/ ?

Member (Author):
Thanks for catching that!


;; SIMD swizzle; the following inefficient implementation is due to the Wasm
;; SIMD spec requiring mask indexes greater than 15 to yield a zero lane rather
;; than wrapping around. For the spec discussion, see
;; https://github.com/WebAssembly/simd/issues/93. The CLIF semantics match the
;; Wasm SIMD semantics for this instruction. The instruction format maps to
;; variables like: %dst = swizzle %src, %mask
(rule (lower (swizzle src mask))
(let ((mask Xmm (x64_paddusb
mask
(x64_xmm_load_const $I8X16 (swizzle_zero_mask)))))
(x64_pshufb src mask)))
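
Why this works (illustrative, based on the constant in the removed lower.rs code below, sixteen 0x70 bytes, which `swizzle_zero_mask` presumably reproduces): PADDUSB adds with unsigned saturation, so an in-bounds index i in 0..=15 becomes 0x70 + i, whose top bit is clear and whose low four bits are still i, and PSHUFB selects lane i; any index of 16 or more ends up at 0x80 or above, and the set top bit makes PSHUFB write a zero for that lane.

    // Hypothetical per-byte view of the paddusb fixup.
    fn fixup_swizzle_index(i: u8) -> u8 {
        i.saturating_add(0x70)
    }

    // fixup_swizzle_index(5)   == 0x75: top bit clear, low nibble 5 -> lane 5
    // fixup_swizzle_index(16)  == 0x80: top bit set -> pshufb zeroes the lane
    // fixup_swizzle_index(255) == 0xff: saturates, lane is still zeroed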
132 changes: 3 additions & 129 deletions cranelift/codegen/src/isa/x64/lower.rs
@@ -3,7 +3,6 @@
// ISLE integration glue.
pub(super) mod isle;

use crate::data_value::DataValue;
use crate::ir::{types, ExternalName, Inst as IRInst, InstructionData, LibCall, Opcode, Type};
use crate::isa::x64::abi::*;
use crate::isa::x64::inst::args::*;
@@ -585,139 +584,14 @@ fn lower_insn_to_regs(
| Opcode::SetPinnedReg
| Opcode::Vconst
| Opcode::RawBitcast
| Opcode::Insertlane => {
| Opcode::Insertlane
| Opcode::Shuffle
| Opcode::Swizzle => {
implemented_in_isle(ctx);
}

Opcode::DynamicStackAddr => unimplemented!("DynamicStackAddr"),

Opcode::Shuffle => {
let ty = ty.unwrap();
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let lhs_ty = ctx.input_ty(insn, 0);
let lhs = put_input_in_reg(ctx, inputs[0]);
let rhs = put_input_in_reg(ctx, inputs[1]);
let mask = match ctx.get_immediate(insn) {
Some(DataValue::V128(bytes)) => bytes.to_vec(),
_ => unreachable!("shuffle should always have a 16-byte immediate"),
};

// A mask-building helper: in 128-bit SIMD, 0-15 indicate which lane to read from and a
// 1 in the most significant position zeroes the lane.
let zero_unknown_lane_index = |b: u8| if b > 15 { 0b10000000 } else { b };

ctx.emit(Inst::gen_move(dst, rhs, ty));
if rhs == lhs {
// If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM
// register. We statically build `constructed_mask` to zero out any unknown lane
// indices (may not be completely necessary: verification could fail incorrect mask
// values) and fix the indexes to all point to the `dst` vector.
let constructed_mask = mask
.iter()
// If the mask is greater than 15 it still may be referring to a lane in b.
.map(|&b| if b > 15 { b.wrapping_sub(16) } else { b })
.map(zero_unknown_lane_index)
.collect();
let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
let tmp = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(constant, tmp, ty));
// After loading the constructed mask in a temporary register, we use this to
// shuffle the `dst` register (remember that, in this case, it is the same as
// `src` so we disregard this register).
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst));
} else {
if isa_flags.use_avx512vl_simd() && isa_flags.use_avx512vbmi_simd() {
assert!(
mask.iter().all(|b| *b < 32),
"shuffle mask values must be between 0 and 31"
);

// Load the mask into the destination register.
let constant = ctx.use_constant(VCodeConstantData::Generated(mask.into()));
ctx.emit(Inst::xmm_load_const(constant, dst, ty));

// VPERMI2B has the exact semantics of Wasm's shuffle:
// permute the bytes in `src1` and `src2` using byte indexes
// in `dst` and store the byte results in `dst`.
ctx.emit(Inst::xmm_rm_r_evex(
Avx512Opcode::Vpermi2b,
RegMem::reg(rhs),
lhs,
dst,
));
} else {
// If `lhs` and `rhs` are different, we must shuffle each separately and then OR
// them together. This is necessary due to PSHUFB semantics. As in the case above,
// we build the `constructed_mask` for each case statically.

// PSHUFB the `lhs` argument into `tmp0`, placing zeroes for unused lanes.
let tmp0 = ctx.alloc_tmp(lhs_ty).only_reg().unwrap();
ctx.emit(Inst::gen_move(tmp0, lhs, lhs_ty));
let constructed_mask =
mask.iter().cloned().map(zero_unknown_lane_index).collect();
let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
let tmp1 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(constant, tmp1, ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp1), tmp0));

// PSHUFB the second argument, placing zeroes for unused lanes.
let constructed_mask = mask
.iter()
.map(|b| b.wrapping_sub(16))
.map(zero_unknown_lane_index)
.collect();
let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
let tmp2 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(constant, tmp2, ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp2), dst));

// OR the shuffled registers (the mechanism and lane-size for OR-ing the registers
// is not important).
ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, RegMem::from(tmp0), dst));
}
}
}

Opcode::Swizzle => {
// SIMD swizzle; the following inefficient implementation is due to the Wasm SIMD spec
// requiring mask indexes greater than 15 to have the same semantics as a 0 index. For
// the spec discussion, see https://github.com/WebAssembly/simd/issues/93. The CLIF
// semantics match the Wasm SIMD semantics for this instruction.
// The instruction format maps to variables like: %dst = swizzle %src, %mask
let ty = ty.unwrap();
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let src = put_input_in_reg(ctx, inputs[0]);
let swizzle_mask = put_input_in_reg(ctx, inputs[1]);

// Inform the register allocator that `src` and `dst` should be in the same register.
ctx.emit(Inst::gen_move(dst, src, ty));

// Create a mask for zeroing out-of-bounds lanes of the swizzle mask.
let zero_mask = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
static ZERO_MASK_VALUE: [u8; 16] = [
0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
0x70, 0x70,
];
let constant = ctx.use_constant(VCodeConstantData::WellKnown(&ZERO_MASK_VALUE));
ctx.emit(Inst::xmm_load_const(constant, zero_mask, ty));

// Use the `zero_mask` on a writable `swizzle_mask`.
let swizzle_mask_tmp = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::gen_move(swizzle_mask_tmp, swizzle_mask, ty));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Paddusb,
RegMem::from(zero_mask),
swizzle_mask_tmp,
));

// Shuffle `dst` using the fixed-up `swizzle_mask`.
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pshufb,
RegMem::from(swizzle_mask_tmp),
dst,
));
}

Opcode::Extractlane => {
// The instruction format maps to variables like: %dst = extractlane %src, %lane
let ty = ty.unwrap();