-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
x64: Lower shuffle and swizzle in ISLE #4772
Changes from 5 commits
b5b9ef6
ea60c63
b80e37f
31fd919
d52bd6d
ff13b34
bf13c84
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1395,6 +1395,9 @@ | |
(decl avx512bitalg_enabled () Type) | ||
(extern extractor avx512bitalg_enabled avx512bitalg_enabled) | ||
|
||
(decl avx512vbmi_enabled () Type) | ||
(extern extractor avx512vbmi_enabled avx512vbmi_enabled) | ||
|
||
(decl use_lzcnt () Type) | ||
(extern extractor use_lzcnt use_lzcnt) | ||
|
||
|
@@ -2735,6 +2738,19 @@ | |
src1 | ||
src2)) | ||
|
||
;; Helper for creating `vpermi2b` instructions. | ||
;; | ||
;; Requires AVX-512 vl and vbmi extensions. | ||
;; | ||
;; `src3` is first copied into a fresh temporary register; that temporary is | ||
;; then passed as the last operand of the `XmmRmREvex` instruction and is the | ||
;; value returned by this helper. | ||
(decl x64_vpermi2b (Xmm Xmm Xmm) Xmm) | ||
(rule (x64_vpermi2b src1 src2 src3) | ||
(let ((dst WritableXmm (temp_writable_xmm)) | ||
(_ Unit (emit (gen_move $I8X16 dst src3))) | ||
(_ Unit (emit (MInst.XmmRmREvex (Avx512Opcode.Vpermi2b) | ||
src1 | ||
src2 | ||
dst)))) | ||
dst)) | ||
|
||
;; Helper for creating `MInst.MulHi` instructions. | ||
;; | ||
;; Returns the (lo, hi) register halves of the multiplication. | ||
|
@@ -3621,6 +3637,43 @@ | |
(let ((dst WritableGpr (pinned_writable_gpr))) | ||
(SideEffectNoResult.Inst (gen_move $I64 dst val)))) | ||
|
||
;;;; Shuffle ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | ||
|
||
;; Produce a mask suitable for use with `pshufb` for permuting the argument to | ||
;; shuffle, when the arguments are the same (i.e. `shuffle a a mask`). This will | ||
;; map all indices in the range 0..31 to the range 0..15. | ||
(decl shuffle_0_31_mask (VecMask) VCodeConstant) | ||
(extern constructor shuffle_0_31_mask shuffle_0_31_mask) | ||
|
||
;; Produce a mask suitable for use with `pshufb` for permuting the lhs of a | ||
;; `shuffle` operation (lanes 0-15). | ||
(decl shuffle_0_15_mask (VecMask) VCodeConstant) | ||
(extern constructor shuffle_0_15_mask shuffle_0_15_mask) | ||
|
||
;; Produce a mask suitable for use with `pshufb` for permuting the rhs of a | ||
;; `shuffle` operation (lanes 16-31). | ||
(decl shuffle_16_31_mask (VecMask) VCodeConstant) | ||
(extern constructor shuffle_16_31_mask shuffle_16_31_mask) | ||
|
||
;; Produce a permutation suitable for use with `vpermi2b`, for permuting two | ||
;; I8X16 vectors simultaneously. NOTE: this will not avoid out-of-bounds values, | ||
;; and the internal lane value masking of vpermi2b will come into play. If you | ||
;; need the out-of-bounds behavior of the CLIF-level `shuffle` (lanes selected by | ||
;; an index greater than 31 are zeroed), you'll need to also mask the | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. could we say what that behavior is and/or mention "CLIF-level shuffle" here? otherwise it's a bit unclear if one doesn't already have the context I think. |
||
;; result of vpermi2b to get the expected out-of-bounds behavior. | ||
(decl perm_from_mask (VecMask) VCodeConstant) | ||
(extern constructor perm_from_mask perm_from_mask) | ||
|
||
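;; If the mask that would be given to `shuffle` contains any out-of-bounds | ||
;; indices, extract two constants: the permutation to load for `vpermi2b` and a | ||
;; mask that, ANDed with the `vpermi2b` result, will zero those lanes. | ||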
(decl perm_from_mask_with_zeros (VCodeConstant VCodeConstant) VecMask) | ||
(extern extractor perm_from_mask_with_zeros perm_from_mask_with_zeros) | ||
|
||
;;;; Swizzle ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | ||
|
||
;; Create a mask for zeroing out-of-bounds lanes of the swizzle mask. | ||
(decl swizzle_zero_mask () VCodeConstant) | ||
(extern constructor swizzle_zero_mask swizzle_zero_mask) | ||
|
||
;;;; Automatic conversions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | ||
|
||
(convert Gpr InstOutput output_gpr) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3510,3 +3510,50 @@ | |
;; register allocator a definition for the output virtual register. | ||
(rule (lower (raw_bitcast val)) | ||
(put_in_regs val)) | ||
|
||
;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | ||
|
||
;; If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM | ||
;; register. We statically build the PSHUFB mask (`shuffle_0_31_mask`) to zero | ||
;; out any unknown lane indices (may not be completely necessary: verification | ||
;; could reject incorrect mask values) and to remap every index into the single | ||
;; input vector (0..31 becomes 0..15). | ||
(rule (lower (shuffle a a (vec_mask_from_immediate mask))) | ||
(x64_pshufb a (x64_xmm_load_const $I8X16 (shuffle_0_31_mask mask)))) | ||
|
||
;; For the case where the shuffle mask contains out-of-bounds values (values | ||
;; greater than 31) we must mask off those resulting values in the result of | ||
;; `vpermi2b`. | ||
(rule (lower (has_type (and (avx512vl_enabled) (avx512vbmi_enabled)) | ||
(shuffle a b (vec_mask_from_immediate | ||
(perm_from_mask_with_zeros mask zeros))))) | ||
(x64_andps | ||
(x64_xmm_load_const $I8X16 zeros) | ||
(x64_vpermi2b b a (x64_xmm_load_const $I8X16 mask)))) | ||
|
||
;; However, if the shuffle mask contains no out-of-bounds values, we can use | ||
;; `vpermi2b` without any masking. | ||
(rule (lower (has_type (and (avx512vl_enabled) (avx512vbmi_enabled)) | ||
(shuffle a b (vec_mask_from_immediate mask)))) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here we're relying on implicit firing-order heuristics (the above rule before this one, specifically There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep, we should have enough test coverage to catch problems with these two: there are precise-output tests for both rules. |
||
(x64_vpermi2b b a (x64_xmm_load_const $I8X16 (perm_from_mask mask)))) | ||
|
||
;; If `lhs` and `rhs` are different, we must shuffle each separately and then OR | ||
;; them together. This is necessary because PSHUFB permutes only a single source | ||
;; register. As in the same-operand PSHUFB case, the mask for each input | ||
;; (`shuffle_0_15_mask`, `shuffle_16_31_mask`) is built statically. | ||
(rule (lower (shuffle a b (vec_mask_from_immediate mask))) | ||
(x64_por | ||
(x64_pshufb a (x64_xmm_load_const $I8X16 (shuffle_0_15_mask mask))) | ||
(x64_pshufb b (x64_xmm_load_const $I8X16 (shuffle_16_31_mask mask))))) | ||
|
||
;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. s/shuffle/swizzle/ ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for catching that! |
||
|
||
;; SIMD swizzle; the following inefficient implementation is due to the Wasm | ||
;; SIMD spec requiring mask indexes greater than 15 to have the same semantics | ||
;; as a 0 index. For the spec discussion, see | ||
;; https://github.com/WebAssembly/simd/issues/93. The CLIF semantics match the | ||
;; Wasm SIMD semantics for this instruction. The instruction format maps to | ||
;; variables like: %dst = swizzle %src, %mask | ||
(rule (lower (swizzle src mask)) | ||
(let ((mask Xmm (x64_paddusb | ||
mask | ||
(x64_xmm_load_const $I8X16 (swizzle_zero_mask))))) | ||
(x64_pshufb src mask))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm a little unhappy about this, but we don't have an encoding for xmm instructions that have three arguments currently.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, we can definitely add such a thing later, and should I think; we'll get to this as part of our "no more mod operands" cleanup on regalloc operands, if not before.