x64: Refactor lowerings for insertlane (#8167)
* x64: Refactor lowerings for `insertlane`

Going through old PRs I stumbled across #2716 which is quite old at this
point. Upon adding the tests to `main` I see that most of it is actually
implemented except for load-lane-from-memory where the lane size is 8 or
16 bits. That case requires explicitly opting in with
`sinkable_load_exact`, so this PR now subsumes the tests of #2716 in
addition to filling that hole in the lowerings.

This refactoring shuffles around where definitions are located so the
rules have easier access to the `Value` needed to perform the relevant
match. The generic `vec_insert_lane` helper is now gone as well,
replaced by direct matches in the `insertlane` lowerings.

Closes #2716

* Remove a no-longer-needed helper function
alexcrichton authored Mar 18, 2024
1 parent bcd0119 commit 85afaac
Showing 4 changed files with 201 additions and 80 deletions.
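As a sketch of the case this commit newly covers (the function name and lane
index here are illustrative, not taken from the commit), a CLIF function that
loads an 8-bit value and inserts it into a vector lane can now sink the load
directly into the insertion:

function %insert8_from_mem(i8x16, i64) -> i8x16 {
block0(v0: i8x16, v1: i64):
    v2 = load.i8 v1
    v3 = insertlane v0, v2, 1
    return v3
}

With SSE 4.1 this lowers to a single `pinsrb $1, (%rdi), %xmm0` rather than a
zero-extending `movzbq` followed by a register-operand `pinsrb`, as the
filetest diffs below show; the 16-bit case with `pinsrw` is analogous.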
109 changes: 53 additions & 56 deletions cranelift/codegen/src/isa/x64/lower.isle
@@ -1485,20 +1485,12 @@

;;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (insertlane vec @ (value_type ty) val (u8_from_uimm8 idx)))
(vec_insert_lane ty vec val idx))

;; Helper function used below for `insertlane` but also here for other
;; lowerings.
;;
;; Note that the `Type` used here is the type of vector the insertion is
;; happening into, or the type of the first `Reg` argument.
(decl vec_insert_lane (Type Xmm RegMem u8) Xmm)

;; i8x16.replace_lane
(rule 1 (vec_insert_lane $I8X16 vec val idx)
(if-let $true (use_sse41))
(x64_pinsrb vec val idx))
(rule 1 (lower (insertlane vec @ (value_type $I8X16) val (u8_from_uimm8 idx)))
(if-let $true (use_sse41))
(x64_pinsrb vec val idx))
(rule 2 (lower (insertlane vec @ (value_type $I8X16) (sinkable_load_exact val) (u8_from_uimm8 idx)))
(if-let $true (use_sse41))
(x64_pinsrb vec val idx))

;; This lowering is particularly unoptimized and is mostly just here to work
;; rather than here to be fast. Requiring SSE 4.1 for the above lowering isn't
@@ -1524,7 +1516,7 @@
;; This all, laboriously, gets the `val` into the desired lane so it's then
;; `por`'d with the original vec-with-a-hole to produce the final result of the
;; insertion.
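;;
;; As a concrete walkthrough (hypothetical, not part of this commit): for
;; n = 6 the hole mask zeroes byte 6 of `vec`, `val` is zero-extended and
;; shifted left by (6 & 3) * 8 = 16 bits within its 32-bit word, `pshufd`
;; then routes that word into dword 6 >> 2 = 1 with zeros elsewhere, and the
;; final `por` drops the byte into the hole.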
(rule (vec_insert_lane $I8X16 vec val n)
(rule (lower (insertlane vec @ (value_type $I8X16) val (u8_from_uimm8 n)))
(let ((vec_with_hole Xmm (x64_pand vec (insert_i8x16_lane_hole n)))
(val Gpr (x64_movzx (ExtMode.BL) val))
(val Gpr (x64_shl $I32 val (Imm8Reg.Imm8 (u8_shl (u8_and n 3) 3))))
@@ -1540,102 +1532,108 @@
(rule (insert_i8x16_lane_pshufd_imm 2) 0b01_00_01_01)
(rule (insert_i8x16_lane_pshufd_imm 3) 0b00_01_01_01)
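;; In these immediates each two-bit field, high to low, selects the source
;; dword for result dwords 3 through 0: a field of 0 picks the dword holding
;; the inserted value, and the remaining fields pick dword 1, which is known
;; to be zero here, so everything but the new byte stays clear for the `por`.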


;; i16x8.replace_lane
(rule (vec_insert_lane $I16X8 vec val idx)
(x64_pinsrw vec val idx))
(rule (lower (insertlane vec @ (value_type $I16X8) val (u8_from_uimm8 idx)))
(x64_pinsrw vec val idx))
(rule 1 (lower (insertlane vec @ (value_type $I16X8) (sinkable_load_exact val) (u8_from_uimm8 idx)))
(x64_pinsrw vec val idx))

;; i32x4.replace_lane
(rule 1 (vec_insert_lane $I32X4 vec val idx)
(rule 1 (lower (insertlane vec @ (value_type $I32X4) val (u8_from_uimm8 idx)))
(if-let $true (use_sse41))
(x64_pinsrd vec val idx))

(rule (vec_insert_lane $I32X4 vec val 0)
(x64_movss_regmove vec (x64_movd_to_xmm val)))
(rule (lower (insertlane vec @ (value_type $I32X4) val (u8_from_uimm8 0)))
(x64_movss_regmove vec (x64_movd_to_xmm val)))

;; tmp = [ vec[1] vec[0] val[1] val[0] ]
;; result = [ vec[3] vec[2] tmp[0] tmp[2] ]
(rule (vec_insert_lane $I32X4 vec val 1)
(rule (lower (insertlane vec @ (value_type $I32X4) val (u8_from_uimm8 1)))
(let ((val Xmm (x64_movd_to_xmm val))
(vec Xmm vec))
(x64_shufps (x64_punpcklqdq val vec) vec 0b11_10_00_10)))

;; tmp = [ vec[0] vec[3] val[0] val[0] ]
;; result = [ tmp[2] tmp[0] vec[1] vec[0] ]
(rule (vec_insert_lane $I32X4 vec val 2)
(rule (lower (insertlane vec @ (value_type $I32X4) val (u8_from_uimm8 2)))
(let ((val Xmm (x64_movd_to_xmm val))
(vec Xmm vec))
(x64_shufps vec (x64_shufps val vec 0b00_11_00_00) 0b10_00_01_00)))

;; tmp = [ vec[3] vec[2] val[1] val[0] ]
;; result = [ tmp[0] tmp[2] vec[1] vec[0] ]
(rule (vec_insert_lane $I32X4 vec val 3)
(rule (lower (insertlane vec @ (value_type $I32X4) val (u8_from_uimm8 3)))
(let ((val Xmm (x64_movd_to_xmm val))
(vec Xmm vec))
(x64_shufps vec (x64_shufps val vec 0b11_10_01_00) 0b00_10_01_00)))

;; i64x2.replace_lane
(rule 1 (vec_insert_lane $I64X2 vec val idx)
(rule 1 (lower (insertlane vec @ (value_type $I64X2) val (u8_from_uimm8 idx)))
(if-let $true (use_sse41))
(x64_pinsrq vec val idx))
(rule (vec_insert_lane $I64X2 vec val 0)
(rule (lower (insertlane vec @ (value_type $I64X2) val (u8_from_uimm8 0)))
(x64_movsd_regmove vec (x64_movq_to_xmm val)))
(rule (vec_insert_lane $I64X2 vec val 1)
(rule (lower (insertlane vec @ (value_type $I64X2) val (u8_from_uimm8 1)))
(x64_punpcklqdq vec (x64_movq_to_xmm val)))

(rule 1 (lower (insertlane vec @ (value_type $F32X4) (sinkable_load val) (u8_from_uimm8 idx)))
(if-let $true (use_sse41))
(x64_insertps vec val (sse_insertps_lane_imm idx)))
(rule (lower (insertlane vec @ (value_type $F32X4) val (u8_from_uimm8 idx)))
(f32x4_insertlane vec val idx))

;; Helper function used for `insertlane` above but also for other
;; lowerings below.
(decl f32x4_insertlane (Xmm Xmm u8) Xmm)

;; f32x4.replace_lane
(rule 1 (vec_insert_lane $F32X4 vec val idx)
(rule 1 (f32x4_insertlane vec val idx)
(if-let $true (use_sse41))
(x64_insertps vec val (sse_insertps_lane_imm idx)))

;; f32x4.replace_lane 0 - without insertps
(rule (vec_insert_lane $F32X4 vec (RegMem.Reg val) 0)
;; External rust code used to calculate the immediate value to `insertps`.
(decl sse_insertps_lane_imm (u8) u8)
(extern constructor sse_insertps_lane_imm sse_insertps_lane_imm)
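;; (Sketch of the encoding, which isn't spelled out in this diff: the
;; `insertps` immediate uses bits 7:6 for the source dword, bits 5:4 for the
;; destination lane, and bits 3:0 as a zero mask, so inserting the low dword
;; of `val` into lane `idx` presumably amounts to `idx << 4`.)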

;; f32x4.replace_lane 0
(rule (f32x4_insertlane vec val 0)
(x64_movss_regmove vec val))

;; f32x4.replace_lane 1 - without insertps
;; f32x4.replace_lane 1
;; tmp = [ vec[1] vec[0] val[1] val[0] ]
;; result = [ vec[3] vec[2] tmp[0] tmp[2] ]
(rule (vec_insert_lane $F32X4 vec (RegMem.Reg val) 1)
(rule (f32x4_insertlane vec val 1)
(let ((tmp Xmm (x64_movlhps val vec)))
(x64_shufps tmp vec 0b11_10_00_10)))

;; f32x4.replace_lane 2 - without insertps
;; f32x4.replace_lane 2
;; tmp = [ vec[0] vec[3] val[0] val[0] ]
;; result = [ tmp[2] tmp[0] vec[1] vec[0] ]
(rule (vec_insert_lane $F32X4 vec (RegMem.Reg val) 2)
(rule (f32x4_insertlane vec val 2)
(let ((tmp Xmm (x64_shufps val vec 0b00_11_00_00)))
(x64_shufps vec tmp 0b10_00_01_00)))

;; f32x4.replace_lane 3 - without insertps
;; f32x4.replace_lane 3
;; tmp = [ vec[3] vec[2] val[1] val[0] ]
;; result = [ tmp[0] tmp[2] vec[1] vec[0] ]
(rule (vec_insert_lane $F32X4 vec (RegMem.Reg val) 3)
(rule (f32x4_insertlane vec val 3)
(let ((tmp Xmm (x64_shufps val vec 0b11_10_01_00)))
(x64_shufps vec tmp 0b00_10_01_00)))

;; Recursively delegate to the above rules by loading from memory first.
(rule (vec_insert_lane $F32X4 vec (RegMem.Mem addr) idx)
(vec_insert_lane $F32X4 vec (x64_movss_load addr) idx))

;; External rust code used to calculate the immediate value to `insertps`.
(decl sse_insertps_lane_imm (u8) u8)
(extern constructor sse_insertps_lane_imm sse_insertps_lane_imm)

;; f64x2.replace_lane 0
;;
;; Here the `movsd` instruction is used specifically to specialize moving
;; into the first lane where, unlike the cases above, we're not using the
;; lane immediate as an immediate to the instruction itself.
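;; result = [ vec[1] val[0] ]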
(rule (vec_insert_lane $F64X2 vec (RegMem.Reg val) 0)
(rule (lower (insertlane vec @ (value_type $F64X2) val (u8_from_uimm8 0)))
(x64_movsd_regmove vec val))
(rule (vec_insert_lane $F64X2 vec (RegMem.Mem val) 0)
(x64_movsd_regmove vec (x64_movsd_load val)))

;; f64x2.replace_lane 1
;;
;; Here the `movlhps` instruction is used specifically to specialize moving
;; into the second lane where, unlike the cases above, we're not using the
;; lane immediate as an immediate to the instruction itself.
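;; result = [ val[0] vec[0] ]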
(rule (vec_insert_lane $F64X2 vec val 1)
(rule (lower (insertlane vec @ (value_type $F64X2) val (u8_from_uimm8 1)))
(x64_movlhps vec val))

;;;; Rules for `smin`, `smax`, `umin`, `umax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -2804,9 +2802,9 @@
(x64_pshufd y 3)
(x64_pshufd z 3)))

(tmp Xmm (vec_insert_lane $F32X4 x0 x1 1))
(tmp Xmm (vec_insert_lane $F32X4 tmp x2 2))
(tmp Xmm (vec_insert_lane $F32X4 tmp x3 3))
(tmp Xmm (f32x4_insertlane x0 x1 1))
(tmp Xmm (f32x4_insertlane tmp x2 2))
(tmp Xmm (f32x4_insertlane tmp x3 3))
)
tmp))
(rule (lower (has_type $F64X2 (fma x y z)))
@@ -2820,7 +2818,7 @@
(x64_pshufd y 0xee)
(x64_pshufd z 0xee)))
)
(vec_insert_lane $F64X2 x0 x1 1)))
(x64_movlhps x0 x1)))


;; Special case for when the `fma` feature is active and a native instruction
@@ -4030,21 +4028,20 @@
(libcall LibCall (round_libcall $F32 imm))
(result Xmm (libcall_1 libcall a))
(a1 Xmm (libcall_1 libcall (x64_pshufd a 1)))
(result Xmm (vec_insert_lane $F32X4 result a1 1))
(result Xmm (f32x4_insertlane result a1 1))
(a2 Xmm (libcall_1 libcall (x64_pshufd a 2)))
(result Xmm (vec_insert_lane $F32X4 result a2 2))
(result Xmm (f32x4_insertlane result a2 2))
(a3 Xmm (libcall_1 libcall (x64_pshufd a 3)))
(result Xmm (vec_insert_lane $F32X4 result a3 3))
(result Xmm (f32x4_insertlane result a3 3))
)
result))
(rule (x64_round $F64X2 (RegMem.Reg a) imm)
(let (
(libcall LibCall (round_libcall $F64 imm))
(result Xmm (libcall_1 libcall a))
(a1 Xmm (libcall_1 libcall (x64_pshufd a 0b00_00_11_10)))
(result Xmm (vec_insert_lane $F64X2 result a1 1))
)
result))
(x64_movlhps result a1)))
(rule (x64_round ty (RegMem.Mem addr) imm)
(x64_round ty (RegMem.Reg (x64_load ty addr (ExtKind.ZeroExtend))) imm))

20 changes: 8 additions & 12 deletions cranelift/filetests/filetests/isa/x64/insertlane-avx.clif
@@ -62,8 +62,8 @@ block0(v0: f64x2, v1: i64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vmovsd 0(%rdi), %xmm3
; vmovsd %xmm0, %xmm3, %xmm0
; vmovsd 0(%rdi), %xmm4
; vmovsd %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -73,8 +73,8 @@ block0(v0: f64x2, v1: i64):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vmovsd (%rdi), %xmm3 ; trap: heap_oob
; vmovsd %xmm3, %xmm0, %xmm0
; vmovsd (%rdi), %xmm4 ; trap: heap_oob
; vmovsd %xmm4, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
@@ -90,8 +90,7 @@ block0(v0: i8x16, v1: i64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movzbq 0(%rdi), %rdx
; vpinsrb $1, %xmm0, %rdx, %xmm0
; vpinsrb $1, %xmm0, 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -101,8 +100,7 @@ block0(v0: i8x16, v1: i64):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movzbq (%rdi), %rdx ; trap: heap_oob
; vpinsrb $1, %edx, %xmm0, %xmm0
; vpinsrb $1, (%rdi), %xmm0, %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
@@ -118,8 +116,7 @@ block0(v0: i16x8, v1: i64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movzwq 0(%rdi), %rdx
; vpinsrw $1, %xmm0, %rdx, %xmm0
; vpinsrw $1, %xmm0, 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -129,8 +126,7 @@ block0(v0: i16x8, v1: i64):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movzwq (%rdi), %rdx ; trap: heap_oob
; vpinsrw $1, %edx, %xmm0, %xmm0
; vpinsrw $1, (%rdi), %xmm0, %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
20 changes: 8 additions & 12 deletions cranelift/filetests/filetests/isa/x64/insertlane.clif
@@ -62,8 +62,8 @@ block0(v0: f64x2, v1: i64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movsd 0(%rdi), %xmm3
; movsd %xmm0, %xmm3, %xmm0
; movsd 0(%rdi), %xmm4
; movsd %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -73,8 +73,8 @@ block0(v0: f64x2, v1: i64):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movsd (%rdi), %xmm3 ; trap: heap_oob
; movsd %xmm3, %xmm0
; movsd (%rdi), %xmm4 ; trap: heap_oob
; movsd %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
@@ -90,8 +90,7 @@ block0(v0: i8x16, v1: i64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movzbq 0(%rdi), %rdx
; pinsrb $1, %xmm0, %rdx, %xmm0
; pinsrb $1, %xmm0, 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -101,8 +100,7 @@ block0(v0: i8x16, v1: i64):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movzbq (%rdi), %rdx ; trap: heap_oob
; pinsrb $1, %edx, %xmm0
; pinsrb $1, (%rdi), %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
@@ -118,8 +116,7 @@ block0(v0: i16x8, v1: i64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movzwq 0(%rdi), %rdx
; pinsrw $1, %xmm0, %rdx, %xmm0
; pinsrw $1, %xmm0, 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -129,8 +126,7 @@ block0(v0: i16x8, v1: i64):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movzwq (%rdi), %rdx ; trap: heap_oob
; pinsrw $1, %edx, %xmm0
; pinsrw $1, (%rdi), %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
