Winch: Implement swizzle on x64 with AVX #10050

Merged · 1 commit · Jan 21, 2025
59 changes: 59 additions & 0 deletions tests/disas/winch/x64/i8x16_swizzle/const_avx.wat
@@ -0,0 +1,59 @@
;;! target = "x86_64"
;;! test = "winch"
;;! flags = [ "-Ccranelift-has-avx" ]

(module
(func (result v128)
(i8x16.swizzle (v128.const i64x2 1 2) (v128.const i64x2 2 1))
)
)
;; wasm[0]::function[0]:
;; pushq %rbp
;; movq %rsp, %rbp
;; movq 8(%rdi), %r11
;; movq 0x10(%r11), %r11
;; addq $0x10, %r11
;; cmpq %rsp, %r11
;; ja 0x53
;; 1c: movq %rdi, %r14
;; subq $0x10, %rsp
;; movq %rdi, 8(%rsp)
;; movq %rsi, (%rsp)
;; movdqu 0x2c(%rip), %xmm0
;; movdqu 0x34(%rip), %xmm1
;; vpaddusb 0x3c(%rip), %xmm0, %xmm0
;; vpshufb %xmm0, %xmm1, %xmm1
;; movdqa %xmm1, %xmm0
;; addq $0x10, %rsp
;; popq %rbp
;; retq
;; 53: ud2
;; 55: addb %al, (%rax)
;; 57: addb %al, (%rax)
;; 59: addb %al, (%rax)
;; 5b: addb %al, (%rax)
;; 5d: addb %al, (%rax)
;; 5f: addb %al, (%rdx)
;; 61: addb %al, (%rax)
;; 63: addb %al, (%rax)
;; 65: addb %al, (%rax)
;; 67: addb %al, (%rcx)
;; 69: addb %al, (%rax)
;; 6b: addb %al, (%rax)
;; 6d: addb %al, (%rax)
;; 6f: addb %al, (%rcx)
;; 71: addb %al, (%rax)
;; 73: addb %al, (%rax)
;; 75: addb %al, (%rax)
;; 77: addb %al, (%rdx)
;; 79: addb %al, (%rax)
;; 7b: addb %al, (%rax)
;; 7d: addb %al, (%rax)
;; 7f: addb %dh, 0x70(%rax)
;; 82: jo 0xf4
;; 84: jo 0xf6
;; 86: jo 0xf8
;; 88: jo 0xfa
;; 8a: jo 0xfc
;; 8c: jo 0xfe
;; 8e: jo 0x100
104 changes: 52 additions & 52 deletions tests/misc_testsuite/winch/_simd_lane.wast
@@ -62,8 +62,8 @@
;; (f64x2.replace_lane 1 (local.get 0) (local.get 1)))

;; Swizzle and shuffle
;; (func (export "v8x16_swizzle") (param v128 v128) (result v128)
;; (i8x16.swizzle (local.get 0) (local.get 1)))
(func (export "v8x16_swizzle") (param v128 v128) (result v128)
(i8x16.swizzle (local.get 0) (local.get 1)))
(func (export "v8x16_shuffle-1") (param v128 v128) (result v128)
(i8x16.shuffle 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 (local.get 0) (local.get 1)))
(func (export "v8x16_shuffle-2") (param v128 v128) (result v128)
@@ -291,46 +291,46 @@
;; (assert_return (invoke "f64x2_replace_lane-last" (v128.const f64x2 0.0 0.0) (f64.const 0123456789.e019)) (v128.const f64x2 0.0 0123456789.e019))
;; (assert_return (invoke "f64x2_replace_lane-last" (v128.const f64x2 0.0 0.0) (f64.const 0123456789.e-019)) (v128.const f64x2 0.0 0123456789.e-019))

;; (assert_return (invoke "v8x16_swizzle"
;; (v128.const i8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
;; (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))
;; (v128.const i8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31))
;; (assert_return (invoke "v8x16_swizzle"
;; (v128.const i8x16 -16 -15 -14 -13 -12 -11 -10 -9 -8 -7 -6 -5 -4 -3 -2 -1)
;; (v128.const i8x16 -8 -7 -6 -5 -4 -3 -2 -1 16 17 18 19 20 21 22 23))
;; (v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0))
;; (assert_return (invoke "v8x16_swizzle"
;; (v128.const i8x16 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115)
;; (v128.const i8x16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0))
;; (v128.const i8x16 115 114 113 112 111 110 109 108 107 106 105 104 103 102 101 100))
;; (assert_return (invoke "v8x16_swizzle"
;; (v128.const i8x16 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115)
;; (v128.const i8x16 -1 1 -2 2 -3 3 -4 4 -5 5 -6 6 -7 7 -8 8))
;; (v128.const i8x16 0 101 0 102 0 103 0 104 0 105 0 106 0 107 0 108))
;; (assert_return (invoke "v8x16_swizzle"
;; (v128.const i8x16 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115)
;; (v128.const i8x16 9 16 10 17 11 18 12 19 13 20 14 21 15 22 16 23))
;; (v128.const i8x16 109 0 110 0 111 0 112 0 113 0 114 0 115 0 0 0))
;; (assert_return (invoke "v8x16_swizzle"
;; (v128.const i8x16 0x64 0x65 0x66 0x67 0x68 0x69 0x6a 0x6b 0x6c 0x6d 0x6e 0x6f 0x70 0x71 0x72 0x73)
;; (v128.const i8x16 9 16 10 17 11 18 12 19 13 20 14 21 15 22 16 23))
;; (v128.const i8x16 0x6d 0 0x6e 0 0x6f 0 0x70 0 0x71 0 0x72 0 0x73 0 0 0))
;; (assert_return (invoke "v8x16_swizzle"
;; (v128.const i16x8 0x6465 0x6667 0x6869 0x6a6b 0x6c6d 0x6e6f 0x7071 0x7273)
;; (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))
;; (v128.const i16x8 0x6465 0x6667 0x6869 0x6a6b 0x6c6d 0x6e6f 0x7071 0x7273))
;; (assert_return (invoke "v8x16_swizzle"
;; (v128.const i32x4 0x64656667 0x68696a6b 0x6c6d6e6f 0x70717273)
;; (v128.const i8x16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0))
;; (v128.const i32x4 0x73727170 0x6f6e6d6c 0x6b6a6968 0x67666564))
;; (assert_return (invoke "v8x16_swizzle"
;; (v128.const f32x4 nan -nan inf -inf)
;; (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))
;; (v128.const i32x4 0x7fc00000 0xffc00000 0x7f800000 0xff800000))
;; (assert_return (invoke "v8x16_swizzle"
;; (v128.const i32x4 0x67666564 0x6b6a6968 0x6f6e6d5c 0x73727170)
;; (v128.const f32x4 0.0 -0.0 inf -inf))
;; (v128.const i32x4 0x64646464 0x00646464 0x00006464 0x00006464))
(assert_return (invoke "v8x16_swizzle"
(v128.const i8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
(v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))
(v128.const i8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31))
(assert_return (invoke "v8x16_swizzle"
(v128.const i8x16 -16 -15 -14 -13 -12 -11 -10 -9 -8 -7 -6 -5 -4 -3 -2 -1)
(v128.const i8x16 -8 -7 -6 -5 -4 -3 -2 -1 16 17 18 19 20 21 22 23))
(v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0))
(assert_return (invoke "v8x16_swizzle"
(v128.const i8x16 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115)
(v128.const i8x16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0))
(v128.const i8x16 115 114 113 112 111 110 109 108 107 106 105 104 103 102 101 100))
(assert_return (invoke "v8x16_swizzle"
(v128.const i8x16 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115)
(v128.const i8x16 -1 1 -2 2 -3 3 -4 4 -5 5 -6 6 -7 7 -8 8))
(v128.const i8x16 0 101 0 102 0 103 0 104 0 105 0 106 0 107 0 108))
(assert_return (invoke "v8x16_swizzle"
(v128.const i8x16 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115)
(v128.const i8x16 9 16 10 17 11 18 12 19 13 20 14 21 15 22 16 23))
(v128.const i8x16 109 0 110 0 111 0 112 0 113 0 114 0 115 0 0 0))
(assert_return (invoke "v8x16_swizzle"
(v128.const i8x16 0x64 0x65 0x66 0x67 0x68 0x69 0x6a 0x6b 0x6c 0x6d 0x6e 0x6f 0x70 0x71 0x72 0x73)
(v128.const i8x16 9 16 10 17 11 18 12 19 13 20 14 21 15 22 16 23))
(v128.const i8x16 0x6d 0 0x6e 0 0x6f 0 0x70 0 0x71 0 0x72 0 0x73 0 0 0))
(assert_return (invoke "v8x16_swizzle"
(v128.const i16x8 0x6465 0x6667 0x6869 0x6a6b 0x6c6d 0x6e6f 0x7071 0x7273)
(v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))
(v128.const i16x8 0x6465 0x6667 0x6869 0x6a6b 0x6c6d 0x6e6f 0x7071 0x7273))
(assert_return (invoke "v8x16_swizzle"
(v128.const i32x4 0x64656667 0x68696a6b 0x6c6d6e6f 0x70717273)
(v128.const i8x16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0))
(v128.const i32x4 0x73727170 0x6f6e6d6c 0x6b6a6968 0x67666564))
(assert_return (invoke "v8x16_swizzle"
(v128.const f32x4 nan -nan inf -inf)
(v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))
(v128.const i32x4 0x7fc00000 0xffc00000 0x7f800000 0xff800000))
(assert_return (invoke "v8x16_swizzle"
(v128.const i32x4 0x67666564 0x6b6a6968 0x6f6e6d5c 0x73727170)
(v128.const f32x4 0.0 -0.0 inf -inf))
(v128.const i32x4 0x64646464 0x00646464 0x00006464 0x00006464))

(assert_return (invoke "v8x16_shuffle-1"
(v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
@@ -386,10 +386,10 @@
(v128.const i32x4 0x10203 0x4050607 0x8090a0b 0xc0d0e0f))

;; More literals
;; (assert_return (invoke "v8x16_swizzle"
;; (v128.const i32x4 1_234_567_890 0x1234_5678 01_234_567_890 0x0_1234_5678)
;; (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))
;; (v128.const i32x4 0x4996_02d2 0x1234_5678 0x4996_02d2 0x1234_5678))
(assert_return (invoke "v8x16_swizzle"
(v128.const i32x4 1_234_567_890 0x1234_5678 01_234_567_890 0x0_1234_5678)
(v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))
(v128.const i32x4 0x4996_02d2 0x1234_5678 0x4996_02d2 0x1234_5678))
(assert_return (invoke "v8x16_shuffle-1"
(v128.const i64x2 1_234_567_890_123_456_789_0 0x1234_5678_90AB_cdef)
(v128.const i64x2 01_234_567_890_123_456_789_0 0x0_1234_5678_90AB_cdef))
@@ -843,8 +843,8 @@
;; (global.set $g (f32x4.replace_lane 0 (local.get 0) (local.get 1)))
;; (return (global.get $g)))

;; (func (export "as-return-value-2") (param v128 v128) (result v128)
;; (return (i8x16.swizzle (local.get 0) (local.get 1))))
(func (export "as-return-value-2") (param v128 v128) (result v128)
(return (i8x16.swizzle (local.get 0) (local.get 1))))
(func (export "as-global_set-value-2") (param v128 v128) (result v128)
(global.set $h (i8x16.shuffle 0 1 2 3 4 5 6 7 24 25 26 27 28 29 30 31 (local.get 0) (local.get 1)))
(return (global.get $h)))
@@ -862,10 +862,10 @@
(assert_return (invoke "as-local_set-value" (v128.const i32x4 -1 -1 -1 -1)) (i32.const -1))
;; (assert_return (invoke "as-global_set-value-1" (v128.const f32x4 0 0 0 0)(f32.const 3.14)) (v128.const f32x4 3.14 0 0 0))

;; (assert_return (invoke "as-return-value-2"
;; (v128.const i8x16 -16 -15 -14 -13 -12 -11 -10 -9 -8 -7 -6 -5 -4 -3 -2 -1)
;; (v128.const i8x16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0))
;; (v128.const i8x16 -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -16))
(assert_return (invoke "as-return-value-2"
(v128.const i8x16 -16 -15 -14 -13 -12 -11 -10 -9 -8 -7 -6 -5 -4 -3 -2 -1)
(v128.const i8x16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0))
(v128.const i8x16 -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -16))
(assert_return (invoke "as-global_set-value-2"
(v128.const i8x16 -16 -15 -14 -13 -12 -11 -10 -9 -8 -7 -6 -5 -4 -3 -2 -1)
(v128.const i8x16 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1))
4 changes: 4 additions & 0 deletions winch/codegen/src/isa/aarch64/masm.rs
@@ -902,6 +902,10 @@ impl Masm for MacroAssembler {
bail!(CodeGenError::unimplemented_masm_instruction())
}

fn swizzle(&mut self, _dst: WritableReg, _lhs: Reg, _rhs: Reg) -> Result<()> {
bail!(CodeGenError::unimplemented_masm_instruction())
}

fn atomic_rmw(
&mut self,
_addr: Self::Address,
32 changes: 32 additions & 0 deletions winch/codegen/src/isa/x64/asm.rs
@@ -1631,6 +1631,17 @@ impl Assembler {
});
}

/// Shuffles bytes in `src` according to contents of `mask` and puts
/// result in `dst`.
pub fn xmm_vpshufb_rrr(&mut self, dst: WritableReg, src: Reg, mask: Reg) {
self.emit(Inst::XmmRmiRVex {
op: args::AvxOpcode::Vpshufb,
src1: src.into(),
src2: XmmMemImm::unwrap_new(RegMemImm::reg(mask.into())),
dst: dst.to_reg().into(),
})
}

/// Bitwise OR of `src1` and `src2`.
pub fn vpor(&mut self, dst: WritableReg, src1: Reg, src2: Reg) {
self.emit(Inst::XmmRmiRVex {
@@ -1641,6 +1652,27 @@
})
}

/// Add packed unsigned bytes with unsigned saturation.
///
/// Adds the src operands byte-wise, but when an individual byte result is
/// larger than the maximum unsigned byte value, 0xFF is written instead.
pub fn xmm_vpaddusb_rrm(&mut self, dst: WritableReg, src1: Reg, src2: &Address) {
let src2 = Self::to_synthetic_amode(
src2,
&mut self.pool,
&mut self.constants,
&mut self.buffer,
MemFlags::trusted(),
);

self.emit(Inst::XmmRmiRVex {
op: args::AvxOpcode::Vpaddusb,
src1: src1.into(),
src2: XmmMemImm::unwrap_new(RegMemImm::mem(src2)),
dst: dst.to_reg().into(),
})
}

pub fn fence(&mut self, kind: FenceKind) {
self.emit(Inst::Fence { kind });
}
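For context, a minimal scalar sketch of the per-byte behavior these two new helpers rely on (illustrative only; the function names below are hypothetical and not part of this change): `vpaddusb` adds bytes with unsigned saturation, and `vpshufb` writes zero to any result byte whose mask byte has its top bit set, otherwise selecting by the mask byte's low four bits.

```rust
// Illustrative model only; names are hypothetical and not part of the PR.
fn vpaddusb_byte(a: u8, b: u8) -> u8 {
    // Unsigned saturating add: results above 0xFF clamp to 0xFF.
    a.saturating_add(b)
}

fn vpshufb_byte(src: &[u8; 16], mask_byte: u8) -> u8 {
    // Top bit set in the mask byte => the result byte is zero;
    // otherwise the low 4 bits index into `src`.
    if mask_byte & 0x80 != 0 {
        0
    } else {
        src[(mask_byte & 0x0F) as usize]
    }
}

fn main() {
    assert_eq!(vpaddusb_byte(0xF0, 0x70), 0xFF); // saturates
    assert_eq!(vpaddusb_byte(0x05, 0x70), 0x75); // in range, low nibble unchanged
    let src = *b"abcdefghijklmnop";
    assert_eq!(vpshufb_byte(&src, 0x75), b'f'); // low nibble 5 selects index 5
    assert_eq!(vpshufb_byte(&src, 0x80), 0);    // top bit set => zero
}
```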
19 changes: 19 additions & 0 deletions winch/codegen/src/isa/x64/masm.rs
@@ -1391,6 +1391,25 @@ impl Masm for MacroAssembler {
Ok(())
}

fn swizzle(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg) -> Result<()> {
if !self.flags.has_avx() {
bail!(CodeGenError::UnimplementedForNoAvx)
}

// Clamp rhs to [0, 15 (i.e., 0xF)] and substitute 0 for anything
// outside that range.
// Adding 0x70 with unsigned saturation maps any lane value above 0xF
// into [0x80, 0xFF], i.e., it sets the lane's top bit, and `vpshufb`
// writes 0 for lanes whose mask byte has the top bit set.
let clamp = self.asm.add_constant(&[0x70; 16]);
self.asm.xmm_vpaddusb_rrm(writable!(rhs), rhs, &clamp);

// Don't need to subtract 0x70 since `vpshufb` uses the least
// significant 4 bits which are the same after adding 0x70.
self.asm.xmm_vpshufb_rrr(dst, lhs, rhs);
Ok(())
}

fn atomic_rmw(
&mut self,
addr: Self::Address,
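Taken together, a minimal scalar model (illustrative only; not part of this change) of what the clamp-then-shuffle sequence computes per lane, matching the `i8x16.swizzle` semantics exercised by the re-enabled assertions in `_simd_lane.wast`:

```rust
// Scalar model of the AVX lowering above; `lhs` is the data vector and
// `rhs` the index vector. Function and variable names are hypothetical.
fn swizzle_model(lhs: [u8; 16], rhs: [u8; 16]) -> [u8; 16] {
    let mut out = [0u8; 16];
    for i in 0..16 {
        // vpaddusb with the 0x70 constant: any index above 15 ends up with
        // its top bit set (possibly saturating at 0xFF).
        let m = rhs[i].saturating_add(0x70);
        // vpshufb: top bit set => 0, otherwise select by the low 4 bits,
        // which adding 0x70 leaves unchanged.
        out[i] = if m & 0x80 != 0 { 0 } else { lhs[(m & 0x0F) as usize] };
    }
    out
}

fn main() {
    let data: [u8; 16] = core::array::from_fn(|i| 100 + i as u8);
    // In-range indices reverse the bytes; indices of 16 or more produce zero,
    // as in the spec tests.
    let rev: [u8; 16] = core::array::from_fn(|i| 15 - i as u8);
    assert_eq!(swizzle_model(data, rev)[0], 115);
    assert_eq!(swizzle_model(data, [16; 16]), [0; 16]);
}
```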
3 changes: 3 additions & 0 deletions winch/codegen/src/masm.rs
@@ -1356,6 +1356,9 @@ pub(crate) trait MacroAssembler {
/// using lanes as a mask to select which indexes to copy.
fn shuffle(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, lanes: [u8; 16]) -> Result<()>;

/// Performs a swizzle of two 128-bit vectors, producing a 128-bit result.
fn swizzle(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg) -> Result<()>;

/// Performs the RMW `op` operation on the passed `addr`.
///
/// The value *before* the operation was performed is written back to the `operand` register.
11 changes: 11 additions & 0 deletions winch/codegen/src/visitor.rs
@@ -292,6 +292,7 @@ macro_rules! def_unsupported {
(emit I64AtomicRmw32AddU $($rest:tt)*) => {};
(emit I64AtomicRmwAdd $($rest:tt)*) => {};
(emit I8x16Shuffle $($rest:tt)*) => {};
(emit I8x16Swizzle $($rest:tt)*) => {};
(emit I32AtomicRmw8SubU $($rest:tt)*) => {};
(emit I32AtomicRmw16SubU $($rest:tt)*) => {};
(emit I32AtomicRmwSub $($rest:tt)*) => {};
@@ -2800,6 +2801,16 @@
Ok(())
}

fn visit_i8x16_swizzle(&mut self) -> Self::Output {
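// `i8x16.swizzle` takes the vector being swizzled first and the index
// vector second, so the index vector (rhs) is on top of the value stack.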
let rhs = self.context.pop_to_reg(self.masm, None)?;
let lhs = self.context.pop_to_reg(self.masm, None)?;
self.masm
.swizzle(writable!(lhs.into()), lhs.into(), rhs.into())?;
self.context.stack.push(TypedReg::v128(lhs.into()).into());
self.context.free_reg(rhs);
Ok(())
}

fn visit_i8x16_extract_lane_s(&mut self, lane: u8) -> Self::Output {
self.context.extract_lane_op(
self.masm,